npm - webpeel - Versions diffs - 0.21.57 → 0.21.59 - Mend

webpeel 0.21.57 → 0.21.59

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9)

package/dist/core/pipeline.d.ts +3 -0
package/dist/core/pipeline.js +45 -0
package/dist/core/safe-browsing.d.ts +22 -0
package/dist/core/safe-browsing.js +183 -0
package/dist/index.d.ts +3 -0
package/dist/index.js +25 -3
package/dist/server/pg-auth-store.js +1 -1
package/dist/types.d.ts +26 -0
package/package.json +1 -1

package/dist/core/pipeline.d.ts CHANGED Viewed

@@ -13,6 +13,7 @@ import type { PeelOptions, PeelResult, ImageInfo } from '../types.js';
 import type { BrandingProfile } from './branding.js';
 import type { ChangeResult } from './change-tracking.js';
 import type { DesignAnalysis } from './design-analysis.js';
+import type { SafeBrowsingResult } from './safe-browsing.js';
 /** Mutable context threaded through pipeline stages */
 export interface PipelineContext {
     url: string;
@@ -81,6 +82,8 @@ export interface PipelineContext {
     warnings: string[];
     /** Raw HTML size in characters (measured from fetched content before any conversion) */
     rawHtmlSize?: number;
+    /** Safe Browsing check result (set early in pipeline, before fetch) */
+    safeBrowsingResult?: SafeBrowsingResult;
 }
 /** Create the initial PipelineContext with defaults */
 export declare function createContext(url: string, options: PeelOptions): PipelineContext;

package/dist/core/pipeline.js CHANGED Viewed

@@ -20,6 +20,8 @@ import { quickAnswer as runQuickAnswer } from './quick-answer.js';
 import { Timer } from './timing.js';
 import { chunkContent } from './chunker.js';
 import { BlockedError } from '../types.js';
+import { sanitizeForLLM } from './prompt-guard.js';
+import { getSourceCredibility } from './source-credibility.js';
 import { createLogger } from './logger.js';
 const log = createLogger('pipeline');
 /** Create the initial PipelineContext with defaults */
@@ -1245,6 +1247,48 @@ export async function finalize(ctx) {
 export function buildResult(ctx) {
     const fetchResult = ctx.fetchResult;
     const elapsed = Date.now() - ctx.startTime;
+    // --- Trust & Safety ---
+    // Run prompt injection scan on final content
+    const sanitizeResult = sanitizeForLLM(ctx.content);
+    // If injection was detected, use the cleaned content
+    if (sanitizeResult.injectionDetected) {
+        ctx.content = sanitizeResult.content;
+        ctx.warnings.push('Prompt injection patterns detected and stripped from content.');
+    }
+    // Assess source credibility
+    const credibility = getSourceCredibility(ctx.url);
+    // Compute composite trust score
+    let trustScore = 1.0;
+    if (credibility.tier === 'general')
+        trustScore -= 0.2;
+    if (sanitizeResult.injectionDetected)
+        trustScore -= 0.5;
+    if ((ctx.quality ?? 1.0) < 0.5)
+        trustScore -= 0.1;
+    trustScore = Math.max(0, Math.min(1, trustScore));
+    // Build trust warnings
+    const trustWarnings = [];
+    if (credibility.tier === 'general')
+        trustWarnings.push('Source is unverified (not a known official or trusted domain).');
+    if (sanitizeResult.injectionDetected)
+        trustWarnings.push(`Prompt injection detected: ${sanitizeResult.detectedPatterns.join(', ')}`);
+    if (sanitizeResult.strippedChars > 0)
+        trustWarnings.push(`Stripped ${sanitizeResult.strippedChars} suspicious characters (zero-width/Unicode smuggling).`);
+    const trust = {
+        source: {
+            tier: credibility.tier,
+            stars: credibility.stars,
+            label: credibility.label,
+        },
+        contentSafety: {
+            clean: !sanitizeResult.injectionDetected,
+            injectionDetected: sanitizeResult.injectionDetected,
+            detectedPatterns: sanitizeResult.detectedPatterns,
+            strippedCount: sanitizeResult.strippedChars,
+        },
+        score: trustScore,
+        warnings: trustWarnings,
+    };
     const tokens = estimateTokens(ctx.content);
     const fingerprint = createHash('sha256').update(ctx.content).digest('hex').slice(0, 16);
     // Token savings metrics — only when raw HTML size was captured (from actual fetch or domain extractor)
@@ -1342,5 +1386,6 @@ export function buildResult(ctx) {
         ...(rawTokenEstimate !== undefined ? { rawTokenEstimate } : {}),
         ...(tokenSavingsPercent !== undefined ? { tokenSavingsPercent } : {}),
         ...(fetchResult.autoInteract !== undefined ? { autoInteract: fetchResult.autoInteract } : {}),
+        trust,
     };
 }

package/dist/core/safe-browsing.d.ts ADDED Viewed

@@ -0,0 +1,22 @@
+/**
+ * Domain safety check using Google Safe Browsing Lookup API v4.
+ * Free: 10,000 lookups/day.
+ * Falls back to a local blocklist when no API key is configured.
+ */
+export interface SafeBrowsingResult {
+    safe: boolean;
+    threats: string[];
+    source: 'google-api' | 'local-blocklist' | 'unchecked';
+}
+/**
+ * Check URL safety.
+ *
+ * Flow:
+ * 1. If SAFE_BROWSING_API_KEY (or passed apiKey) is set, race Google API vs 2s timeout.
+ *    Falls back to local blocklist on timeout or error.
+ * 2. Without an API key, use local heuristic blocklist only.
+ *
+ * @param url    The URL to check
+ * @param apiKey Google Safe Browsing API key (optional). Falls back to SAFE_BROWSING_API_KEY env var.
+ */
+export declare function checkUrlSafety(url: string, apiKey?: string): Promise<SafeBrowsingResult>;

package/dist/core/safe-browsing.js ADDED Viewed

@@ -0,0 +1,183 @@
+/**
+ * Domain safety check using Google Safe Browsing Lookup API v4.
+ * Free: 10,000 lookups/day.
+ * Falls back to a local blocklist when no API key is configured.
+ */
+// Known brands commonly impersonated in phishing
+const KNOWN_BRANDS = [
+    'amazon', 'google', 'facebook', 'apple', 'microsoft', 'paypal', 'netflix',
+    'instagram', 'twitter', 'linkedin', 'dropbox', 'chase', 'wellsfargo', 'bankofamerica',
+    'citibank', 'hsbc', 'ebay', 'walmart', 'target', 'bestbuy', 'fedex', 'ups', 'usps',
+    'irs', 'dmv', 'gov', 'yahoo', 'outlook', 'hotmail',
+];
+// TLDs heavily abused for phishing/malware (free-domain registrars)
+const SUSPICIOUS_TLDS = new Set(['.tk', '.ml', '.ga', '.cf', '.gq', '.top', '.click', '.loan', '.win', '.xyz', '.club', '.work']);
+// Private/reserved IPv4 ranges (safe for local dev)
+const PRIVATE_IP_RANGES = [
+    /^127\.\d+\.\d+\.\d+$/, // loopback
+    /^10\.\d+\.\d+\.\d+$/, // RFC 1918
+    /^192\.168\.\d+\.\d+$/, // RFC 1918
+    /^172\.(1[6-9]|2\d|3[01])\.\d+\.\d+$/, // RFC 1918
+    /^169\.254\.\d+\.\d+$/, // link-local
+    /^::1$/, // IPv6 loopback
+    /^fc00:/, // IPv6 private
+    /^fd[0-9a-f]{2}:/i, // IPv6 ULA
+];
+function isPrivateIp(host) {
+    return PRIVATE_IP_RANGES.some((re) => re.test(host));
+}
+function isIpAddress(host) {
+    // IPv4
+    if (/^\d{1,3}(\.\d{1,3}){3}$/.test(host))
+        return true;
+    // IPv6 (bare or bracketed)
+    if (/^\[?[0-9a-fA-F:]+\]?$/.test(host))
+        return true;
+    return false;
+}
+/**
+ * Local heuristic blocklist — catches common attack patterns without an API key.
+ */
+function checkLocalBlocklist(url) {
+    const threats = [];
+    // 1. Data URIs — always suspicious
+    if (/^data:/i.test(url.trim())) {
+        threats.push('DATA_URI');
+        return { safe: false, threats, source: 'local-blocklist' };
+    }
+    let parsed = null;
+    try {
+        parsed = new URL(url);
+    }
+    catch {
+        // Unparseable URL — flag as suspicious
+        threats.push('INVALID_URL');
+        return { safe: false, threats, source: 'local-blocklist' };
+    }
+    const { hostname, username, password } = parsed;
+    // 2. @ sign trick: http://google.com@evil.com/login → username = 'google.com'
+    if (username || password) {
+        threats.push('URL_CREDENTIALS_TRICK');
+        return { safe: false, threats, source: 'local-blocklist' };
+    }
+    // 3. Punycode homograph attacks (xn-- internationalized domains)
+    if (/\bxn--/i.test(hostname)) {
+        // Allow legitimate IDN TLDs (e.g. .xn--p1ai = .рф)
+        const parts = hostname.split('.');
+        const hasPunycodeLabel = parts.slice(0, -1).some((p) => /^xn--/i.test(p));
+        if (hasPunycodeLabel) {
+            threats.push('PUNYCODE_HOMOGRAPH');
+        }
+    }
+    // 4. IP-only URLs pointing to non-private ranges
+    if (isIpAddress(hostname)) {
+        const bare = hostname.replace(/^\[|\]$/g, ''); // strip brackets from IPv6
+        if (!isPrivateIp(bare)) {
+            threats.push('SUSPICIOUS_IP');
+        }
+        if (threats.length > 0)
+            return { safe: false, threats, source: 'local-blocklist' };
+        return { safe: true, threats: [], source: 'local-blocklist' };
+    }
+    const lowerHost = hostname.toLowerCase();
+    // Remove www prefix for analysis
+    const hostNoWww = lowerHost.replace(/^www\./, '');
+    const parts = hostNoWww.split('.');
+    const tld = parts.length >= 2 ? '.' + parts[parts.length - 1] : '';
+    const sld = parts.length >= 2 ? parts[parts.length - 2] : '';
+    // 5. Known-bad TLDs combined with brand names (amazon-login.tk)
+    if (SUSPICIOUS_TLDS.has(tld)) {
+        const containsBrand = KNOWN_BRANDS.some((brand) => hostNoWww.includes(brand));
+        if (containsBrand) {
+            threats.push('PHISHING');
+        }
+    }
+    // 6. Excessive hyphens in SLD (amaz0n-login-verify-account.com)
+    const hyphenCount = (sld.match(/-/g) || []).length;
+    if (hyphenCount >= 3) {
+        threats.push('EXCESSIVE_HYPHENS');
+    }
+    // 7. Brand name in subdomain combined with suspicious TLD
+    if (SUSPICIOUS_TLDS.has(tld)) {
+        const subdomains = parts.slice(0, -2).join('.');
+        const subHasBrand = KNOWN_BRANDS.some((brand) => subdomains.includes(brand));
+        if (subHasBrand && !threats.includes('PHISHING')) {
+            threats.push('PHISHING');
+        }
+    }
+    // 8. Excessive subdomains: login.secure.verify.account.bank.xyz.com
+    if (parts.length > 5) {
+        threats.push('EXCESSIVE_SUBDOMAINS');
+    }
+    if (threats.length > 0) {
+        return { safe: false, threats, source: 'local-blocklist' };
+    }
+    return { safe: true, threats: [], source: 'local-blocklist' };
+}
+/**
+ * Check a URL against the Google Safe Browsing Lookup API v4.
+ * Returns null on any error (network timeout, bad key, etc.) so caller can fall back.
+ */
+async function checkGoogleSafeBrowsing(url, apiKey) {
+    const endpoint = `https://safebrowsing.googleapis.com/v4/threatMatches:find?key=${encodeURIComponent(apiKey)}`;
+    const body = {
+        client: { clientId: 'webpeel', clientVersion: '1.0.0' },
+        threatInfo: {
+            threatTypes: ['MALWARE', 'SOCIAL_ENGINEERING', 'UNWANTED_SOFTWARE', 'POTENTIALLY_HARMFUL_APPLICATION'],
+            platformTypes: ['ANY_PLATFORM'],
+            threatEntryTypes: ['URL'],
+            threatEntries: [{ url }],
+        },
+    };
+    const controller = new AbortController();
+    const timeoutId = setTimeout(() => controller.abort(), 2000);
+    try {
+        const resp = await fetch(endpoint, {
+            method: 'POST',
+            headers: { 'Content-Type': 'application/json' },
+            body: JSON.stringify(body),
+            signal: controller.signal,
+        });
+        clearTimeout(timeoutId);
+        if (!resp.ok)
+            return null;
+        const data = await resp.json();
+        if (!data.matches || data.matches.length === 0) {
+            return { safe: true, threats: [], source: 'google-api' };
+        }
+        const threats = [...new Set(data.matches.map((m) => m.threatType))];
+        return { safe: false, threats, source: 'google-api' };
+    }
+    catch {
+        clearTimeout(timeoutId);
+        return null;
+    }
+}
+/**
+ * Check URL safety.
+ *
+ * Flow:
+ * 1. If SAFE_BROWSING_API_KEY (or passed apiKey) is set, race Google API vs 2s timeout.
+ *    Falls back to local blocklist on timeout or error.
+ * 2. Without an API key, use local heuristic blocklist only.
+ *
+ * @param url    The URL to check
+ * @param apiKey Google Safe Browsing API key (optional). Falls back to SAFE_BROWSING_API_KEY env var.
+ */
+export async function checkUrlSafety(url, apiKey) {
+    const key = apiKey ?? process.env.SAFE_BROWSING_API_KEY;
+    if (key) {
+        // Race: Google API with 2s timeout, fallback to local
+        const timeoutResult = checkLocalBlocklist(url);
+        const googleResult = await Promise.race([
+            checkGoogleSafeBrowsing(url, key),
+            new Promise((resolve) => setTimeout(() => resolve(null), 2000)),
+        ]);
+        if (googleResult !== null)
+            return googleResult;
+        // API timed out or errored — use local blocklist result
+        return timeoutResult;
+    }
+    // No API key — local blocklist only
+    return checkLocalBlocklist(url);
+}

package/dist/index.d.ts CHANGED Viewed

@@ -42,6 +42,9 @@ export type SearchFallbackResult = {
 };
 export declare function searchFallback(..._args: any[]): Promise<SearchFallbackResult | null>;
 export { peelTLSFetch, isPeelTLSAvailable, shutdownPeelTLS, type PeelTLSOptions, type PeelTLSResult } from './core/peel-tls.js';
+export { sanitizeForLLM, type SanitizeResult } from './core/prompt-guard.js';
+export { getSourceCredibility, type SourceCredibility } from './core/source-credibility.js';
+export { checkUrlSafety, type SafeBrowsingResult } from './core/safe-browsing.js';
 /**
  * Fetch and extract content from a URL
  *

package/dist/index.js CHANGED Viewed

@@ -5,6 +5,7 @@
  */
 import { cleanup, warmup, closePool, scrollAndWait, closeProfileBrowser } from './core/fetcher.js';
 import { createContext, normalizeOptions, handleYouTube, fetchContent, detectContentType, parseContent, postProcess, finalize, buildResult, } from './core/pipeline.js';
+import { checkUrlSafety } from './core/safe-browsing.js';
 export * from './types.js';
 export { getDomainExtractor, extractDomainData } from './core/domain-extractors.js';
 export { crawl } from './core/crawler.js';
@@ -47,6 +48,9 @@ export async function searchFallback(..._args) {
     }
 }
 export { peelTLSFetch, isPeelTLSAvailable, shutdownPeelTLS } from './core/peel-tls.js';
+export { sanitizeForLLM } from './core/prompt-guard.js';
+export { getSourceCredibility } from './core/source-credibility.js';
+export { checkUrlSafety } from './core/safe-browsing.js';
 /**
  * Fetch and extract content from a URL
  *
@@ -66,16 +70,34 @@ export { peelTLSFetch, isPeelTLSAvailable, shutdownPeelTLS } from './core/peel-t
 export async function peel(url, options = {}) {
     const ctx = createContext(url, options);
     normalizeOptions(ctx);
+    // Safe Browsing check — runs before any HTTP request, non-blocking
+    const sbResult = await checkUrlSafety(url, process.env.SAFE_BROWSING_API_KEY);
+    ctx.safeBrowsingResult = sbResult;
+    if (!sbResult.safe) {
+        const threatList = sbResult.threats.join(', ');
+        ctx.warnings.push(`⚠️ URL flagged by Safe Browsing: ${threatList}`);
+    }
     const ytResult = await handleYouTube(ctx);
-    if (ytResult)
-        return ytResult;
+    if (ytResult) {
+        // Attach safe browsing to YouTube results too
+        return {
+            ...ytResult,
+            safeBrowsing: sbResult,
+            ...(ytResult.warnings || ctx.warnings.length > 0
+                ? { warnings: [...(ytResult.warnings ?? []), ...ctx.warnings.filter(w => !ytResult.warnings?.includes(w))] }
+                : {}),
+        };
+    }
     try {
         await fetchContent(ctx);
         detectContentType(ctx);
         await parseContent(ctx);
         await postProcess(ctx);
         await finalize(ctx);
-        return buildResult(ctx);
+        const result = buildResult(ctx);
+        // Attach safe browsing result
+        result.safeBrowsing = sbResult;
+        return result;
     }
     catch (error) {
         // Clean up browser resources on error

package/dist/server/pg-auth-store.js CHANGED Viewed

@@ -52,7 +52,7 @@ export class PostgresAuthStore {
         title TEXT,
         content TEXT NOT NULL,
         tokens INTEGER,
-        created_by TEXT REFERENCES users(id),
+        created_by TEXT,
         created_at TIMESTAMPTZ DEFAULT NOW(),
         expires_at TIMESTAMPTZ DEFAULT NOW() + INTERVAL '30 days',
         view_count INTEGER DEFAULT 0

package/dist/types.d.ts CHANGED Viewed

@@ -339,6 +339,26 @@ export interface PeelResult {
     rawTokenEstimate?: number;
     /** Token savings percentage compared to raw HTML (how much cheaper WebPeel is) */
     tokenSavingsPercent?: number;
+    /** Trust & safety assessment of the fetched content */
+    trust?: {
+        /** Source credibility tier */
+        source: {
+            tier: 'official' | 'verified' | 'general';
+            stars: number;
+            label: string;
+        };
+        /** Prompt injection scan result */
+        contentSafety: {
+            clean: boolean;
+            injectionDetected: boolean;
+            detectedPatterns: string[];
+            strippedCount: number;
+        };
+        /** Overall trust score 0-1 (composite of source + content safety) */
+        score: number;
+        /** Human-readable safety warnings */
+        warnings: string[];
+    };
     /** Content chunks (when chunk option is enabled) */
     chunks?: Array<{
         index: number;
@@ -350,6 +370,12 @@ export interface PeelResult {
         startOffset: number;
         endOffset: number;
     }>;
+    /** Safe Browsing check result */
+    safeBrowsing?: {
+        safe: boolean;
+        threats: string[];
+        source: 'google-api' | 'local-blocklist' | 'unchecked';
+    };
 }
 export interface PageMetadata {
     /** Meta description */

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "webpeel",
-  "version": "0.21.57",
+  "version": "0.21.59",
   "description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
   "author": "Jake Liu",
   "license": "AGPL-3.0-only",