webpeel 0.21.57 → 0.21.59

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -13,6 +13,7 @@ import type { PeelOptions, PeelResult, ImageInfo } from '../types.js';
13
13
  import type { BrandingProfile } from './branding.js';
14
14
  import type { ChangeResult } from './change-tracking.js';
15
15
  import type { DesignAnalysis } from './design-analysis.js';
16
+ import type { SafeBrowsingResult } from './safe-browsing.js';
16
17
  /** Mutable context threaded through pipeline stages */
17
18
  export interface PipelineContext {
18
19
  url: string;
@@ -81,6 +82,8 @@ export interface PipelineContext {
81
82
  warnings: string[];
82
83
  /** Raw HTML size in characters (measured from fetched content before any conversion) */
83
84
  rawHtmlSize?: number;
85
+ /** Safe Browsing check result (set early in pipeline, before fetch) */
86
+ safeBrowsingResult?: SafeBrowsingResult;
84
87
  }
85
88
  /** Create the initial PipelineContext with defaults */
86
89
  export declare function createContext(url: string, options: PeelOptions): PipelineContext;
@@ -20,6 +20,8 @@ import { quickAnswer as runQuickAnswer } from './quick-answer.js';
20
20
  import { Timer } from './timing.js';
21
21
  import { chunkContent } from './chunker.js';
22
22
  import { BlockedError } from '../types.js';
23
+ import { sanitizeForLLM } from './prompt-guard.js';
24
+ import { getSourceCredibility } from './source-credibility.js';
23
25
  import { createLogger } from './logger.js';
24
26
  const log = createLogger('pipeline');
25
27
  /** Create the initial PipelineContext with defaults */
@@ -1245,6 +1247,48 @@ export async function finalize(ctx) {
1245
1247
  export function buildResult(ctx) {
1246
1248
  const fetchResult = ctx.fetchResult;
1247
1249
  const elapsed = Date.now() - ctx.startTime;
1250
+ // --- Trust & Safety ---
1251
+ // Run prompt injection scan on final content
1252
+ const sanitizeResult = sanitizeForLLM(ctx.content);
1253
+ // If injection was detected, use the cleaned content
1254
+ if (sanitizeResult.injectionDetected) {
1255
+ ctx.content = sanitizeResult.content;
1256
+ ctx.warnings.push('Prompt injection patterns detected and stripped from content.');
1257
+ }
1258
+ // Assess source credibility
1259
+ const credibility = getSourceCredibility(ctx.url);
1260
+ // Compute composite trust score
1261
+ let trustScore = 1.0;
1262
+ if (credibility.tier === 'general')
1263
+ trustScore -= 0.2;
1264
+ if (sanitizeResult.injectionDetected)
1265
+ trustScore -= 0.5;
1266
+ if ((ctx.quality ?? 1.0) < 0.5)
1267
+ trustScore -= 0.1;
1268
+ trustScore = Math.max(0, Math.min(1, trustScore));
1269
+ // Build trust warnings
1270
+ const trustWarnings = [];
1271
+ if (credibility.tier === 'general')
1272
+ trustWarnings.push('Source is unverified (not a known official or trusted domain).');
1273
+ if (sanitizeResult.injectionDetected)
1274
+ trustWarnings.push(`Prompt injection detected: ${sanitizeResult.detectedPatterns.join(', ')}`);
1275
+ if (sanitizeResult.strippedChars > 0)
1276
+ trustWarnings.push(`Stripped ${sanitizeResult.strippedChars} suspicious characters (zero-width/Unicode smuggling).`);
1277
+ const trust = {
1278
+ source: {
1279
+ tier: credibility.tier,
1280
+ stars: credibility.stars,
1281
+ label: credibility.label,
1282
+ },
1283
+ contentSafety: {
1284
+ clean: !sanitizeResult.injectionDetected,
1285
+ injectionDetected: sanitizeResult.injectionDetected,
1286
+ detectedPatterns: sanitizeResult.detectedPatterns,
1287
+ strippedCount: sanitizeResult.strippedChars,
1288
+ },
1289
+ score: trustScore,
1290
+ warnings: trustWarnings,
1291
+ };
1248
1292
  const tokens = estimateTokens(ctx.content);
1249
1293
  const fingerprint = createHash('sha256').update(ctx.content).digest('hex').slice(0, 16);
1250
1294
  // Token savings metrics — only when raw HTML size was captured (from actual fetch or domain extractor)
@@ -1342,5 +1386,6 @@ export function buildResult(ctx) {
1342
1386
  ...(rawTokenEstimate !== undefined ? { rawTokenEstimate } : {}),
1343
1387
  ...(tokenSavingsPercent !== undefined ? { tokenSavingsPercent } : {}),
1344
1388
  ...(fetchResult.autoInteract !== undefined ? { autoInteract: fetchResult.autoInteract } : {}),
1389
+ trust,
1345
1390
  };
1346
1391
  }
@@ -0,0 +1,22 @@
1
+ /**
2
+ * Domain safety check using Google Safe Browsing Lookup API v4.
3
+ * Free: 10,000 lookups/day.
4
+ * Falls back to a local blocklist when no API key is configured.
5
+ */
6
/** Outcome of a URL safety check. */
export interface SafeBrowsingResult {
    /** `false` when at least one threat was reported for the URL. */
    safe: boolean;
    /** Threat identifiers reported by the checker; empty when `safe` is `true`. */
    threats: string[];
    /**
     * Which checker produced this result.
     * NOTE(review): nothing visible here returns 'unchecked' — presumably reserved
     * for callers that skip the check; confirm.
     */
    source: 'google-api' | 'local-blocklist' | 'unchecked';
}
11
/**
 * Check whether a URL looks safe to fetch.
 *
 * With an API key (argument or SAFE_BROWSING_API_KEY env var) the Google Safe
 * Browsing API is queried under a 2s deadline; on timeout or error the local
 * heuristic blocklist decides instead. Without a key only the local blocklist runs.
 *
 * @param url The URL to check
 * @param apiKey Google Safe Browsing API key (optional). Falls back to SAFE_BROWSING_API_KEY env var.
 * @returns The verdict together with the checker that produced it.
 */
export declare function checkUrlSafety(url: string, apiKey?: string): Promise<SafeBrowsingResult>;
@@ -0,0 +1,183 @@
1
+ /**
2
+ * Domain safety check using Google Safe Browsing Lookup API v4.
3
+ * Free: 10,000 lookups/day.
4
+ * Falls back to a local blocklist when no API key is configured.
5
+ */
6
/** Brand names commonly impersonated in phishing hostnames (all lowercase). */
const KNOWN_BRANDS = [
    'amazon', 'google', 'facebook', 'apple', 'microsoft',
    'paypal', 'netflix', 'instagram', 'twitter', 'linkedin',
    'dropbox', 'chase', 'wellsfargo', 'bankofamerica', 'citibank',
    'hsbc', 'ebay', 'walmart', 'target', 'bestbuy',
    'fedex', 'ups', 'usps', 'irs', 'dmv',
    'gov', 'yahoo', 'outlook', 'hotmail',
];
13
/** TLDs heavily abused for phishing/malware (free-domain registrars); each entry keeps its leading dot. */
const SUSPICIOUS_TLDS = new Set([
    '.tk', '.ml', '.ga', '.cf', '.gq',
    '.top', '.click', '.loan', '.win',
    '.xyz', '.club', '.work',
]);
15
/**
 * Private/reserved address ranges (safe for local dev).
 * Patterns are tested against a bare address: no brackets around IPv6.
 */
const PRIVATE_IP_RANGES = [
    /^127\.\d+\.\d+\.\d+$/, // IPv4 loopback
    /^10\.\d+\.\d+\.\d+$/, // RFC 1918
    /^192\.168\.\d+\.\d+$/, // RFC 1918
    /^172\.(1[6-9]|2\d|3[01])\.\d+\.\d+$/, // RFC 1918
    /^169\.254\.\d+\.\d+$/, // IPv4 link-local
    /^::1$/, // IPv6 loopback
    // IPv6 unique-local fc00::/7: the first hextet is fcXX or fdXX, not just the literal "fc00".
    /^f[cd][0-9a-f]{2}:/i,
    // IPv6 link-local fe80::/10 (fe80–febf), the counterpart of 169.254.0.0/16 above.
    /^fe[89ab][0-9a-f]:/i,
];
26
/**
 * Report whether a bare IP address string matches any PRIVATE_IP_RANGES pattern.
 * @param host Address text without surrounding brackets.
 * @returns true on the first matching pattern, false when none match.
 */
function isPrivateIp(host) {
    for (const pattern of PRIVATE_IP_RANGES) {
        if (pattern.test(host)) {
            return true;
        }
    }
    return false;
}
29
/**
 * Report whether a hostname is an IP literal rather than a DNS name.
 * @param host Hostname as found in a URL; IPv6 may be bare or bracketed.
 * @returns true for dotted-quad IPv4 or colon-separated hex IPv6.
 */
function isIpAddress(host) {
    // IPv4: four dot-separated groups of 1-3 digits.
    if (/^\d{1,3}(\.\d{1,3}){3}$/.test(host)) {
        return true;
    }
    // IPv6 (bare or bracketed). A colon is required: without it, hex-only
    // single-label hostnames such as "cafe" or "deadbeef" would count as IPs.
    const bare = host.replace(/^\[|\]$/g, '');
    return bare.includes(':') && /^[0-9a-f:]+$/i.test(bare);
}
38
/**
 * Local heuristic blocklist — catches common attack patterns without an API key.
 *
 * Checks, in order: data: URIs, unparseable URLs, embedded credentials
 * (each returns immediately), then punycode labels, raw public IPs,
 * brand names on abuse-prone TLDs, hyphen-stuffed domains and deep subdomain chains.
 *
 * @param url The URL string to inspect.
 * @returns `{ safe, threats, source: 'local-blocklist' }`; `threats` is empty when safe.
 */
function checkLocalBlocklist(url) {
    const threats = [];
    const flagged = () => ({ safe: false, threats, source: 'local-blocklist' });
    const clean = () => ({ safe: true, threats: [], source: 'local-blocklist' });
    // 1. Data URIs — always suspicious
    if (/^data:/i.test(url.trim())) {
        threats.push('DATA_URI');
        return flagged();
    }
    let parsed;
    try {
        parsed = new URL(url);
    }
    catch {
        // Unparseable URL — flag as suspicious
        threats.push('INVALID_URL');
        return flagged();
    }
    // 2. @ sign trick: http://google.com@evil.com/login → username = 'google.com'
    if (parsed.username || parsed.password) {
        threats.push('URL_CREDENTIALS_TRICK');
        return flagged();
    }
    // The URL parser keeps a trailing root dot ("amazon-login.tk."). Left in place,
    // the last label is empty, so the TLD/SLD checks below would look at the wrong
    // labels and a trailing dot would bypass them. Strip it first.
    const host = parsed.hostname.toLowerCase().replace(/\.+$/, '');
    // 3. Punycode homograph attacks (xn-- internationalized domains).
    //    The final label is skipped so legitimate IDN TLDs (e.g. .xn--p1ai = .рф) pass.
    if (host.split('.').slice(0, -1).some((label) => label.startsWith('xn--'))) {
        threats.push('PUNYCODE_HOMOGRAPH');
    }
    // 4. IP-only URLs pointing to non-private ranges
    if (isIpAddress(host)) {
        const bare = host.replace(/^\[|\]$/g, ''); // strip brackets from IPv6
        if (!isPrivateIp(bare)) {
            threats.push('SUSPICIOUS_IP');
        }
        return threats.length > 0 ? flagged() : clean();
    }
    // Remove www prefix for analysis
    const hostNoWww = host.replace(/^www\./, '');
    const parts = hostNoWww.split('.');
    const tld = parts.length >= 2 ? '.' + parts[parts.length - 1] : '';
    const sld = parts.length >= 2 ? parts[parts.length - 2] : '';
    // 5. Known-bad TLDs combined with a brand name anywhere in the host
    //    (amazon-login.tk, paypal.secure.xyz). The whole host is searched, so this
    //    also covers brands in subdomains; no separate subdomain pass is needed.
    //    NOTE(review): substring matching is broad ('ups' matches "groups") — confirm acceptable.
    if (SUSPICIOUS_TLDS.has(tld) && KNOWN_BRANDS.some((brand) => hostNoWww.includes(brand))) {
        threats.push('PHISHING');
    }
    // 6. Excessive hyphens in SLD (amaz0n-login-verify-account.com)
    const hyphenCount = (sld.match(/-/g) || []).length;
    if (hyphenCount >= 3) {
        threats.push('EXCESSIVE_HYPHENS');
    }
    // 7. Excessive subdomains: login.secure.verify.account.bank.xyz.com
    if (parts.length > 5) {
        threats.push('EXCESSIVE_SUBDOMAINS');
    }
    return threats.length > 0 ? flagged() : clean();
}
117
/**
 * Check a URL against the Google Safe Browsing Lookup API v4.
 * Returns null on any error (network timeout, bad key, malformed response, etc.)
 * so the caller can fall back.
 *
 * @param url The URL to look up.
 * @param apiKey Google Safe Browsing API key.
 * @returns A 'google-api' result, or null when no verdict could be obtained.
 */
async function checkGoogleSafeBrowsing(url, apiKey) {
    const endpoint = `https://safebrowsing.googleapis.com/v4/threatMatches:find?key=${encodeURIComponent(apiKey)}`;
    const body = {
        client: { clientId: 'webpeel', clientVersion: '1.0.0' },
        threatInfo: {
            threatTypes: ['MALWARE', 'SOCIAL_ENGINEERING', 'UNWANTED_SOFTWARE', 'POTENTIALLY_HARMFUL_APPLICATION'],
            platformTypes: ['ANY_PLATFORM'],
            threatEntryTypes: ['URL'],
            threatEntries: [{ url }],
        },
    };
    const controller = new AbortController();
    const timeoutId = setTimeout(() => controller.abort(), 2000);
    try {
        const resp = await fetch(endpoint, {
            method: 'POST',
            headers: { 'Content-Type': 'application/json' },
            body: JSON.stringify(body),
            signal: controller.signal,
        });
        if (!resp.ok)
            return null;
        // The abort timer stays armed while the body is read, so a stalled
        // response body is bounded by the same 2s budget as the request.
        const data = await resp.json();
        if (!data.matches) {
            return { safe: true, threats: [], source: 'google-api' };
        }
        // A non-array `matches` is a malformed response: no verdict, let the caller fall back.
        if (!Array.isArray(data.matches))
            return null;
        if (data.matches.length === 0) {
            return { safe: true, threats: [], source: 'google-api' };
        }
        const threats = [...new Set(data.matches.map((m) => m.threatType))];
        return { safe: false, threats, source: 'google-api' };
    }
    catch {
        return null;
    }
    finally {
        // Single cleanup point for every exit path.
        clearTimeout(timeoutId);
    }
}
156
/**
 * Check URL safety.
 *
 * Flow:
 * 1. If SAFE_BROWSING_API_KEY (or passed apiKey) is set, race Google API vs 2s timeout.
 *    Falls back to local blocklist when the API yields no verdict in time.
 * 2. Without an API key, use local heuristic blocklist only.
 *
 * @param url The URL to check
 * @param apiKey Google Safe Browsing API key (optional). Falls back to SAFE_BROWSING_API_KEY env var.
 * @returns The Google verdict when available, otherwise the local blocklist verdict.
 */
export async function checkUrlSafety(url, apiKey) {
    const key = apiKey ?? process.env.SAFE_BROWSING_API_KEY;
    // No API key — local blocklist only
    if (!key) {
        return checkLocalBlocklist(url);
    }
    // Race: Google API against a 2s deadline that resolves to null.
    let deadlineTimer;
    const deadline = new Promise((resolve) => {
        deadlineTimer = setTimeout(() => resolve(null), 2000);
    });
    try {
        const googleResult = await Promise.race([checkGoogleSafeBrowsing(url, key), deadline]);
        if (googleResult !== null)
            return googleResult;
    }
    finally {
        // Cancel the deadline once the race settles; a pending timer would keep
        // the Node event loop alive for up to 2s after every lookup.
        clearTimeout(deadlineTimer);
    }
    // API timed out or errored — use local blocklist result
    return checkLocalBlocklist(url);
}
package/dist/index.d.ts CHANGED
@@ -42,6 +42,9 @@ export type SearchFallbackResult = {
42
42
  };
43
43
  export declare function searchFallback(..._args: any[]): Promise<SearchFallbackResult | null>;
44
44
  export { peelTLSFetch, isPeelTLSAvailable, shutdownPeelTLS, type PeelTLSOptions, type PeelTLSResult } from './core/peel-tls.js';
45
+ export { sanitizeForLLM, type SanitizeResult } from './core/prompt-guard.js';
46
+ export { getSourceCredibility, type SourceCredibility } from './core/source-credibility.js';
47
+ export { checkUrlSafety, type SafeBrowsingResult } from './core/safe-browsing.js';
45
48
  /**
46
49
  * Fetch and extract content from a URL
47
50
  *
package/dist/index.js CHANGED
@@ -5,6 +5,7 @@
5
5
  */
6
6
  import { cleanup, warmup, closePool, scrollAndWait, closeProfileBrowser } from './core/fetcher.js';
7
7
  import { createContext, normalizeOptions, handleYouTube, fetchContent, detectContentType, parseContent, postProcess, finalize, buildResult, } from './core/pipeline.js';
8
+ import { checkUrlSafety } from './core/safe-browsing.js';
8
9
  export * from './types.js';
9
10
  export { getDomainExtractor, extractDomainData } from './core/domain-extractors.js';
10
11
  export { crawl } from './core/crawler.js';
@@ -47,6 +48,9 @@ export async function searchFallback(..._args) {
47
48
  }
48
49
  }
49
50
  export { peelTLSFetch, isPeelTLSAvailable, shutdownPeelTLS } from './core/peel-tls.js';
51
+ export { sanitizeForLLM } from './core/prompt-guard.js';
52
+ export { getSourceCredibility } from './core/source-credibility.js';
53
+ export { checkUrlSafety } from './core/safe-browsing.js';
50
54
  /**
51
55
  * Fetch and extract content from a URL
52
56
  *
@@ -66,16 +70,34 @@ export { peelTLSFetch, isPeelTLSAvailable, shutdownPeelTLS } from './core/peel-t
66
70
  export async function peel(url, options = {}) {
67
71
  const ctx = createContext(url, options);
68
72
  normalizeOptions(ctx);
73
+ // Safe Browsing check — awaited before the page itself is fetched; advisory only (adds a warning, never aborts the peel)
74
+ const sbResult = await checkUrlSafety(url, process.env.SAFE_BROWSING_API_KEY);
75
+ ctx.safeBrowsingResult = sbResult;
76
+ if (!sbResult.safe) {
77
+ const threatList = sbResult.threats.join(', ');
78
+ ctx.warnings.push(`⚠️ URL flagged by Safe Browsing: ${threatList}`);
79
+ }
69
80
  const ytResult = await handleYouTube(ctx);
70
- if (ytResult)
71
- return ytResult;
81
+ if (ytResult) {
82
+ // Attach safe browsing to YouTube results too
83
+ return {
84
+ ...ytResult,
85
+ safeBrowsing: sbResult,
86
+ ...(ytResult.warnings || ctx.warnings.length > 0
87
+ ? { warnings: [...(ytResult.warnings ?? []), ...ctx.warnings.filter(w => !ytResult.warnings?.includes(w))] }
88
+ : {}),
89
+ };
90
+ }
72
91
  try {
73
92
  await fetchContent(ctx);
74
93
  detectContentType(ctx);
75
94
  await parseContent(ctx);
76
95
  await postProcess(ctx);
77
96
  await finalize(ctx);
78
- return buildResult(ctx);
97
+ const result = buildResult(ctx);
98
+ // Attach safe browsing result
99
+ result.safeBrowsing = sbResult;
100
+ return result;
79
101
  }
80
102
  catch (error) {
81
103
  // Clean up browser resources on error
@@ -52,7 +52,7 @@ export class PostgresAuthStore {
52
52
  title TEXT,
53
53
  content TEXT NOT NULL,
54
54
  tokens INTEGER,
55
- created_by TEXT REFERENCES users(id),
55
+ created_by TEXT,
56
56
  created_at TIMESTAMPTZ DEFAULT NOW(),
57
57
  expires_at TIMESTAMPTZ DEFAULT NOW() + INTERVAL '30 days',
58
58
  view_count INTEGER DEFAULT 0
package/dist/types.d.ts CHANGED
@@ -339,6 +339,26 @@ export interface PeelResult {
339
339
  rawTokenEstimate?: number;
340
340
  /** Token savings percentage compared to raw HTML (how much cheaper WebPeel is) */
341
341
  tokenSavingsPercent?: number;
342
+ /** Trust & safety assessment of the fetched content */
343
+ trust?: {
344
+ /** Source credibility tier */
345
+ source: {
346
+ tier: 'official' | 'verified' | 'general';
347
+ stars: number;
348
+ label: string;
349
+ };
350
+ /** Prompt injection scan result */
351
+ contentSafety: {
352
+ clean: boolean;
353
+ injectionDetected: boolean;
354
+ detectedPatterns: string[];
355
+ strippedCount: number;
356
+ };
357
+ /** Overall trust score 0-1 (composite of source + content safety) */
358
+ score: number;
359
+ /** Human-readable safety warnings */
360
+ warnings: string[];
361
+ };
342
362
  /** Content chunks (when chunk option is enabled) */
343
363
  chunks?: Array<{
344
364
  index: number;
@@ -350,6 +370,12 @@ export interface PeelResult {
350
370
  startOffset: number;
351
371
  endOffset: number;
352
372
  }>;
373
+ /** Safe Browsing check result */
374
+ safeBrowsing?: {
375
+ safe: boolean;
376
+ threats: string[];
377
+ source: 'google-api' | 'local-blocklist' | 'unchecked';
378
+ };
353
379
  }
354
380
  export interface PageMetadata {
355
381
  /** Meta description */
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "webpeel",
3
- "version": "0.21.57",
3
+ "version": "0.21.59",
4
4
  "description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
5
5
  "author": "Jake Liu",
6
6
  "license": "AGPL-3.0-only",