webpeel 0.21.58 → 0.21.60

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -13,6 +13,7 @@ import type { PeelOptions, PeelResult, ImageInfo } from '../types.js';
13
13
  import type { BrandingProfile } from './branding.js';
14
14
  import type { ChangeResult } from './change-tracking.js';
15
15
  import type { DesignAnalysis } from './design-analysis.js';
16
+ import type { SafeBrowsingResult } from './safe-browsing.js';
16
17
  /** Mutable context threaded through pipeline stages */
17
18
  export interface PipelineContext {
18
19
  url: string;
@@ -81,6 +82,8 @@ export interface PipelineContext {
81
82
  warnings: string[];
82
83
  /** Raw HTML size in characters (measured from fetched content before any conversion) */
83
84
  rawHtmlSize?: number;
85
+ /** Safe Browsing check result (set early in pipeline, before fetch) */
86
+ safeBrowsingResult?: SafeBrowsingResult;
84
87
  }
85
88
  /** Create the initial PipelineContext with defaults */
86
89
  export declare function createContext(url: string, options: PeelOptions): PipelineContext;
@@ -20,6 +20,8 @@ import { quickAnswer as runQuickAnswer } from './quick-answer.js';
20
20
  import { Timer } from './timing.js';
21
21
  import { chunkContent } from './chunker.js';
22
22
  import { BlockedError } from '../types.js';
23
+ import { sanitizeForLLM } from './prompt-guard.js';
24
+ import { getSourceCredibility } from './source-credibility.js';
23
25
  import { createLogger } from './logger.js';
24
26
  const log = createLogger('pipeline');
25
27
  /** Create the initial PipelineContext with defaults */
@@ -1245,6 +1247,50 @@ export async function finalize(ctx) {
1245
1247
  export function buildResult(ctx) {
1246
1248
  const fetchResult = ctx.fetchResult;
1247
1249
  const elapsed = Date.now() - ctx.startTime;
1250
+ // --- Trust & Safety ---
1251
+ // Run prompt injection scan on final content
1252
+ const sanitizeResult = sanitizeForLLM(ctx.content);
1253
+ // If injection was detected, use the cleaned content
1254
+ if (sanitizeResult.injectionDetected) {
1255
+ ctx.content = sanitizeResult.content;
1256
+ ctx.warnings.push('Prompt injection patterns detected and stripped from content.');
1257
+ }
1258
+ // Assess source credibility
1259
+ const credibility = getSourceCredibility(ctx.url);
1260
+ // Compute composite trust score from source credibility (0-100) + content safety
1261
+ let trustScore = credibility.score / 100; // normalize 0-100 → 0-1
1262
+ if (sanitizeResult.injectionDetected)
1263
+ trustScore -= 0.3;
1264
+ if ((ctx.quality ?? 1.0) < 0.5)
1265
+ trustScore -= 0.1;
1266
+ trustScore = Math.round(Math.max(0, Math.min(1, trustScore)) * 100) / 100;
1267
+ // Build trust warnings
1268
+ const trustWarnings = [...(credibility.warnings ?? [])];
1269
+ if (credibility.tier === 'new')
1270
+ trustWarnings.push('Domain has limited verifiable presence — exercise caution.');
1271
+ if (credibility.tier === 'suspicious')
1272
+ trustWarnings.push('Domain shows suspicious signals — treat content with caution.');
1273
+ if (sanitizeResult.injectionDetected)
1274
+ trustWarnings.push(`Prompt injection detected: ${sanitizeResult.detectedPatterns.join(', ')}`);
1275
+ if (sanitizeResult.strippedChars > 0)
1276
+ trustWarnings.push(`Stripped ${sanitizeResult.strippedChars} suspicious characters (zero-width/Unicode smuggling).`);
1277
+ const trust = {
1278
+ source: {
1279
+ tier: credibility.tier,
1280
+ score: credibility.score,
1281
+ label: credibility.label,
1282
+ signals: credibility.signals,
1283
+ warnings: credibility.warnings,
1284
+ },
1285
+ contentSafety: {
1286
+ clean: !sanitizeResult.injectionDetected,
1287
+ injectionDetected: sanitizeResult.injectionDetected,
1288
+ detectedPatterns: sanitizeResult.detectedPatterns,
1289
+ strippedCount: sanitizeResult.strippedChars,
1290
+ },
1291
+ score: trustScore,
1292
+ warnings: trustWarnings,
1293
+ };
1248
1294
  const tokens = estimateTokens(ctx.content);
1249
1295
  const fingerprint = createHash('sha256').update(ctx.content).digest('hex').slice(0, 16);
1250
1296
  // Token savings metrics — only when raw HTML size was captured (from actual fetch or domain extractor)
@@ -1342,5 +1388,6 @@ export function buildResult(ctx) {
1342
1388
  ...(rawTokenEstimate !== undefined ? { rawTokenEstimate } : {}),
1343
1389
  ...(tokenSavingsPercent !== undefined ? { tokenSavingsPercent } : {}),
1344
1390
  ...(fetchResult.autoInteract !== undefined ? { autoInteract: fetchResult.autoInteract } : {}),
1391
+ trust,
1345
1392
  };
1346
1393
  }
@@ -0,0 +1,22 @@
1
+ /**
2
+ * Domain safety check using Google Safe Browsing Lookup API v4.
3
+ * Free: 10,000 lookups/day.
4
+ * Falls back to a local blocklist when no API key is configured.
5
+ */
6
+ export interface SafeBrowsingResult {
7
+ safe: boolean;
8
+ threats: string[];
9
+ source: 'google-api' | 'local-blocklist' | 'unchecked';
10
+ }
11
+ /**
12
+ * Check URL safety.
13
+ *
14
+ * Flow:
15
+ * 1. If SAFE_BROWSING_API_KEY (or passed apiKey) is set, race Google API vs 2s timeout.
16
+ * Falls back to local blocklist on timeout or error.
17
+ * 2. Without an API key, use local heuristic blocklist only.
18
+ *
19
+ * @param url The URL to check
20
+ * @param apiKey Google Safe Browsing API key (optional). Falls back to SAFE_BROWSING_API_KEY env var.
21
+ */
22
+ export declare function checkUrlSafety(url: string, apiKey?: string): Promise<SafeBrowsingResult>;
@@ -0,0 +1,183 @@
1
+ /**
2
+ * Domain safety check using Google Safe Browsing Lookup API v4.
3
+ * Free: 10,000 lookups/day.
4
+ * Falls back to a local blocklist when no API key is configured.
5
+ */
6
// Brand names that phishing hostnames most frequently imitate.
// Matching against this list is done by substring, so order is irrelevant to
// the result; it is kept stable for readability.
const KNOWN_BRANDS = [
    'amazon', 'google', 'facebook', 'apple', 'microsoft',
    'paypal', 'netflix', 'instagram', 'twitter', 'linkedin',
    'dropbox', 'chase', 'wellsfargo', 'bankofamerica', 'citibank',
    'hsbc', 'ebay', 'walmart', 'target', 'bestbuy',
    'fedex', 'ups', 'usps', 'irs', 'dmv',
    'gov', 'yahoo', 'outlook', 'hotmail',
];
// Top-level domains (with leading dot) that are heavily abused for
// phishing/malware, largely because registration is free or very cheap.
const SUSPICIOUS_TLDS = new Set([
    '.tk', '.ml', '.ga', '.cf', '.gq', '.top',
    '.click', '.loan', '.win', '.xyz', '.club', '.work',
]);
15
// Private/reserved address ranges (considered safe for local development).
// Each pattern is tested against a bare host string: dotted-quad IPv4, or
// IPv6 text with any surrounding brackets already removed.
const PRIVATE_IP_RANGES = [
    /^127\.\d+\.\d+\.\d+$/, // IPv4 loopback (127.0.0.0/8)
    /^10\.\d+\.\d+\.\d+$/, // RFC 1918 (10.0.0.0/8)
    /^192\.168\.\d+\.\d+$/, // RFC 1918 (192.168.0.0/16)
    /^172\.(1[6-9]|2\d|3[01])\.\d+\.\d+$/, // RFC 1918 (172.16.0.0/12)
    /^169\.254\.\d+\.\d+$/, // IPv4 link-local (169.254.0.0/16)
    /^::1$/, // IPv6 loopback
    // IPv6 unique-local addresses, fc00::/7. The first group must be a full
    // four hex digits starting with "fc" or "fd"; a shortened group such as
    // "fd::1" is 00fd:: and is NOT in this range.
    /^f[cd][0-9a-f]{2}:/i,
    // IPv6 link-local, fe80::/10 (first group fe80 through febf).
    /^fe[89ab][0-9a-f]:/i,
];
26
/**
 * Report whether a bare host string falls in a private/reserved range.
 *
 * @param {string} host IP literal without IPv6 brackets.
 * @returns {boolean} true when any pattern in PRIVATE_IP_RANGES matches.
 */
function isPrivateIp(host) {
    for (const pattern of PRIVATE_IP_RANGES) {
        if (pattern.test(host)) {
            return true;
        }
    }
    return false;
}
29
/**
 * Report whether a hostname is an IP literal rather than a DNS name.
 *
 * @param {string} host Hostname as produced by `URL#hostname` (IPv6 may be
 *   bracketed or bare).
 * @returns {boolean} true for dotted-quad IPv4 or colon-separated IPv6 text.
 */
function isIpAddress(host) {
    // IPv4: four dot-separated groups of 1-3 digits.
    if (/^\d{1,3}(\.\d{1,3}){3}$/.test(host)) {
        return true;
    }
    // IPv6 (bare or bracketed): only hex digits and colons. At least one colon
    // is required — without it, ordinary single-label hostnames made of hex
    // letters ("cafe", "deadbeef") would be misclassified as IP literals.
    return host.includes(':') && /^\[?[0-9a-fA-F:]+\]?$/.test(host);
}
38
/**
 * Local heuristic blocklist — catches common attack patterns without an API key.
 *
 * Checks run in this order; the first three return immediately:
 *   1. `data:` URIs                      → DATA_URI
 *   2. unparseable URL                   → INVALID_URL
 *   3. embedded username/password        → URL_CREDENTIALS_TRICK
 *   4. punycode label (TLD excluded)     → PUNYCODE_HOMOGRAPH
 *   5. IP literal outside private ranges → SUSPICIOUS_IP (returns after this check)
 *   6. known brand + abused TLD          → PHISHING
 *   7. three or more hyphens in the SLD  → EXCESSIVE_HYPHENS
 *   8. more than five host labels        → EXCESSIVE_SUBDOMAINS
 *
 * @param {string} url URL to inspect.
 * @returns {{ safe: boolean, threats: string[], source: 'local-blocklist' }}
 */
function checkLocalBlocklist(url) {
    const threats = [];
    // Single place that turns the accumulated threats into a result object.
    const verdict = () => (threats.length > 0
        ? { safe: false, threats, source: 'local-blocklist' }
        : { safe: true, threats: [], source: 'local-blocklist' });
    // 1. Data URIs — always suspicious
    if (/^data:/i.test(url.trim())) {
        threats.push('DATA_URI');
        return verdict();
    }
    let parsed;
    try {
        parsed = new URL(url);
    }
    catch {
        // 2. Unparseable URL — flag as suspicious
        threats.push('INVALID_URL');
        return verdict();
    }
    // 3. @ sign trick: http://google.com@evil.com/login → username = 'google.com'
    if (parsed.username || parsed.password) {
        threats.push('URL_CREDENTIALS_TRICK');
        return verdict();
    }
    // Normalise the host once. A fully-qualified name may end in a root dot
    // ("amazon-login.tk."); left in place it produces an empty final label, so
    // the TLD/SLD would be misread and the checks below could be bypassed.
    const host = parsed.hostname.toLowerCase().replace(/\.$/, '');
    // 4. Punycode homograph attacks (xn-- internationalized domains).
    //    The last label is skipped so legitimate IDN TLDs (e.g. .xn--p1ai = .рф)
    //    are not flagged on their own.
    if (host.split('.').slice(0, -1).some((label) => label.startsWith('xn--'))) {
        threats.push('PUNYCODE_HOMOGRAPH');
    }
    // 5. IP-only URLs pointing to non-private ranges. Domain heuristics below
    //    do not apply to IP literals, so return right after this check.
    if (isIpAddress(host)) {
        const bare = host.replace(/^\[|\]$/g, ''); // strip brackets from IPv6
        if (!isPrivateIp(bare)) {
            threats.push('SUSPICIOUS_IP');
        }
        return verdict();
    }
    // Remove www prefix for analysis
    const hostNoWww = host.replace(/^www\./, '');
    const parts = hostNoWww.split('.');
    const tld = parts.length >= 2 ? '.' + parts[parts.length - 1] : '';
    const sld = parts.length >= 2 ? parts[parts.length - 2] : '';
    // 6. Abused TLD combined with a brand name anywhere in the host
    //    (amazon-login.tk, paypal.secure.xyz). This covers subdomains as well,
    //    so no separate subdomain pass is needed.
    // NOTE(review): substring matching means short brands ('ups', 'irs', 'gov')
    // also hit unrelated words such as "startups" — confirm this is acceptable.
    if (SUSPICIOUS_TLDS.has(tld) && KNOWN_BRANDS.some((brand) => hostNoWww.includes(brand))) {
        threats.push('PHISHING');
    }
    // 7. Excessive hyphens in SLD (amaz0n-login-verify-account.com)
    const hyphenCount = (sld.match(/-/g) || []).length;
    if (hyphenCount >= 3) {
        threats.push('EXCESSIVE_HYPHENS');
    }
    // 8. Excessive subdomains: login.secure.verify.account.bank.xyz.com
    if (parts.length > 5) {
        threats.push('EXCESSIVE_SUBDOMAINS');
    }
    return verdict();
}
117
/**
 * Check a URL against the Google Safe Browsing Lookup API v4.
 *
 * The request is aborted after 2 seconds. The abort timer stays armed until
 * the response body has been read, so a stalled body is covered by the same
 * timeout as the initial request.
 *
 * @param {string} url URL to look up.
 * @param {string} apiKey Google Safe Browsing API key.
 * @returns {Promise<{ safe: boolean, threats: string[], source: 'google-api' } | null>}
 *   The API verdict, or null on any error (network failure, timeout, non-2xx
 *   status, malformed body) so the caller can fall back.
 */
async function checkGoogleSafeBrowsing(url, apiKey) {
    const endpoint = `https://safebrowsing.googleapis.com/v4/threatMatches:find?key=${encodeURIComponent(apiKey)}`;
    const body = {
        client: { clientId: 'webpeel', clientVersion: '1.0.0' },
        threatInfo: {
            threatTypes: ['MALWARE', 'SOCIAL_ENGINEERING', 'UNWANTED_SOFTWARE', 'POTENTIALLY_HARMFUL_APPLICATION'],
            platformTypes: ['ANY_PLATFORM'],
            threatEntryTypes: ['URL'],
            threatEntries: [{ url }],
        },
    };
    const controller = new AbortController();
    const timeoutId = setTimeout(() => controller.abort(), 2000);
    try {
        const resp = await fetch(endpoint, {
            method: 'POST',
            headers: { 'Content-Type': 'application/json' },
            body: JSON.stringify(body),
            signal: controller.signal,
        });
        if (!resp.ok)
            return null;
        const data = await resp.json();
        // An absent or empty `matches` field means no threat list matched.
        if (!data.matches || data.matches.length === 0) {
            return { safe: true, threats: [], source: 'google-api' };
        }
        // De-duplicate threat types across matches.
        const threats = [...new Set(data.matches.map((m) => m.threatType))];
        return { safe: false, threats, source: 'google-api' };
    }
    catch {
        // Deliberate best-effort: any failure is reported as "no answer".
        return null;
    }
    finally {
        // Always release the timer, on success, early return, and error alike.
        clearTimeout(timeoutId);
    }
}
156
/**
 * Check URL safety.
 *
 * Flow:
 *   1. If SAFE_BROWSING_API_KEY (or passed apiKey) is set, race Google API vs 2s timeout.
 *      Falls back to local blocklist on timeout or error.
 *   2. Without an API key, use local heuristic blocklist only.
 *
 * @param url     The URL to check
 * @param apiKey  Google Safe Browsing API key (optional). Falls back to SAFE_BROWSING_API_KEY env var.
 * @returns The Google API verdict when it answers in time, otherwise the
 *   local blocklist verdict.
 */
export async function checkUrlSafety(url, apiKey) {
    const key = apiKey ?? process.env.SAFE_BROWSING_API_KEY;
    if (!key) {
        // No API key — local blocklist only
        return checkLocalBlocklist(url);
    }
    // Race the API call against a 2s deadline that resolves to null.
    let deadlineTimer;
    const deadline = new Promise((resolve) => {
        deadlineTimer = setTimeout(() => resolve(null), 2000);
    });
    try {
        const googleResult = await Promise.race([
            checkGoogleSafeBrowsing(url, key),
            deadline,
        ]);
        if (googleResult !== null)
            return googleResult;
    }
    finally {
        // Without this, the losing timer stays pending for the full 2s after
        // every check and keeps the Node.js event loop alive.
        clearTimeout(deadlineTimer);
    }
    // API timed out or errored — use local blocklist result
    return checkLocalBlocklist(url);
}
@@ -1,17 +1,29 @@
1
1
  /**
2
- * Source credibility scoring — lightweight, zero dependencies.
2
+ * Source credibility scoring — lightweight, zero dependencies, no network calls.
3
3
  *
4
- * Classifies URLs by trustworthiness:
5
- * - Official (★★★): .gov, .edu, .mil, WHO, NIH, academic journals
6
- * - Verified (★★): Wikipedia, Reuters, BBC, GitHub, StackOverflow
7
- * - General (★): Everything else
4
+ * Actively investigates domain signals from the URL itself:
5
+ * - TLD trust score
6
+ * - HTTPS enforcement
7
+ * - Domain structure analysis
8
+ * - Brand/platform recognition (500+ known domains)
9
+ * - Content platform detection
10
+ *
11
+ * Score breakdown (0–100):
12
+ * TLD weight 0–20
13
+ * HTTPS 0–10
14
+ * Known domain 0–40
15
+ * Structure 0–15
16
+ * Platform 0–15
8
17
  */
9
18
  export interface SourceCredibility {
10
- tier: 'official' | 'verified' | 'general';
11
- stars: number;
19
+ tier: 'official' | 'established' | 'community' | 'new' | 'suspicious';
20
+ score: number;
12
21
  label: string;
22
+ signals: string[];
23
+ warnings: string[];
13
24
  }
14
25
  /**
15
26
  * Assess the credibility of a source URL.
27
+ * Fully synchronous — no network calls.
16
28
  */
17
29
  export declare function getSourceCredibility(url: string): SourceCredibility;
@@ -1,83 +1,584 @@
1
1
  /**
2
- * Source credibility scoring — lightweight, zero dependencies.
2
+ * Source credibility scoring — lightweight, zero dependencies, no network calls.
3
3
  *
4
- * Classifies URLs by trustworthiness:
5
- * - Official (★★★): .gov, .edu, .mil, WHO, NIH, academic journals
6
- * - Verified (★★): Wikipedia, Reuters, BBC, GitHub, StackOverflow
7
- * - General (★): Everything else
4
+ * Actively investigates domain signals from the URL itself:
5
+ * - TLD trust score
6
+ * - HTTPS enforcement
7
+ * - Domain structure analysis
8
+ * - Brand/platform recognition (500+ known domains)
9
+ * - Content platform detection
10
+ *
11
+ * Score breakdown (0–100):
12
+ * TLD weight 0–20
13
+ * HTTPS 0–10
14
+ * Known domain 0–40
15
+ * Structure 0–15
16
+ * Platform 0–15
8
17
  */
9
- /** Official TLDs and hostnames that indicate high-authority sources */
10
- const OFFICIAL_TLDS = new Set(['.gov', '.edu', '.mil']);
11
- const OFFICIAL_HOSTNAMES = new Set([
12
- // Academic / research
13
- 'arxiv.org', 'scholar.google.com', 'pubmed.ncbi.nlm.nih.gov', 'ncbi.nlm.nih.gov',
14
- 'jstor.org', 'nature.com', 'science.org', 'cell.com', 'nejm.org', 'bmj.com',
15
- 'thelancet.com', 'plos.org', 'springer.com', 'elsevier.com',
18
+ // ---------------------------------------------------------------------------
19
+ // TLD trust map: points (0–20)
20
+ // ---------------------------------------------------------------------------
21
// Points (0–20) awarded for a URL's top-level domain. Keys carry a leading dot.
const TLD_TRUST = {
    // Restricted registries
    '.gov': 20, '.edu': 20, '.mil': 20,
    // Long-standing generic TLDs
    '.org': 14, '.net': 12, '.com': 12, '.io': 11,
    // Country codes
    '.co': 10, '.us': 10, '.uk': 10, '.ca': 10, '.au': 10,
    '.de': 10, '.fr': 10, '.jp': 10, '.br': 10, '.in': 10,
    // Supranational / treaty organisations
    '.eu': 11, '.int': 15,
    // Newer generic TLDs
    '.info': 8, '.biz': 7, '.me': 8, '.tv': 8,
    '.app': 10, '.dev': 10, '.ai': 10,
    '.tech': 8, '.page': 8,
    '.blog': 7, '.news': 8, '.media': 8, '.press': 8,
    '.shop': 7, '.store': 7, '.online': 7,
    '.site': 6, '.website': 6,
    '.space': 5, '.club': 5, '.pro': 7,
    // Free or near-free TLDs with little trust
    '.tk': 1, '.ml': 1, '.ga': 1, '.cf': 1, '.gq': 1,
    '.xyz': 4, '.top': 3, '.loan': 2, '.click': 3, '.link': 4,
    '.win': 2, '.bid': 2, '.download': 2, '.racing': 2, '.review': 4,
    '.cc': 3, '.pw': 3, '.men': 2, '.party': 2, '.stream': 3,
};
// High-risk TLDs commonly used in phishing campaigns.
const SUSPICIOUS_TLDS = new Set([
    '.tk', '.ml', '.ga', '.cf', '.gq',
    '.win', '.bid', '.men', '.party', '.loan',
]);
// TLDs treated as official sources.
const OFFICIAL_TLDS = new Set([
    '.gov',
    '.edu',
    '.mil',
    '.int',
]);
46
+ // ---------------------------------------------------------------------------
47
+ // Official hostnames (beyond .gov/.edu/.mil TLD)
48
+ // ---------------------------------------------------------------------------
49
+ const OFFICIAL_DOMAINS = new Set([
16
50
  // International organisations
17
51
  'who.int', 'un.org', 'worldbank.org', 'imf.org', 'oecd.org', 'europa.eu',
52
+ 'nato.int', 'wto.org', 'unicef.org', 'unhcr.org', 'icrc.org',
53
+ // Academic / research
54
+ 'arxiv.org', 'pubmed.ncbi.nlm.nih.gov', 'ncbi.nlm.nih.gov', 'jstor.org',
55
+ 'nature.com', 'science.org', 'cell.com', 'nejm.org', 'bmj.com',
56
+ 'thelancet.com', 'plos.org', 'springer.com', 'elsevier.com',
57
+ 'scholar.google.com', 'researchgate.net', 'semanticscholar.org',
58
+ 'acm.org', 'ieee.org',
18
59
  // Official tech documentation
19
60
  'docs.python.org', 'developer.mozilla.org', 'nodejs.org', 'rust-lang.org',
20
61
  'docs.microsoft.com', 'learn.microsoft.com', 'developer.apple.com',
21
62
  'developer.android.com', 'php.net', 'ruby-lang.org', 'golang.org', 'go.dev',
22
- // Health / medicine
23
- 'cdc.gov', 'nih.gov', 'fda.gov', 'mayoclinic.org', 'clevelandclinic.org',
24
- 'webmd.com', 'medlineplus.gov',
63
+ // Health
64
+ 'mayoclinic.org', 'clevelandclinic.org', 'webmd.com',
25
65
  // Standards / specs
26
- 'w3.org', 'ietf.org', 'rfc-editor.org', 'iso.org',
66
+ 'w3.org', 'ietf.org', 'rfc-editor.org', 'iso.org', 'ecma-international.org',
67
+ ]);
68
+ // ---------------------------------------------------------------------------
69
+ // Established domains (score bonus 40 pts) — 500+ entries
70
+ // ---------------------------------------------------------------------------
71
// Well-known registrable domains and hostnames. Entries are lower-case and
// must be exact host strings with no whitespace, otherwise they can never
// match a parsed URL host. Some entries repeat across categories; the Set
// collapses them.
const ESTABLISHED_DOMAINS = new Set([
    // ── Major Tech ──
    'google.com', 'apple.com', 'microsoft.com', 'amazon.com', 'meta.com',
    'netflix.com', 'spotify.com', 'adobe.com', 'salesforce.com', 'oracle.com',
    'ibm.com', 'intel.com', 'nvidia.com', 'amd.com', 'qualcomm.com',
    'cisco.com', 'vmware.com', 'sap.com', 'servicenow.com', 'workday.com',
    'zoom.us', 'slack.com', 'dropbox.com', 'box.com', 'atlassian.com',
    'jira.atlassian.com', 'confluence.atlassian.com',
    'twilio.com', 'sendgrid.com', 'mailchimp.com', 'hubspot.com',
    'zendesk.com', 'intercom.com', 'freshworks.com', 'docusign.com',
    'okta.com', 'auth0.com', 'cloudflare.com', 'fastly.com', 'akamai.com',
    'digitalocean.com', 'linode.com', 'vultr.com',
    'datadog.com', 'newrelic.com', 'splunk.com', 'elastic.co',
    'mongodb.com', 'redis.io', 'postgresql.org', 'mysql.com',
    'docker.com', 'kubernetes.io', 'helm.sh',
    'terraform.io', 'ansible.com', 'chef.io', 'puppet.com',
    'heroku.com', 'render.com', 'railway.app', 'fly.io',
    'supabase.com', 'planetscale.com', 'neon.tech', 'fauna.com',
    'firebase.google.com', 'expo.dev',
    'openai.com', 'anthropic.com', 'cohere.com', 'huggingface.co',
    'stability.ai', 'midjourney.com', 'replicate.com',
    'figma.com', 'sketch.com', 'invisionapp.com', 'zeplin.io',
    'notion.so', 'airtable.com', 'monday.com', 'asana.com', 'clickup.com',
    'trello.com', 'basecamp.com', 'linear.app', 'shortcut.com',
    'postman.com', 'insomnia.rest', 'swagger.io',
    'sentry.io', 'bugsnag.com', 'rollbar.com',
    'segment.com', 'mixpanel.com', 'amplitude.com', 'heap.io',
    'looker.com', 'tableau.com', 'powerbi.microsoft.com',
    'snowflake.com', 'databricks.com', 'dbt.com', 'fivetran.com', 'airbyte.com',
    'vercel.com', 'netlify.com',
    // ── Cloud / Hosting ──
    'aws.amazon.com', 'cloud.google.com', 'azure.microsoft.com',
    'docs.aws.amazon.com', 'console.aws.amazon.com',
    // ── Developer Ecosystems ──
    'github.com', 'gitlab.com', 'bitbucket.org', 'sourcehut.com',
    'stackoverflow.com', 'superuser.com', 'serverfault.com',
    'npmjs.com', 'pypi.org', 'crates.io', 'packagist.org', 'rubygems.org',
    'nuget.org', 'pub.dev', 'hex.pm', 'opam.ocaml.org',
    'docs.rs', 'crates.io', 'pkg.go.dev',
    'codepen.io', 'jsfiddle.net', 'replit.com', 'glitch.com', 'codesandbox.io',
    'leetcode.com', 'hackerrank.com', 'codewars.com', 'exercism.org',
    'regex101.com', 'regexr.com',
    // ── Major Social ──
    'twitter.com', 'x.com', 'reddit.com', 'linkedin.com', 'instagram.com',
    'facebook.com', 'youtube.com', 'tiktok.com', 'snapchat.com', 'pinterest.com',
    'tumblr.com', 'mastodon.social', 'threads.net', 'discord.com', 'discord.gg',
    'twitch.tv', 'kick.com', 'vimeo.com', 'dailymotion.com',
    'quora.com', 'medium.com', 'substack.com', 'hashnode.com', 'dev.to',
    // ── Major News ──
    'nytimes.com', 'washingtonpost.com', 'theguardian.com', 'bbc.com', 'bbc.co.uk',
    'reuters.com', 'apnews.com', 'bloomberg.com', 'economist.com', 'ft.com',
    'wsj.com', 'cnn.com', 'foxnews.com', 'msnbc.com', 'nbcnews.com',
    'cbsnews.com', 'abcnews.go.com', 'npr.org', 'pbs.org',
    'time.com', 'usatoday.com', 'huffpost.com', 'vox.com', 'axios.com',
    'politico.com', 'thehill.com', 'rollcall.com', 'slate.com', 'salon.com',
    'theatlantic.com', 'newyorker.com', 'newrepublic.com',
    'motherjones.com', 'propublica.org', 'intercept.co',
    'aljazeera.com', 'dw.com', 'france24.com', 'rt.com',
    'spiegel.de', 'lemonde.fr', 'liberation.fr', 'lefigaro.fr',
    'elpais.com', 'elmundo.es', 'repubblica.it', 'corriere.it',
    'theglobeandmail.com', 'thestar.com', 'nationalpost.com',
    'smh.com.au', 'theage.com.au', 'abc.net.au',
    'timesofindia.com', 'hindustantimes.com', 'thehindu.com', 'ndtv.com',
    'scmp.com', 'channelnewsasia.com', 'straitstimes.com',
    'haaretz.com', 'timesofisrael.com', 'jpost.com',
    'techcrunch.com', 'wired.com', 'arstechnica.com', 'theverge.com',
    'engadget.com', 'gizmodo.com', 'cnet.com', 'pcmag.com', 'tomshardware.com',
    'anandtech.com', 'macrumors.com', '9to5mac.com', '9to5google.com',
    'androidcentral.com', 'windowscentral.com',
    'venturebeat.com', 'businessinsider.com', 'forbes.com', 'fortune.com',
    'inc.com', 'entrepreneur.com', 'fastcompany.com',
    // ── Finance ──
    'chase.com', 'bankofamerica.com', 'wellsfargo.com', 'citibank.com',
    'capitalone.com', 'usbank.com', 'tdbank.com', 'pnc.com',
    'americanexpress.com', 'discover.com', 'synchrony.com',
    'paypal.com', 'stripe.com', 'square.com', 'braintree.com', 'adyen.com',
    'coinbase.com', 'binance.com', 'kraken.com', 'gemini.com', 'crypto.com',
    'robinhood.com', 'etrade.com', 'schwab.com', 'fidelity.com',
    'vanguard.com', 'blackrock.com', 'jpmorgan.com', 'goldmansachs.com',
    'morganstanley.com', 'ubs.com', 'credit-suisse.com', 'hsbc.com',
    'barclays.com', 'lloydsbank.com', 'natwest.com', 'santander.com',
    'transferwise.com', 'wise.com', 'revolut.com', 'monzo.com',
    'quickbooks.intuit.com', 'turbotax.intuit.com', 'mint.com', 'hrblock.com',
    'experian.com', 'equifax.com', 'transunion.com',
    // ── E-commerce / Retail ──
    'amazon.com', 'ebay.com', 'etsy.com', 'walmart.com', 'target.com',
    'bestbuy.com', 'costco.com', 'homedepot.com', 'lowes.com', 'wayfair.com',
    'shopify.com', 'bigcommerce.com', 'woocommerce.com', 'squarespace.com',
    'overstock.com', 'newegg.com', 'bhphotovideo.com', 'adorama.com',
    'aliexpress.com', 'alibaba.com', 'wish.com', 'dhgate.com',
    'zappos.com', 'nordstrom.com', 'macys.com', 'bloomingdales.com', 'gap.com',
    'nike.com', 'adidas.com', 'reebok.com', 'underarmour.com', 'lululemon.com',
    'ikea.com', 'crate.com', 'potterybarn.com', 'williams-sonoma.com',
    'chewy.com', 'petco.com', 'petsmart.com',
    'instacart.com', 'doordash.com', 'ubereats.com', 'grubhub.com',
    'opentable.com', 'yelp.com', 'tripadvisor.com',
    // ── Travel ──
    'booking.com', 'expedia.com', 'airbnb.com', 'vrbo.com', 'kayak.com',
    'hotels.com', 'priceline.com', 'orbitz.com', 'travelocity.com',
    'delta.com', 'united.com', 'aa.com', 'southwest.com', 'jetblue.com',
    'marriott.com', 'hilton.com', 'hyatt.com', 'ihg.com', 'wyndham.com',
    'uber.com', 'lyft.com', 'waymo.com',
    // ── Education ──
    'coursera.org', 'edx.org', 'khanacademy.org', 'udemy.com', 'udacity.com',
    'pluralsight.com', 'lynda.com', 'linkedin.com', 'skillshare.com',
    'codecademy.com', 'freecodecamp.org', 'theodinproject.com',
    'brilliant.org', 'duolingo.com', 'babbel.com', 'rosettastone.com',
    'cambridgeinternational.org',
    'britannica.com', 'encyclopedia.com',
    // ── Reference / Knowledge ──
    'wikipedia.org', 'wikimedia.org', 'wikihow.com', 'wikidata.org',
    'imdb.com', 'rottentomatoes.com', 'metacritic.com', 'goodreads.com',
    'nationalgeographic.com', 'smithsonianmag.com', 'history.com',
    'wolframalpha.com', 'dictionary.com', 'merriam-webster.com',
    'etymonline.com', 'thesaurus.com',
    'archive.org', 'waybackmachine.org',
    // ── Health ──
    'webmd.com', 'mayoclinic.org', 'clevelandclinic.org', 'healthline.com',
    'medicalnewstoday.com', 'everydayhealth.com', 'drugs.com',
    'rxlist.com', 'medscape.com', 'uptodate.com', 'emedicinehealth.com',
    'psych.org', 'nami.org', 'betterhelp.com', 'talkspace.com',
    // ── Legal ──
    'law.cornell.edu', 'justia.com', 'findlaw.com', 'nolo.com', 'avvo.com',
    'legalzoom.com', 'rocketlawyer.com',
    // ── Government / Civic (beyond TLD) ──
    'gov.uk', 'gc.ca', 'australia.gov.au',
    // ── Open Source / Misc Tech ──
    'linux.org', 'kernel.org', 'gnu.org', 'apache.org', 'mozilla.org',
    'python.org', 'perl.org', 'haskell.org',
    'jquery.com', 'reactjs.org', 'react.dev', 'vuejs.org', 'angular.io',
    'svelte.dev', 'nextjs.org', 'nuxtjs.org', 'remix.run', 'astro.build',
    'tailwindcss.com', 'getbootstrap.com', 'mui.com', 'chakra-ui.com',
    'styled-components.com', 'emotion.sh',
    'vitejs.dev', 'webpack.js.org', 'rollupjs.org', 'esbuild.github.io',
    'babeljs.io', 'eslint.org', 'prettier.io', 'typescript.dev',
    'typescriptlang.org', 'deno.com', 'deno.land', 'bun.sh',
    'expressjs.com', 'fastify.io', 'nestjs.com', 'koajs.com', 'hapi.dev',
    'graphql.org', 'apollographql.com', 'trpc.io', 'grpc.io',
    'prisma.io', 'drizzle.team', 'typeorm.io', 'sequelize.org',
    'socket.io', 'feathersjs.com',
    'git-scm.com', 'gitkraken.com',
    'homebrew.sh', 'brew.sh', 'chocolatey.org', 'scoop.sh', 'winget.run',
    'ubuntu.com', 'debian.org', 'fedoraproject.org', 'archlinux.org',
    'redhat.com', 'suse.com', 'centos.org',
    // ── Security / Privacy ──
    'haveibeenpwned.com', 'virustotal.com', '1password.com', 'bitwarden.com',
    'lastpass.com', 'dashlane.com', 'nordvpn.com', 'expressvpn.com',
    'protonmail.com', 'proton.me', 'tutanota.com', 'fastmail.com',
    'letsencrypt.org', 'ssllabs.com', 'namecheap.com', 'godaddy.com',
    'porkbun.com', 'cloudflare.com', 'dnschecker.org',
    // ── Search ──
    'google.com', 'bing.com', 'yahoo.com', 'duckduckgo.com', 'brave.com',
    'startpage.com', 'ecosia.org', 'kagi.com',
    // ── Productivity ──
    'gmail.com', 'outlook.com', 'office.com', 'office365.com',
    'docs.google.com', 'drive.google.com', 'calendar.google.com',
    'maps.google.com', 'translate.google.com',
    'evernote.com', 'onenote.com', 'bear.app', 'obsidian.md',
    'cal.com', 'calendly.com', 'doodle.com', 'when2meet.com',
    'loom.com', 'screen.studio', 'cleanshot.com',
    'canva.com', 'unsplash.com', 'pexels.com', 'pixabay.com',
    'shutterstock.com', 'gettyimages.com', 'istockphoto.com',
    'giphy.com', 'tenor.com',
    // ── Music / Media ──
    'soundcloud.com', 'bandcamp.com', 'last.fm', 'allmusic.com',
    'discogs.com', 'genius.com', 'azlyrics.com', 'musixmatch.com',
    'hulu.com', 'disneyplus.com', 'hbomax.com', 'max.com',
    'peacocktv.com', 'paramount.com', 'crunchyroll.com', 'funimation.com',
    'apple.com', 'music.apple.com',
    // ── Gaming ──
    'steam.com', 'steampowered.com', 'epicgames.com', 'gog.com',
    'itch.io', 'roblox.com', 'minecraft.net', 'ea.com',
    'activision.com', 'blizzard.com', 'battle.net', 'ubisoft.com',
    'nintendo.com', 'playstation.com', 'xbox.com',
    'ign.com', 'gamespot.com', 'kotaku.com', 'polygon.com',
    'pcgamer.com', 'rockpapershotgun.com',
    // ── Science / Research ──
    'nasa.gov', 'esa.int', 'noaa.gov', 'nist.gov', 'usgs.gov',
    'epa.gov', 'energy.gov', 'nsf.gov',
    'acs.org', 'aps.org', 'aip.org', 'ams.org',
    'newsweek.com', 'scientificamerican.com', 'popularmechanics.com',
    'livescience.com', 'space.com', 'phys.org', 'sciencedaily.com',
    'technologyreview.com',
    // ── Mapping / Location ──
    'openstreetmap.org', 'mapbox.com', 'here.com', 'waze.com',
    'zillow.com', 'redfin.com', 'realtor.com', 'trulia.com', 'apartments.com',
    // ── HR / Recruiting ──
    'indeed.com', 'glassdoor.com', 'monster.com', 'ziprecruiter.com',
    'careerbuilder.com', 'simplyhired.com', 'flexjobs.com', 'remote.com',
    'levels.fyi', 'teamblind.com', 'angellist.com', 'wellfound.com',
    // ── Misc established ──
    'hbr.org', 'mckinsey.com', 'bcg.com', 'bain.com', 'deloitte.com',
    'pwc.com', 'kpmg.com', 'ey.com', 'accenture.com',
    'gartner.com', 'idc.com', 'forrester.com',
    'ted.com', 'masterclass.com',
    'change.org', 'gofundme.com', 'kickstarter.com', 'indiegogo.com',
    'patreon.com', 'ko-fi.com', 'buymeacoffee.com',
    'webpeel.dev',
]);
28
- const VERIFIED_HOSTNAMES = new Set([
29
- // Encyclopaedia / reference
30
- 'wikipedia.org', 'en.wikipedia.org', 'britannica.com',
31
- // Reputable news agencies
32
- 'reuters.com', 'apnews.com', 'bbc.com', 'bbc.co.uk', 'nytimes.com',
33
- 'washingtonpost.com', 'theguardian.com', 'economist.com', 'ft.com',
34
- 'cnn.com', 'npr.org', 'pbs.org',
35
- // Developer resources
36
- 'github.com', 'stackoverflow.com', 'npmjs.com', 'pypi.org',
37
- 'crates.io', 'docs.rs', 'packagist.org', 'rubygems.org',
38
- // Official cloud / vendor docs
39
- 'docs.aws.amazon.com', 'cloud.google.com', 'docs.github.com',
40
- 'azure.microsoft.com', 'registry.terraform.io',
41
- // Reputable tech publications
42
- 'arstechnica.com', 'wired.com', 'techcrunch.com', 'theverge.com',
43
- // National Geographic, Smithsonian
44
- 'nationalgeographic.com', 'smithsonianmag.com',
270
+ // ---------------------------------------------------------------------------
271
+ // Community / content platforms — user content hosted on established infra. Maps a platform domain to the human-readable label reported for pages hosted there; getSourceCredibility looks up the hostname first and then extractSLD(hostname), and a hit earns the 15-point platform bonus instead of the 40-point established-domain bonus.
272
+ // ---------------------------------------------------------------------------
273
+ const COMMUNITY_PLATFORMS = new Map([
274
+ ['github.com', 'Community Content on GitHub'],
275
+ ['github.io', 'Personal Site on GitHub Pages'],
276
+ ['gitlab.com', 'Community Content on GitLab'],
277
+ ['medium.com', 'Article on Medium'],
278
+ ['substack.com', 'Newsletter on Substack'],
279
+ ['hashnode.com', 'Blog on Hashnode'],
280
+ ['dev.to', 'Article on DEV Community'],
281
+ ['wordpress.com', 'Blog on WordPress'],
282
+ ['blogspot.com', 'Blog on Blogger'],
283
+ ['blogger.com', 'Blog on Blogger'],
284
+ ['tumblr.com', 'Blog on Tumblr'],
285
+ ['weebly.com', 'Site on Weebly'],
286
+ ['wix.com', 'Site on Wix'],
287
+ ['squarespace.com', 'Site on Squarespace'],
288
+ ['webflow.io', 'Site on Webflow'],
289
+ ['vercel.app', 'Deployed Project on Vercel'],
290
+ ['netlify.app', 'Deployed Project on Netlify'],
291
+ ['pages.dev', 'Deployed Project on Cloudflare Pages'],
292
+ ['web.app', 'Firebase Hosted App'],
293
+ ['firebaseapp.com', 'Firebase Hosted App'],
294
+ ['herokuapp.com', 'App on Heroku'],
295
+ ['replit.dev', 'Project on Replit'],
296
+ ['glitch.me', 'Project on Glitch'],
297
+ ['codesandbox.io', 'Sandbox on CodeSandbox'],
298
+ ['stackblitz.com', 'Project on StackBlitz'],
299
+ ['codepen.io', 'Pen on CodePen'],
300
+ ['jsfiddle.net', 'Fiddle on JSFiddle'],
301
+ ['notion.site', 'Notion Page'],
302
+ ['gitbook.io', 'Docs on GitBook'],
303
+ ['gitbook.com', 'Docs on GitBook'],
304
+ ['readthedocs.io', 'Docs on Read the Docs'],
305
+ ['readthedocs.org', 'Docs on Read the Docs'],
306
+ ['reddit.com', 'Community Discussion on Reddit'],
307
+ ['news.ycombinator.com', 'Discussion on Hacker News'],
308
+ ['quora.com', 'Answer on Quora'],
309
+ ['stackoverflow.com', 'Answer on Stack Overflow'],
310
+ ['stackexchange.com', 'Answer on Stack Exchange'],
311
+ ['producthunt.com', 'Launch on Product Hunt'],
312
+ ['indiehackers.com', 'Post on Indie Hackers'],
313
+ ['hackernoon.com', 'Article on HackerNoon'],
314
+ ['lobste.rs', 'Discussion on Lobsters'],
315
+ ['lobsters.rs', 'Discussion on Lobsters'],
316
+ ['twitter.com', 'Post on X (Twitter)'],
317
+ ['x.com', 'Post on X (Twitter)'],
318
+ ['linkedin.com', 'Post on LinkedIn'],
319
+ ['youtube.com', 'Video on YouTube'],
320
+ ['vimeo.com', 'Video on Vimeo'],
321
+ ['twitch.tv', 'Stream on Twitch'],
322
+ ['soundcloud.com', 'Audio on SoundCloud'],
323
+ ['bandcamp.com', 'Music on Bandcamp'],
324
+ ['pinterest.com', 'Pin on Pinterest'],
325
+ ['instagram.com', 'Post on Instagram'],
326
+ ['tiktok.com', 'Video on TikTok'],
45
327
  ]);
328
+ // ---------------------------------------------------------------------------
329
+ // Brand-category labels for established domains. Keyed by domain; getSourceCredibility tries the exact hostname and then extractSLD(hostname), and uses the match as the result label for official and established domains (a generic label is used when there is no entry).
330
+ // ---------------------------------------------------------------------------
331
+ const DOMAIN_CATEGORY = {
332
+ // Tech
333
+ 'google.com': 'Established Technology Company',
334
+ 'apple.com': 'Established Technology Company',
335
+ 'microsoft.com': 'Established Technology Company',
336
+ 'amazon.com': 'Established E-commerce & Cloud Platform',
337
+ 'meta.com': 'Established Technology Company',
338
+ 'netflix.com': 'Established Streaming Service',
339
+ 'spotify.com': 'Established Music Streaming Service',
340
+ 'openai.com': 'Established AI Research Company',
341
+ 'anthropic.com': 'Established AI Research Company',
342
+ 'github.com': 'Established Developer Platform',
343
+ 'gitlab.com': 'Established Developer Platform',
344
+ 'stackoverflow.com': 'Established Developer Q&A Platform',
345
+ 'npmjs.com': 'Established Package Registry',
346
+ 'pypi.org': 'Established Package Registry',
347
+ 'docker.com': 'Established Container Platform',
348
+ 'vercel.com': 'Established Hosting Platform',
349
+ 'netlify.com': 'Established Hosting Platform',
350
+ 'cloudflare.com': 'Established CDN & Security Provider',
351
+ 'figma.com': 'Established Design Platform',
352
+ 'notion.so': 'Established Productivity Platform',
353
+ 'slack.com': 'Established Business Communication Platform',
354
+ 'zoom.us': 'Established Video Communication Platform',
355
+ 'adobe.com': 'Established Creative Software Company',
356
+ // News
357
+ 'nytimes.com': 'Established News Organization',
358
+ 'washingtonpost.com': 'Established News Organization',
359
+ 'theguardian.com': 'Established News Organization',
360
+ 'bbc.com': 'Established News Organization',
361
+ 'bbc.co.uk': 'Established News Organization',
362
+ 'reuters.com': 'Established News Agency',
363
+ 'apnews.com': 'Established News Agency',
364
+ 'bloomberg.com': 'Established Financial News Organization',
365
+ 'economist.com': 'Established News Publication',
366
+ 'ft.com': 'Established Financial News Organization',
367
+ 'wsj.com': 'Established Financial News Organization',
368
+ 'cnn.com': 'Established News Organization',
369
+ 'npr.org': 'Established Public Radio',
370
+ 'techcrunch.com': 'Established Technology News Publication',
371
+ 'wired.com': 'Established Technology News Publication',
372
+ 'arstechnica.com': 'Established Technology News Publication',
373
+ 'theverge.com': 'Established Technology News Publication',
374
+ // Finance
375
+ 'paypal.com': 'Established Payment Platform',
376
+ 'stripe.com': 'Established Payment Platform',
377
+ 'square.com': 'Established Payment Platform',
378
+ 'coinbase.com': 'Established Cryptocurrency Exchange',
379
+ 'chase.com': 'Established Financial Institution',
380
+ 'bankofamerica.com': 'Established Financial Institution',
381
+ 'wellsfargo.com': 'Established Financial Institution',
382
+ // E-commerce
383
+ 'ebay.com': 'Established E-commerce Marketplace',
384
+ 'etsy.com': 'Established Handmade Marketplace',
385
+ 'walmart.com': 'Established Retail Company',
386
+ 'target.com': 'Established Retail Company',
387
+ 'bestbuy.com': 'Established Electronics Retailer',
388
+ 'shopify.com': 'Established E-commerce Platform',
389
+ // Education
390
+ 'coursera.org': 'Established Online Education Platform',
391
+ 'edx.org': 'Established Online Education Platform',
392
+ 'khanacademy.org': 'Non-Profit Education Platform',
393
+ 'udemy.com': 'Established Online Learning Marketplace',
394
+ 'britannica.com': 'Established Reference Encyclopedia',
395
+ 'wikipedia.org': 'Open Encyclopedia (Community Edited)',
396
+ // Reference
397
+ 'archive.org': 'Established Digital Archive',
398
+ 'wolframalpha.com': 'Established Computational Knowledge Engine',
399
+ 'imdb.com': 'Established Movie & TV Database',
400
+ };
401
+ // ---------------------------------------------------------------------------
402
+ // Helpers
403
+ // ---------------------------------------------------------------------------
404
/**
 * Return the top-level domain of a hostname, including the leading dot
 * (e.g. "docs.example.com" → ".com").
 *
 * A trailing root dot ("nasa.gov.") is ignored so fully-qualified names map to
 * the same TLD as their usual spelling. Single-label hosts ("localhost") and
 * IPv4 literals ("192.168.1.1") have no TLD and yield an empty string.
 *
 * @param {string} hostname Lower-cased hostname without scheme, port or path.
 * @returns {string} The TLD with its leading dot, or '' when there is none.
 */
function extractTLD(hostname) {
    // "example.com." is the fully-qualified spelling of "example.com".
    const labels = hostname.replace(/\.+$/, '').split('.');
    if (labels.length < 2) {
        return '';
    }
    const lastLabel = labels[labels.length - 1];
    // An all-digit final label means an IPv4 address, not a domain name.
    if (lastLabel === '' || /^\d+$/.test(lastLabel)) {
        return '';
    }
    return '.' + lastLabel;
}
410
/**
 * Second-level labels that registries of two-letter country-code TLDs commonly
 * operate as public suffixes ("co.uk", "com.au", "or.jp", …).
 *
 * This is a small heuristic stand-in for the full Public Suffix List: it keeps
 * the module dependency-free while covering the most common multi-label
 * suffixes.
 */
const CC_SECOND_LEVEL_LABELS = new Set([
    'ac', 'co', 'com', 'edu', 'gov', 'mil', 'net', 'org', 'or', 'ne', 'go',
]);
/**
 * Return the registrable domain of a hostname (e.g. "mail.google.com" →
 * "google.com", "news.bbc.co.uk" → "bbc.co.uk").
 *
 * Normally this is the last two labels. When the hostname ends in a
 * two-letter country code preceded by a well-known registry label
 * (see CC_SECOND_LEVEL_LABELS) the last three labels are returned instead,
 * so that "co.uk"-style suffixes are not mistaken for a site's own domain.
 * Single-label hosts and IPv4 literals are returned unchanged (minus any
 * trailing root dot).
 *
 * @param {string} hostname Lower-cased hostname.
 * @returns {string} The registrable domain.
 */
function extractSLD(hostname) {
    const host = hostname.replace(/\.+$/, '');
    const labels = host.split('.');
    if (labels.length < 2) {
        return host;
    }
    const tld = labels[labels.length - 1];
    // IPv4 literal: there is no domain hierarchy to cut.
    if (/^\d+$/.test(tld)) {
        return host;
    }
    const secondLevel = labels[labels.length - 2];
    const hasCountrySuffix = labels.length >= 3
        && tld.length === 2
        && CC_SECOND_LEVEL_LABELS.has(secondLevel);
    return labels.slice(hasCountrySuffix ? -3 : -2).join('.');
}
/**
 * Count the subdomain labels in front of the registrable domain.
 *
 * A leading "www." does not count (www.example.com → 0), and neither do the
 * labels that belong to the registrable domain itself, so "news.bbc.co.uk"
 * has one subdomain and an IPv4 literal has none.
 *
 * @param {string} hostname Lower-cased hostname.
 * @returns {number} Number of subdomain labels (never negative).
 */
function countSubdomains(hostname) {
    const host = hostname.replace(/\.+$/, '').replace(/^www\./, '');
    const totalLabels = host.split('.').length;
    const registrableLabels = extractSLD(host).split('.').length;
    return Math.max(0, totalLabels - registrableLabels);
}
423
// ---------------------------------------------------------------------------
// Main export
// ---------------------------------------------------------------------------
/** Brand names that phishing hostnames commonly imitate. */
const IMPERSONATED_BRANDS = 'paypal|apple|google|microsoft|amazon';
/** Words that are suspicious when they start a hyphenated name ("secure-…"). */
const LEADING_BAIT_WORDS = `${IMPERSONATED_BRANDS}|bank|login|signin|secure|verify|account|update|support|helpdesk`;
/** Words that are suspicious when they end a hyphenated name ("…-login"). */
const TRAILING_BAIT_WORDS = 'login|signin|secure|verify|account';
/**
 * Impersonation keywords, matched only on label / hyphen boundaries.
 *
 * A plain substring search flags innocent hosts such as "pineapple.com"
 * (contains "apple.") or "grapple-hook.com" (contains "apple-"). Anchoring each
 * keyword to the start of a label or to a hyphen keeps the intended hits
 * ("paypal.evil.com", "secure-login.example.com", "my-account.example.com")
 * without those false positives. Built once at module load instead of per call.
 */
const IMPERSONATION_PATTERN = new RegExp([
    `(?:^|[.-])(?:${LEADING_BAIT_WORDS})-`,
    `-(?:${TRAILING_BAIT_WORDS})(?:[.-]|$)`,
    `(?:^|[.-])(?:${IMPERSONATED_BRANDS})\\.`,
].join('|'));
/**
 * Assess the credibility of a source URL.
 * Fully synchronous — no network calls.
 *
 * The score is built from: HTTPS (10), TLD trust (looked up in TLD_TRUST,
 * 5 when the TLD is unknown), domain structure (15 / 10 / 5 / 0 for
 * 0 / 1 / 2 / 3+ subdomains) and either the community-platform bonus (15) or
 * the established-domain bonus (40). Official TLDs and official domains
 * short-circuit to the 'official' tier. A suspicious TLD caps the score at 15
 * and an impersonation keyword caps it at 19. Tier thresholds: ≥90 official,
 * ≥60 established, ≥40 community, ≥20 new, otherwise suspicious.
 *
 * @param {string} url Absolute URL to assess.
 * @returns {{ tier: string, score: number, label: string, signals: string[], warnings: string[] }}
 *   The tier, the 0–100 score, a human-readable label, and the positive
 *   signals / warnings collected on the way. URLs that cannot be parsed or
 *   that carry no hostname yield tier 'suspicious' with score 0.
 */
export function getSourceCredibility(url) {
    const signals = [];
    const warnings = [];
    let score = 0;
    // ── Parse URL ─────────────────────────────────────────────────────────────
    let parsedUrl;
    try {
        parsedUrl = new URL(url);
    }
    catch {
        return {
            tier: 'suspicious',
            score: 0,
            label: 'Invalid URL — Cannot Assess',
            signals: [],
            warnings: ['URL could not be parsed'],
        };
    }
    const protocol = parsedUrl.protocol; // 'https:' or 'http:'
    // Drop the root dot of a fully-qualified name ("nasa.gov.") so the TLD and
    // domain-set lookups below see the usual spelling.
    const rawHostname = parsedUrl.hostname.toLowerCase().replace(/\.$/, '');
    if (rawHostname === '') {
        // e.g. "mailto:…" or "file:///…" — there is no web source to assess.
        return {
            tier: 'suspicious',
            score: 0,
            label: 'Invalid URL — Cannot Assess',
            signals: [],
            warnings: ['URL has no hostname'],
        };
    }
    const hostname = rawHostname.replace(/^www\./, '');
    const tld = extractTLD(hostname);
    const sld = extractSLD(hostname); // e.g. "google.com"
    const subdomainCount = countSubdomains(rawHostname);
    // ── 1. HTTPS check (0–10 pts) ─────────────────────────────────────────────
    if (protocol === 'https:') {
        score += 10;
        signals.push('HTTPS enforced');
    }
    else {
        warnings.push('HTTP only — no encryption');
    }
    // ── 2. TLD trust (0–20 pts) ───────────────────────────────────────────────
    const tldScore = TLD_TRUST[tld] ?? 5;
    score += tldScore;
    if (tldScore >= 18) {
        signals.push(`Trusted TLD (${tld})`);
    }
    else if (tldScore <= 3) {
        warnings.push(`High-risk TLD (${tld}) — commonly used in phishing`);
    }
    // ── 3. Official TLD shortcut ──────────────────────────────────────────────
    if (OFFICIAL_TLDS.has(tld) || OFFICIAL_DOMAINS.has(hostname) || OFFICIAL_DOMAINS.has(sld)) {
        const category = DOMAIN_CATEGORY[hostname] ?? DOMAIN_CATEGORY[sld] ?? 'Official Source';
        return {
            tier: 'official',
            score: Math.min(100, score + 40 + 15),
            label: tld === '.gov' ? 'Official Government Source' :
                tld === '.edu' ? 'Official Educational Institution' :
                    tld === '.mil' ? 'Official Military Source' :
                        tld === '.int' ? 'International Organization' :
                            category,
            // Set keeps first occurrences in order, so a "Trusted TLD" signal
            // already pushed in step 2 is not listed twice.
            signals: [...new Set([...signals, 'Official domain verified', `Trusted TLD (${tld})`])],
            warnings,
        };
    }
    // ── 4. Domain structure (0–15 pts) ────────────────────────────────────────
    if (subdomainCount === 0) {
        score += 15;
        signals.push('Clean domain structure');
    }
    else if (subdomainCount === 1) {
        score += 10;
        signals.push('Standard subdomain structure');
    }
    else if (subdomainCount === 2) {
        score += 5;
    }
    else {
        // 3+ subdomains — possible phishing pattern, no points awarded
        warnings.push(`Excessive subdomains (${subdomainCount}) — potential phishing indicator`);
    }
    // ── 5 & 6. Known domain + Community platform (mutually exclusive bonus) ──
    // Community platform detection — user content on a known hosting platform.
    // When the domain is a community platform, it gets the platform bonus (15 pts)
    // but NOT the established domain bonus (they're conceptually different tiers).
    const communityLabel = COMMUNITY_PLATFORMS.get(hostname) ?? COMMUNITY_PLATFORMS.get(sld);
    const isEstablished = ESTABLISHED_DOMAINS.has(hostname) || ESTABLISHED_DOMAINS.has(sld);
    if (communityLabel) {
        // Platform bonus only — user content hosted on verified infra
        score += 15;
        signals.push(`Hosted on verified platform (${sld})`);
    }
    else if (isEstablished) {
        // Full established domain bonus
        score += 40;
        signals.push('Recognized established domain');
    }
    // ── 7. Suspicious TLD ─────────────────────────────────────────────────────
    if (SUSPICIOUS_TLDS.has(tld)) {
        score = Math.min(score, 15); // Cap at suspicious tier
        warnings.push('Domain uses a free TLD associated with fraud');
    }
    // ── 8. Phishing keyword detection ─────────────────────────────────────────
    // Known-good domains are exempt; everything else is checked for brand /
    // bait words on label or hyphen boundaries.
    const suspiciousPattern = !isEstablished && !communityLabel && IMPERSONATION_PATTERN.test(hostname);
    if (suspiciousPattern) {
        score = Math.min(score, 19);
        warnings.push('Domain contains impersonation keywords — potential phishing');
    }
    // ── Clamp score ───────────────────────────────────────────────────────────
    score = Math.max(0, Math.min(100, score));
    // ── Tier assignment ───────────────────────────────────────────────────────
    let tier;
    if (score >= 90)
        tier = 'official';
    else if (score >= 60)
        tier = 'established';
    else if (score >= 40)
        tier = 'community';
    else if (score >= 20)
        tier = 'new';
    else
        tier = 'suspicious';
    // ── Label generation ──────────────────────────────────────────────────────
    let label;
    if (communityLabel) {
        label = communityLabel;
    }
    else if (isEstablished) {
        label = DOMAIN_CATEGORY[hostname] ?? DOMAIN_CATEGORY[sld] ?? labelFromTier(tier, hostname, tld);
    }
    else {
        label = labelFromTier(tier, hostname, tld);
    }
    return { tier, score, label, signals, warnings };
}
561
// ---------------------------------------------------------------------------
// Generate a useful fallback label based on tier + domain context
// ---------------------------------------------------------------------------
/**
 * Build the generic label used when a domain has no dedicated category or
 * platform label.
 *
 * @param {string} tier Credibility tier ('official', 'established',
 *   'community', 'new' or 'suspicious').
 * @param {string} _hostname Unused; kept so the call signature stays stable.
 * @param {string} tld Top-level domain including its leading dot.
 * @returns {string} Human-readable label for the tier.
 */
function labelFromTier(tier, _hostname, tld) {
    if (tier === 'official') {
        return 'Official Source';
    }
    if (tier === 'established') {
        // Established sites get a slightly more specific label for a few TLDs.
        if (tld === '.org') {
            return 'Established Organization';
        }
        if (tld === '.net') {
            return 'Established Network Service';
        }
        if (tld === '.io') {
            return 'Established Tech Service';
        }
        return 'Established Website';
    }
    if (tier === 'community') {
        return 'Community or Independent Website';
    }
    if (tier === 'new') {
        return 'Small or Recently Established Website';
    }
    if (tier === 'suspicious') {
        if (SUSPICIOUS_TLDS.has(tld)) {
            return `Free Domain TLD (${tld}) — Exercise Caution`;
        }
        return 'Unrecognized Domain — Exercise Caution';
    }
    // Any tier value outside the five known ones.
    return 'Unknown Domain — Limited Verification Available';
}
package/dist/index.d.ts CHANGED
@@ -42,6 +42,9 @@ export type SearchFallbackResult = {
42
42
  };
43
43
  export declare function searchFallback(..._args: any[]): Promise<SearchFallbackResult | null>;
44
44
  export { peelTLSFetch, isPeelTLSAvailable, shutdownPeelTLS, type PeelTLSOptions, type PeelTLSResult } from './core/peel-tls.js';
45
+ export { sanitizeForLLM, type SanitizeResult } from './core/prompt-guard.js';
46
+ export { getSourceCredibility, type SourceCredibility } from './core/source-credibility.js';
47
+ export { checkUrlSafety, type SafeBrowsingResult } from './core/safe-browsing.js';
45
48
  /**
46
49
  * Fetch and extract content from a URL
47
50
  *
package/dist/index.js CHANGED
@@ -5,6 +5,7 @@
5
5
  */
6
6
  import { cleanup, warmup, closePool, scrollAndWait, closeProfileBrowser } from './core/fetcher.js';
7
7
  import { createContext, normalizeOptions, handleYouTube, fetchContent, detectContentType, parseContent, postProcess, finalize, buildResult, } from './core/pipeline.js';
8
+ import { checkUrlSafety } from './core/safe-browsing.js';
8
9
  export * from './types.js';
9
10
  export { getDomainExtractor, extractDomainData } from './core/domain-extractors.js';
10
11
  export { crawl } from './core/crawler.js';
@@ -47,6 +48,9 @@ export async function searchFallback(..._args) {
47
48
  }
48
49
  }
49
50
  export { peelTLSFetch, isPeelTLSAvailable, shutdownPeelTLS } from './core/peel-tls.js';
51
+ export { sanitizeForLLM } from './core/prompt-guard.js';
52
+ export { getSourceCredibility } from './core/source-credibility.js';
53
+ export { checkUrlSafety } from './core/safe-browsing.js';
50
54
  /**
51
55
  * Fetch and extract content from a URL
52
56
  *
@@ -66,16 +70,34 @@ export { peelTLSFetch, isPeelTLSAvailable, shutdownPeelTLS } from './core/peel-t
66
70
  export async function peel(url, options = {}) {
67
71
  const ctx = createContext(url, options);
68
72
  normalizeOptions(ctx);
73
+ // Safe Browsing check — awaited before the page itself is fetched; advisory only (an unsafe verdict adds a warning, it does not abort the request)
74
+ const sbResult = await checkUrlSafety(url, process.env.SAFE_BROWSING_API_KEY);
75
+ ctx.safeBrowsingResult = sbResult;
76
+ if (!sbResult.safe) {
77
+ const threatList = sbResult.threats.join(', ');
78
+ ctx.warnings.push(`⚠️ URL flagged by Safe Browsing: ${threatList}`);
79
+ }
69
80
  const ytResult = await handleYouTube(ctx);
70
- if (ytResult)
71
- return ytResult;
81
+ if (ytResult) {
82
+ // Attach safe browsing to YouTube results too
83
+ return {
84
+ ...ytResult,
85
+ safeBrowsing: sbResult,
86
+ ...(ytResult.warnings || ctx.warnings.length > 0
87
+ ? { warnings: [...(ytResult.warnings ?? []), ...ctx.warnings.filter(w => !ytResult.warnings?.includes(w))] }
88
+ : {}),
89
+ };
90
+ }
72
91
  try {
73
92
  await fetchContent(ctx);
74
93
  detectContentType(ctx);
75
94
  await parseContent(ctx);
76
95
  await postProcess(ctx);
77
96
  await finalize(ctx);
78
- return buildResult(ctx);
97
+ const result = buildResult(ctx);
98
+ // Attach safe browsing result
99
+ result.safeBrowsing = sbResult;
100
+ return result;
79
101
  }
80
102
  catch (error) {
81
103
  // Clean up browser resources on error
@@ -212,16 +212,16 @@ export function createSearchRouter(authStore) {
212
212
  }
213
213
  }
214
214
  // Add credibility scores and sort by trustworthiness
215
- const tierOrder = { official: 0, verified: 1, general: 2 };
215
+ const tierOrder = { official: 0, established: 1, community: 2, new: 3, suspicious: 4 };
216
216
  results = results
217
217
  .map(r => {
218
218
  const cred = getSourceCredibility(r.url);
219
219
  return { ...r, credibility: cred };
220
220
  })
221
221
  .sort((a, b) => {
222
- const aTier = tierOrder[a.credibility?.tier || 'general'] ?? 2;
223
- const bTier = tierOrder[b.credibility?.tier || 'general'] ?? 2;
224
- return aTier - bTier; // Official first, then verified, then general
222
+ const aTier = tierOrder[a.credibility?.tier || 'new'] ?? 3;
223
+ const bTier = tierOrder[b.credibility?.tier || 'new'] ?? 3;
224
+ return aTier - bTier; // Official first, then established, community, new, suspicious
225
225
  })
226
226
  .map((r, i) => ({ ...r, rank: i + 1 }));
227
227
  data.web = results;
package/dist/types.d.ts CHANGED
@@ -339,6 +339,28 @@ export interface PeelResult {
339
339
  rawTokenEstimate?: number;
340
340
  /** Token savings percentage compared to raw HTML (how much cheaper WebPeel is) */
341
341
  tokenSavingsPercent?: number;
342
+ /** Trust & safety assessment of the fetched content */
343
+ trust?: {
344
+ /** Source credibility tier */
345
+ source: {
346
+ tier: 'official' | 'established' | 'community' | 'new' | 'suspicious';
347
+ score: number;
348
+ label: string;
349
+ signals?: string[];
350
+ warnings?: string[];
351
+ };
352
+ /** Prompt injection scan result */
353
+ contentSafety: {
354
+ clean: boolean;
355
+ injectionDetected: boolean;
356
+ detectedPatterns: string[];
357
+ strippedCount: number;
358
+ };
359
+ /** Overall trust score 0-1 (composite of source + content safety) */
360
+ score: number;
361
+ /** Human-readable safety warnings */
362
+ warnings: string[];
363
+ };
342
364
  /** Content chunks (when chunk option is enabled) */
343
365
  chunks?: Array<{
344
366
  index: number;
@@ -350,6 +372,12 @@ export interface PeelResult {
350
372
  startOffset: number;
351
373
  endOffset: number;
352
374
  }>;
375
+ /** Safe Browsing check result */
376
+ safeBrowsing?: {
377
+ safe: boolean;
378
+ threats: string[];
379
+ source: 'google-api' | 'local-blocklist' | 'unchecked';
380
+ };
353
381
  }
354
382
  export interface PageMetadata {
355
383
  /** Meta description */
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "webpeel",
3
- "version": "0.21.58",
3
+ "version": "0.21.60",
4
4
  "description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
5
5
  "author": "Jake Liu",
6
6
  "license": "AGPL-3.0-only",