webpeel 0.21.58 → 0.21.60
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/core/pipeline.d.ts +3 -0
- package/dist/core/pipeline.js +47 -0
- package/dist/core/safe-browsing.d.ts +22 -0
- package/dist/core/safe-browsing.js +183 -0
- package/dist/core/source-credibility.d.ts +19 -7
- package/dist/core/source-credibility.js +563 -62
- package/dist/index.d.ts +3 -0
- package/dist/index.js +25 -3
- package/dist/server/routes/search.js +4 -4
- package/dist/types.d.ts +28 -0
- package/package.json +1 -1
package/dist/core/pipeline.d.ts
CHANGED
|
@@ -13,6 +13,7 @@ import type { PeelOptions, PeelResult, ImageInfo } from '../types.js';
|
|
|
13
13
|
import type { BrandingProfile } from './branding.js';
|
|
14
14
|
import type { ChangeResult } from './change-tracking.js';
|
|
15
15
|
import type { DesignAnalysis } from './design-analysis.js';
|
|
16
|
+
import type { SafeBrowsingResult } from './safe-browsing.js';
|
|
16
17
|
/** Mutable context threaded through pipeline stages */
|
|
17
18
|
export interface PipelineContext {
|
|
18
19
|
url: string;
|
|
@@ -81,6 +82,8 @@ export interface PipelineContext {
|
|
|
81
82
|
warnings: string[];
|
|
82
83
|
/** Raw HTML size in characters (measured from fetched content before any conversion) */
|
|
83
84
|
rawHtmlSize?: number;
|
|
85
|
+
/** Safe Browsing check result (set early in pipeline, before fetch) */
|
|
86
|
+
safeBrowsingResult?: SafeBrowsingResult;
|
|
84
87
|
}
|
|
85
88
|
/** Create the initial PipelineContext with defaults */
|
|
86
89
|
export declare function createContext(url: string, options: PeelOptions): PipelineContext;
|
package/dist/core/pipeline.js
CHANGED
|
@@ -20,6 +20,8 @@ import { quickAnswer as runQuickAnswer } from './quick-answer.js';
|
|
|
20
20
|
import { Timer } from './timing.js';
|
|
21
21
|
import { chunkContent } from './chunker.js';
|
|
22
22
|
import { BlockedError } from '../types.js';
|
|
23
|
+
import { sanitizeForLLM } from './prompt-guard.js';
|
|
24
|
+
import { getSourceCredibility } from './source-credibility.js';
|
|
23
25
|
import { createLogger } from './logger.js';
|
|
24
26
|
const log = createLogger('pipeline');
|
|
25
27
|
/** Create the initial PipelineContext with defaults */
|
|
@@ -1245,6 +1247,50 @@ export async function finalize(ctx) {
|
|
|
1245
1247
|
export function buildResult(ctx) {
|
|
1246
1248
|
const fetchResult = ctx.fetchResult;
|
|
1247
1249
|
const elapsed = Date.now() - ctx.startTime;
|
|
1250
|
+
// --- Trust & Safety ---
|
|
1251
|
+
// Run prompt injection scan on final content
|
|
1252
|
+
const sanitizeResult = sanitizeForLLM(ctx.content);
|
|
1253
|
+
// If injection was detected, use the cleaned content
|
|
1254
|
+
if (sanitizeResult.injectionDetected) {
|
|
1255
|
+
ctx.content = sanitizeResult.content;
|
|
1256
|
+
ctx.warnings.push('Prompt injection patterns detected and stripped from content.');
|
|
1257
|
+
}
|
|
1258
|
+
// Assess source credibility
|
|
1259
|
+
const credibility = getSourceCredibility(ctx.url);
|
|
1260
|
+
// Compute composite trust score from source credibility (0-100) + content safety
|
|
1261
|
+
let trustScore = credibility.score / 100; // normalize 0-100 → 0-1
|
|
1262
|
+
if (sanitizeResult.injectionDetected)
|
|
1263
|
+
trustScore -= 0.3;
|
|
1264
|
+
if ((ctx.quality ?? 1.0) < 0.5)
|
|
1265
|
+
trustScore -= 0.1;
|
|
1266
|
+
trustScore = Math.round(Math.max(0, Math.min(1, trustScore)) * 100) / 100;
|
|
1267
|
+
// Build trust warnings
|
|
1268
|
+
const trustWarnings = [...(credibility.warnings ?? [])];
|
|
1269
|
+
if (credibility.tier === 'new')
|
|
1270
|
+
trustWarnings.push('Domain has limited verifiable presence — exercise caution.');
|
|
1271
|
+
if (credibility.tier === 'suspicious')
|
|
1272
|
+
trustWarnings.push('Domain shows suspicious signals — treat content with caution.');
|
|
1273
|
+
if (sanitizeResult.injectionDetected)
|
|
1274
|
+
trustWarnings.push(`Prompt injection detected: ${sanitizeResult.detectedPatterns.join(', ')}`);
|
|
1275
|
+
if (sanitizeResult.strippedChars > 0)
|
|
1276
|
+
trustWarnings.push(`Stripped ${sanitizeResult.strippedChars} suspicious characters (zero-width/Unicode smuggling).`);
|
|
1277
|
+
const trust = {
|
|
1278
|
+
source: {
|
|
1279
|
+
tier: credibility.tier,
|
|
1280
|
+
score: credibility.score,
|
|
1281
|
+
label: credibility.label,
|
|
1282
|
+
signals: credibility.signals,
|
|
1283
|
+
warnings: credibility.warnings,
|
|
1284
|
+
},
|
|
1285
|
+
contentSafety: {
|
|
1286
|
+
clean: !sanitizeResult.injectionDetected,
|
|
1287
|
+
injectionDetected: sanitizeResult.injectionDetected,
|
|
1288
|
+
detectedPatterns: sanitizeResult.detectedPatterns,
|
|
1289
|
+
strippedCount: sanitizeResult.strippedChars,
|
|
1290
|
+
},
|
|
1291
|
+
score: trustScore,
|
|
1292
|
+
warnings: trustWarnings,
|
|
1293
|
+
};
|
|
1248
1294
|
const tokens = estimateTokens(ctx.content);
|
|
1249
1295
|
const fingerprint = createHash('sha256').update(ctx.content).digest('hex').slice(0, 16);
|
|
1250
1296
|
// Token savings metrics — only when raw HTML size was captured (from actual fetch or domain extractor)
|
|
@@ -1342,5 +1388,6 @@ export function buildResult(ctx) {
|
|
|
1342
1388
|
...(rawTokenEstimate !== undefined ? { rawTokenEstimate } : {}),
|
|
1343
1389
|
...(tokenSavingsPercent !== undefined ? { tokenSavingsPercent } : {}),
|
|
1344
1390
|
...(fetchResult.autoInteract !== undefined ? { autoInteract: fetchResult.autoInteract } : {}),
|
|
1391
|
+
trust,
|
|
1345
1392
|
};
|
|
1346
1393
|
}
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Domain safety check using Google Safe Browsing Lookup API v4.
|
|
3
|
+
* Free: 10,000 lookups/day.
|
|
4
|
+
* Falls back to a local blocklist when no API key is configured.
|
|
5
|
+
*/
|
|
6
|
+
/** Verdict returned by a URL safety check. */
export interface SafeBrowsingResult {
    /** `true` when the checker reported no threat for the URL. */
    safe: boolean;
    /** Threat identifiers reported for the URL (an empty list accompanies a safe verdict). */
    threats: string[];
    /** Which checker produced this verdict. */
    source: 'google-api' | 'local-blocklist' | 'unchecked';
}
|
|
11
|
+
/**
 * Decide whether a URL looks safe to fetch.
 *
 * When an API key is available (the `apiKey` argument, otherwise the
 * SAFE_BROWSING_API_KEY environment variable), the Google Safe Browsing lookup
 * is raced against a 2-second timer and the local heuristic blocklist verdict
 * is used if the lookup times out or errors. Without a key, only the local
 * heuristic blocklist is consulted.
 *
 * @param url    URL to evaluate.
 * @param apiKey Optional Google Safe Browsing API key; SAFE_BROWSING_API_KEY is used when omitted.
 * @returns The verdict, tagged with the checker that produced it.
 */
export declare function checkUrlSafety(url: string, apiKey?: string): Promise<SafeBrowsingResult>;
|
|
@@ -0,0 +1,183 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Domain safety check using Google Safe Browsing Lookup API v4.
|
|
3
|
+
* Free: 10,000 lookups/day.
|
|
4
|
+
* Falls back to a local blocklist when no API key is configured.
|
|
5
|
+
*/
|
|
6
|
+
// Brand keywords that phishing hostnames frequently imitate.
// Matched against hostnames by the local blocklist heuristics.
const KNOWN_BRANDS = [
    'amazon', 'google', 'facebook', 'apple', 'microsoft',
    'paypal', 'netflix', 'instagram', 'twitter', 'linkedin',
    'dropbox', 'chase', 'wellsfargo', 'bankofamerica', 'citibank',
    'hsbc', 'ebay', 'walmart', 'target', 'bestbuy',
    'fedex', 'ups', 'usps', 'irs', 'dmv',
    'gov', 'yahoo', 'outlook', 'hotmail',
];
// Top-level domains heavily abused for phishing/malware (free-domain registrars).
// Entries carry the leading dot.
const SUSPICIOUS_TLDS = new Set([
    '.tk', '.ml', '.ga', '.cf', '.gq', '.top',
    '.click', '.loan', '.win', '.xyz', '.club', '.work',
]);
|
|
15
|
+
// Private/reserved address ranges (treated as safe for local dev).
// IPv6 patterns are written for the bracket-free textual form, e.g. "fd12:3456::1".
const PRIVATE_IP_RANGES = [
    /^127\.\d+\.\d+\.\d+$/, // IPv4 loopback
    /^10\.\d+\.\d+\.\d+$/, // RFC 1918
    /^192\.168\.\d+\.\d+$/, // RFC 1918
    /^172\.(1[6-9]|2\d|3[01])\.\d+\.\d+$/, // RFC 1918
    /^169\.254\.\d+\.\d+$/, // IPv4 link-local
    /^::1$/, // IPv6 loopback
    // IPv6 unique local addresses: the whole fc00::/7 block (fc00–fdff), not
    // just the literal "fc00:" prefix.
    /^f[cd][0-9a-f]{2}:/i,
    // IPv6 link-local (fe80::/10), the counterpart of 169.254.0.0/16 above.
    /^fe[89ab][0-9a-f]:/i,
];
/**
 * Report whether an IP literal falls in a private, loopback or link-local range.
 *
 * @param host IPv4 dotted quad or IPv6 text; surrounding brackets on an IPv6
 *             literal are tolerated and ignored.
 * @returns true when the address matches one of PRIVATE_IP_RANGES.
 */
function isPrivateIp(host) {
    if (typeof host !== 'string')
        return false;
    // Accept both "::1" and "[::1]" so callers need not pre-strip brackets.
    const bare = host.replace(/^\[|\]$/g, '');
    return PRIVATE_IP_RANGES.some((re) => re.test(bare));
}
|
|
29
|
+
/**
 * Report whether a hostname is a literal IP address rather than a DNS name.
 *
 * @param host Hostname to classify; an IPv6 literal may be bare or bracketed.
 * @returns true for dotted-quad IPv4 text or colon-separated hex IPv6 text.
 */
function isIpAddress(host) {
    // IPv4: four dot-separated groups of 1–3 digits (octet range is not validated here).
    if (/^\d{1,3}(\.\d{1,3}){3}$/.test(host))
        return true;
    // IPv6 (bare or bracketed): only hex digits and colons, with at least one
    // colon. Requiring the colon keeps single-label names made of hex letters
    // (e.g. "cafe", "deadbeef") from being mistaken for addresses.
    const bare = host.replace(/^\[|\]$/g, '');
    return bare.includes(':') && /^[0-9a-f:]+$/i.test(bare);
}
|
|
38
|
+
/**
 * Local heuristic blocklist — catches common attack patterns without an API key.
 *
 * Checks, in order:
 *   1. `data:` URIs                      → DATA_URI (returns immediately)
 *   2. unparseable URLs                  → INVALID_URL (returns immediately)
 *   3. embedded credentials (user@host)  → URL_CREDENTIALS_TRICK (returns immediately)
 *   4. punycode (xn--) non-TLD labels    → PUNYCODE_HOMOGRAPH
 *   5. IP-literal hosts outside the private ranges → SUSPICIOUS_IP
 *   6. brand keyword anywhere in the host on a suspicious TLD → PHISHING
 *   7. three or more hyphens in the second-level label → EXCESSIVE_HYPHENS
 *   8. more than five host labels        → EXCESSIVE_SUBDOMAINS
 *
 * NOTE(review): brand detection is a plain substring match, so short keywords
 * can match inside unrelated words (e.g. 'ups' in 'startups') — confirm whether
 * that trade-off is intended.
 *
 * @param url URL string to inspect.
 * @returns `{ safe, threats, source: 'local-blocklist' }`; `threats` is empty when safe.
 */
function checkLocalBlocklist(url) {
    const flagged = (found) => ({ safe: false, threats: found, source: 'local-blocklist' });
    const clean = () => ({ safe: true, threats: [], source: 'local-blocklist' });
    // 1. Data URIs — always suspicious
    if (/^data:/i.test(url.trim()))
        return flagged(['DATA_URI']);
    let parsed;
    try {
        parsed = new URL(url);
    }
    catch {
        // Unparseable URL — flag as suspicious
        return flagged(['INVALID_URL']);
    }
    // 2. @ sign trick: http://google.com@evil.com/login → username = 'google.com'
    if (parsed.username || parsed.password)
        return flagged(['URL_CREDENTIALS_TRICK']);
    const threats = [];
    // A fully-qualified hostname may end with a dot ("evil.tk."). Left in place,
    // the last label would be empty and the TLD-based checks below would be
    // skipped, so drop it before any label analysis.
    const hostname = parsed.hostname.replace(/\.$/, '');
    // 3. Punycode homograph attacks (xn-- internationalized domains).
    //    The final label is exempt so legitimate IDN TLDs (e.g. .xn--p1ai = .рф) pass.
    const rawLabels = hostname.split('.');
    if (rawLabels.slice(0, -1).some((label) => /^xn--/i.test(label)))
        threats.push('PUNYCODE_HOMOGRAPH');
    // 4. IP-only URLs pointing to non-private ranges. IP hosts skip the
    //    domain-name heuristics entirely.
    if (isIpAddress(hostname)) {
        const bare = hostname.replace(/^\[|\]$/g, ''); // strip brackets from IPv6
        if (!isPrivateIp(bare))
            threats.push('SUSPICIOUS_IP');
        return threats.length > 0 ? flagged(threats) : clean();
    }
    // Remove www prefix for analysis
    const hostNoWww = hostname.toLowerCase().replace(/^www\./, '');
    const parts = hostNoWww.split('.');
    const tld = parts.length >= 2 ? '.' + parts[parts.length - 1] : '';
    const sld = parts.length >= 2 ? parts[parts.length - 2] : '';
    // 5. Brand keyword on a known-bad TLD (amazon-login.tk, paypal.secure.xyz).
    //    The whole host is searched, so brands placed in subdomains are covered
    //    by this single pass.
    if (SUSPICIOUS_TLDS.has(tld) && KNOWN_BRANDS.some((brand) => hostNoWww.includes(brand)))
        threats.push('PHISHING');
    // 6. Excessive hyphens in SLD (amaz0n-login-verify-account.com)
    const hyphenCount = (sld.match(/-/g) || []).length;
    if (hyphenCount >= 3)
        threats.push('EXCESSIVE_HYPHENS');
    // 7. Excessive subdomains: login.secure.verify.account.bank.xyz.com
    if (parts.length > 5)
        threats.push('EXCESSIVE_SUBDOMAINS');
    return threats.length > 0 ? flagged(threats) : clean();
}
|
|
117
|
+
/**
 * Check a URL against the Google Safe Browsing Lookup API v4.
 * Returns null on any error (network timeout, bad key, unexpected payload, etc.)
 * so the caller can fall back.
 *
 * The request is aborted after 2 seconds; the abort timer stays armed until the
 * response body has been read and is always released on exit.
 *
 * @param url    URL to look up.
 * @param apiKey Google Safe Browsing API key (sent as the `key` query parameter).
 * @returns `{ safe, threats, source: 'google-api' }`, or null when no verdict could be obtained.
 */
async function checkGoogleSafeBrowsing(url, apiKey) {
    const endpoint = `https://safebrowsing.googleapis.com/v4/threatMatches:find?key=${encodeURIComponent(apiKey)}`;
    const body = {
        client: { clientId: 'webpeel', clientVersion: '1.0.0' },
        threatInfo: {
            threatTypes: ['MALWARE', 'SOCIAL_ENGINEERING', 'UNWANTED_SOFTWARE', 'POTENTIALLY_HARMFUL_APPLICATION'],
            platformTypes: ['ANY_PLATFORM'],
            threatEntryTypes: ['URL'],
            threatEntries: [{ url }],
        },
    };
    const controller = new AbortController();
    const timeoutId = setTimeout(() => controller.abort(), 2000);
    try {
        const resp = await fetch(endpoint, {
            method: 'POST',
            headers: { 'Content-Type': 'application/json' },
            body: JSON.stringify(body),
            signal: controller.signal,
        });
        if (!resp.ok)
            return null;
        const data = await resp.json();
        // No `matches` field (or an empty list) means nothing was reported for the URL.
        if (!data.matches || data.matches.length === 0) {
            return { safe: true, threats: [], source: 'google-api' };
        }
        // Anything other than a list of matches is an unexpected payload: report
        // "no verdict" so the caller falls back instead of trusting it.
        if (!Array.isArray(data.matches))
            return null;
        const threats = [...new Set(data.matches.map((m) => m.threatType))];
        return { safe: false, threats, source: 'google-api' };
    }
    catch {
        // Deliberate best-effort: network failure, abort or malformed JSON all
        // map to "no verdict".
        return null;
    }
    finally {
        clearTimeout(timeoutId);
    }
}
|
|
156
|
+
/**
 * Check URL safety.
 *
 * Flow:
 *   1. If SAFE_BROWSING_API_KEY (or passed apiKey) is set, race Google API vs 2s timeout.
 *      Falls back to local blocklist on timeout or error.
 *   2. Without an API key, use local heuristic blocklist only.
 *
 * The race timer is cancelled as soon as the race settles, so a fast API answer
 * does not leave a pending 2-second timer behind.
 *
 * @param url    The URL to check
 * @param apiKey Google Safe Browsing API key (optional). Falls back to SAFE_BROWSING_API_KEY env var.
 * @returns The Google verdict when one arrives in time, otherwise the local blocklist verdict.
 */
export async function checkUrlSafety(url, apiKey) {
    const key = apiKey ?? process.env.SAFE_BROWSING_API_KEY;
    // No API key — local blocklist only
    if (!key)
        return checkLocalBlocklist(url);
    // Computed up front so it is ready if the API times out or errors.
    const localResult = checkLocalBlocklist(url);
    let raceTimer;
    const deadline = new Promise((resolve) => {
        raceTimer = setTimeout(() => resolve(null), 2000);
    });
    try {
        const googleResult = await Promise.race([
            checkGoogleSafeBrowsing(url, key),
            deadline,
        ]);
        if (googleResult !== null)
            return googleResult;
        // API timed out or errored — use local blocklist result
        return localResult;
    }
    finally {
        clearTimeout(raceTimer);
    }
}
|
|
@@ -1,17 +1,29 @@
|
|
|
1
1
|
/**
|
|
2
|
-
* Source credibility scoring — lightweight, zero dependencies.
|
|
2
|
+
* Source credibility scoring — lightweight, zero dependencies, no network calls.
|
|
3
3
|
*
|
|
4
|
-
*
|
|
5
|
-
* -
|
|
6
|
-
* -
|
|
7
|
-
* -
|
|
4
|
+
* Actively investigates domain signals from the URL itself:
|
|
5
|
+
* - TLD trust score
|
|
6
|
+
* - HTTPS enforcement
|
|
7
|
+
* - Domain structure analysis
|
|
8
|
+
* - Brand/platform recognition (500+ known domains)
|
|
9
|
+
* - Content platform detection
|
|
10
|
+
*
|
|
11
|
+
* Score breakdown (0–100):
|
|
12
|
+
* TLD weight 0–20
|
|
13
|
+
* HTTPS 0–10
|
|
14
|
+
* Known domain 0–40
|
|
15
|
+
* Structure 0–15
|
|
16
|
+
* Platform 0–15
|
|
8
17
|
*/
|
|
9
18
|
/** Credibility assessment of a source URL. */
export interface SourceCredibility {
    /** Credibility bucket assigned to the source. */
    tier: 'official' | 'established' | 'community' | 'new' | 'suspicious';
    /** Numeric credibility score on the 0–100 scale described in the file header. */
    score: number;
    /** Display label for the assessment. */
    label: string;
    /** Signals recorded for the assessment. */
    signals: string[];
    /** Warnings raised for the source. */
    warnings: string[];
}
|
|
14
25
|
/**
 * Assess the credibility of a source URL.
 * Fully synchronous — no network calls.
 *
 * @param url URL of the source to assess.
 * @returns The credibility assessment (tier, score, label, signals, warnings).
 */
export declare function getSourceCredibility(url: string): SourceCredibility;
|
|
@@ -1,83 +1,584 @@
|
|
|
1
1
|
/**
|
|
2
|
-
* Source credibility scoring — lightweight, zero dependencies.
|
|
2
|
+
* Source credibility scoring — lightweight, zero dependencies, no network calls.
|
|
3
3
|
*
|
|
4
|
-
*
|
|
5
|
-
* -
|
|
6
|
-
* -
|
|
7
|
-
* -
|
|
4
|
+
* Actively investigates domain signals from the URL itself:
|
|
5
|
+
* - TLD trust score
|
|
6
|
+
* - HTTPS enforcement
|
|
7
|
+
* - Domain structure analysis
|
|
8
|
+
* - Brand/platform recognition (500+ known domains)
|
|
9
|
+
* - Content platform detection
|
|
10
|
+
*
|
|
11
|
+
* Score breakdown (0–100):
|
|
12
|
+
* TLD weight 0–20
|
|
13
|
+
* HTTPS 0–10
|
|
14
|
+
* Known domain 0–40
|
|
15
|
+
* Structure 0–15
|
|
16
|
+
* Platform 0–15
|
|
8
17
|
*/
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
'
|
|
14
|
-
'
|
|
15
|
-
'
|
|
18
|
+
// ---------------------------------------------------------------------------
// TLD trust map — points awarded per top-level domain (0–20).
// Keys carry the leading dot.
// ---------------------------------------------------------------------------
const TLD_TRUST = {
    '.gov': 20,
    '.edu': 20,
    '.mil': 20,
    '.org': 14,
    '.net': 12,
    '.com': 12,
    '.io': 11,
    '.co': 10, '.us': 10, '.uk': 10, '.ca': 10, '.au': 10, '.de': 10,
    '.fr': 10, '.jp': 10, '.br': 10, '.in': 10,
    '.eu': 11,
    '.int': 15,
    '.info': 8, '.biz': 7, '.me': 8, '.tv': 8,
    '.app': 10, '.dev': 10, '.ai': 10,
    '.tech': 8, '.page': 8, '.blog': 7, '.news': 8, '.media': 8, '.press': 8,
    '.shop': 7, '.store': 7, '.online': 7,
    '.site': 6, '.website': 6, '.space': 5, '.club': 5, '.pro': 7,
    // Low-trust freebies
    '.tk': 1, '.ml': 1, '.ga': 1, '.cf': 1, '.gq': 1,
    '.xyz': 4, '.top': 3, '.loan': 2, '.click': 3, '.link': 4,
    '.win': 2, '.bid': 2, '.download': 2, '.racing': 2, '.review': 4,
    '.cc': 3, '.pw': 3, '.men': 2, '.party': 2, '.stream': 3,
};
|
|
38
|
+
// ---------------------------------------------------------------------------
// High-risk TLDs (free or cheap registrations commonly used in phishing)
// ---------------------------------------------------------------------------
const SUSPICIOUS_TLDS = new Set([
    '.tk', '.ml', '.ga', '.cf', '.gq',
    '.win', '.bid', '.men', '.party', '.loan',
]);
// ---------------------------------------------------------------------------
// TLDs treated as official
// ---------------------------------------------------------------------------
const OFFICIAL_TLDS = new Set([
    '.gov',
    '.edu',
    '.mil',
    '.int',
]);
|
|
46
|
+
// ---------------------------------------------------------------------------
// Hostnames treated as official even though their TLD alone is not
// (.gov/.edu/.mil are covered by the TLD check)
// ---------------------------------------------------------------------------
const OFFICIAL_DOMAINS = new Set([
    // International organisations
    'who.int', 'un.org', 'worldbank.org', 'imf.org',
    'oecd.org', 'europa.eu', 'nato.int', 'wto.org',
    'unicef.org', 'unhcr.org', 'icrc.org',
    // Academic / research
    'arxiv.org', 'pubmed.ncbi.nlm.nih.gov', 'ncbi.nlm.nih.gov',
    'jstor.org', 'nature.com', 'science.org',
    'cell.com', 'nejm.org', 'bmj.com',
    'thelancet.com', 'plos.org', 'springer.com',
    'elsevier.com', 'scholar.google.com', 'researchgate.net',
    'semanticscholar.org', 'acm.org', 'ieee.org',
    // Official tech documentation
    'docs.python.org', 'developer.mozilla.org', 'nodejs.org',
    'rust-lang.org', 'docs.microsoft.com', 'learn.microsoft.com',
    'developer.apple.com', 'developer.android.com', 'php.net',
    'ruby-lang.org', 'golang.org', 'go.dev',
    // Health
    'mayoclinic.org', 'clevelandclinic.org', 'webmd.com',
    // Standards / specs
    'w3.org', 'ietf.org', 'rfc-editor.org',
    'iso.org', 'ecma-international.org',
]);
|
|
68
|
+
// ---------------------------------------------------------------------------
|
|
69
|
+
// Established domains (score bonus 40 pts) — 500+ entries
|
|
70
|
+
// ---------------------------------------------------------------------------
|
|
71
|
+
const ESTABLISHED_DOMAINS = new Set([
|
|
72
|
+
// ── Major Tech ──────────────────────────────────────────────────────────
|
|
73
|
+
'google.com', 'apple.com', 'microsoft.com', 'amazon.com', 'meta.com',
|
|
74
|
+
'netflix.com', 'spotify.com', 'adobe.com', 'salesforce.com', 'oracle.com',
|
|
75
|
+
'ibm.com', 'intel.com', 'nvidia.com', 'amd.com', 'qualcomm.com',
|
|
76
|
+
'cisco.com', 'vmware.com', 'sap.com', 'servicenow.com', 'workday.com',
|
|
77
|
+
'zoom.us', 'slack.com', 'dropbox.com', 'box.com', 'atlassian.com',
|
|
78
|
+
'jira.atlassian.com', 'confluence.atlassian.com',
|
|
79
|
+
'twilio.com', 'sendgrid.com', 'mailchimp.com', 'hubspot.com',
|
|
80
|
+
'zendesk.com', 'intercom.com', 'freshworks.com', 'docusign.com',
|
|
81
|
+
'okta.com', 'auth0.com', 'cloudflare.com', 'fastly.com', 'akamai.com',
|
|
82
|
+
'digitalocean.com', 'linode.com', 'vultr.com',
|
|
83
|
+
'datadog.com', 'newrelic.com', 'splunk.com', 'elastic.co',
|
|
84
|
+
'mongodb.com', 'redis.io', 'postgresql.org', 'mysql.com',
|
|
85
|
+
'docker.com', 'kubernetes.io', 'helm.sh',
|
|
86
|
+
'terraform.io', 'ansible.com', 'chef.io', 'puppet.com',
|
|
87
|
+
'heroku.com', 'render.com', 'railway.app', 'fly.io',
|
|
88
|
+
'supabase.com', 'planetscale.com', 'neon.tech', 'fauna.com',
|
|
89
|
+
'firebase.google.com', 'expo.dev',
|
|
90
|
+
'openai.com', 'anthropic.com', 'cohere.com', 'huggingface.co',
|
|
91
|
+
'stability.ai', 'midjourney.com', 'replicate.com',
|
|
92
|
+
'figma.com', 'sketch.com', 'invisionapp.com', 'zeplin.io',
|
|
93
|
+
'notion.so', 'airtable.com', 'monday.com', 'asana.com', 'clickup.com',
|
|
94
|
+
'trello.com', 'basecamp.com', 'linear.app', 'shortcut.com',
|
|
95
|
+
'postman.com', 'insomnia.rest', 'swagger.io',
|
|
96
|
+
'sentry.io', 'bugsnag.com', 'rollbar.com',
|
|
97
|
+
'segment.com', 'mixpanel.com', 'amplitude.com', 'heap.io',
|
|
98
|
+
'looker.com', 'tableau.com', 'powerbi.microsoft.com',
|
|
99
|
+
'snowflake.com', 'databricks.com', 'dbt.com', 'fivetran.com', 'airbyte.com',
|
|
100
|
+
'vercel.com', 'netlify.com',
|
|
101
|
+
// ── Cloud / Hosting ──────────────────────────────────────────────────────
|
|
102
|
+
'aws.amazon.com', 'cloud.google.com', 'azure.microsoft.com',
|
|
103
|
+
'docs.aws.amazon.com', 'console.aws.amazon.com',
|
|
104
|
+
// ── Developer Ecosystems ──────────────────────────────────────────────────
|
|
105
|
+
'github.com', 'gitlab.com', 'bitbucket.org', 'sourcehut.com',
|
|
106
|
+
'stackoverflow.com', 'superuser.com', 'serverfault.com',
|
|
107
|
+
'npmjs.com', 'pypi.org', 'crates.io', 'packagist.org', 'rubygems.org',
|
|
108
|
+
'nuget.org', 'pub.dev', 'hex.pm', 'opam.ocaml.org',
|
|
109
|
+
'docs.rs', 'crates.io', 'pkg.go.dev',
|
|
110
|
+
'codepen.io', 'jsfiddle.net', 'replit.com', 'glitch.com', 'codesandbox.io',
|
|
111
|
+
'leetcode.com', 'hackerrank.com', 'codewars.com', 'exercism.org',
|
|
112
|
+
'regex101.com', 'regexr.com',
|
|
113
|
+
// ── Major Social ──────────────────────────────────────────────────────────
|
|
114
|
+
'twitter.com', 'x.com', 'reddit.com', 'linkedin.com', 'instagram.com',
|
|
115
|
+
'facebook.com', 'youtube.com', 'tiktok.com', 'snapchat.com', 'pinterest.com',
|
|
116
|
+
'tumblr.com', 'mastodon.social', 'threads.net', 'discord.com', 'discord.gg',
|
|
117
|
+
'twitch.tv', 'kick.com', 'vimeo.com', 'dailymotion.com',
|
|
118
|
+
'quora.com', 'medium.com', 'substack.com', 'hashnode.com', 'dev.to',
|
|
119
|
+
// ── Major News ────────────────────────────────────────────────────────────
|
|
120
|
+
'nytimes.com', 'washingtonpost.com', 'theguardian.com', 'bbc.com', 'bbc.co.uk',
|
|
121
|
+
'reuters.com', 'apnews.com', 'bloomberg.com', 'economist.com', 'ft.com',
|
|
122
|
+
'wsj.com', 'cnn.com', 'foxnews.com', 'msnbc.com', 'nbcnews.com',
|
|
123
|
+
'cbsnews.com', 'abcnews.go.com', 'npr.org', 'pbs.org',
|
|
124
|
+
'time.com', 'usatoday.com', 'huffpost.com', 'vox.com', 'axios.com',
|
|
125
|
+
'politico.com', 'thehill.com', 'rollcall.com', 'slate.com', 'salon.com',
|
|
126
|
+
'theatlantic.com', 'newyorker.com', 'newrepublic.com',
|
|
127
|
+
'motherjones.com', 'propublica.org', 'intercept.co',
|
|
128
|
+
'aljazeera.com', 'dw.com', 'france24.com', 'rt.com',
|
|
129
|
+
'spiegel.de', 'lemonde.fr', 'liberation.fr', 'lefigaro.fr',
|
|
130
|
+
'elpais.com', 'elmundo.es', 'repubblica.it', 'corriere.it',
|
|
131
|
+
'theglobeandmail.com', 'thestar.com', 'nationalpost.com',
|
|
132
|
+
'smh.com.au', 'theage.com.au', 'abc.net.au',
|
|
133
|
+
'timesofindia.com', 'hindustantimes.com', 'thehindu.com', 'ndtv.com',
|
|
134
|
+
'scmp.com', 'channelnewsasia.com', 'straitstimes.com',
|
|
135
|
+
'haaretz.com', 'timesofisrael.com', 'jpost.com',
|
|
136
|
+
'techcrunch.com', 'wired.com', 'arstechnica.com', 'theverge.com',
|
|
137
|
+
'engadget.com', 'gizmodo.com', 'cnet.com', 'pcmag.com', 'tomshardware.com',
|
|
138
|
+
'anandtech.com', 'macrumors.com', '9to5mac.com', '9to5google.com',
|
|
139
|
+
'androidcentral.com', 'windowscentral.com',
|
|
140
|
+
'venturebeat.com', 'businessinsider.com', 'forbes.com', 'fortune.com',
|
|
141
|
+
'inc.com', 'entrepreneur.com', 'fastcompany.com',
|
|
142
|
+
// ── Finance ───────────────────────────────────────────────────────────────
|
|
143
|
+
'chase.com', 'bankofamerica.com', 'wellsfargo.com', 'citibank.com',
|
|
144
|
+
'capitalone.com', 'usbank.com', 'tdbank.com', 'pnc.com',
|
|
145
|
+
'americanexpress.com', 'discover.com', 'synchrony.com',
|
|
146
|
+
'paypal.com', 'stripe.com', 'square.com', 'braintree.com', 'adyen.com',
|
|
147
|
+
'coinbase.com', 'binance.com', 'kraken.com', 'gemini.com', 'crypto.com',
|
|
148
|
+
'robinhood.com', 'etrade.com', 'schwab.com', 'fidelity.com',
|
|
149
|
+
'vanguard.com', 'blackrock.com', 'jpmorgan.com', 'goldmansachs.com',
|
|
150
|
+
'morganstanley.com', 'ubs.com', 'credit-suisse.com', 'hsbc.com',
|
|
151
|
+
'barclays.com', 'lloydsbank.com', 'natwest.com', 'santander.com',
|
|
152
|
+
'transferwise.com', 'wise.com', 'revolut.com', 'monzo.com',
|
|
153
|
+
'quickbooks.intuit.com', 'turbotax.intuit.com', 'mint.com', 'hrblock.com',
|
|
154
|
+
'experian.com', 'equifax.com', 'transunion.com',
|
|
155
|
+
// ── E-commerce / Retail ──────────────────────────────────────────────────
|
|
156
|
+
'amazon.com', 'ebay.com', 'etsy.com', 'walmart.com', 'target.com',
|
|
157
|
+
'bestbuy.com', 'costco.com', 'homedepot.com', 'lowes.com', 'wayfair.com',
|
|
158
|
+
'shopify.com', 'bigcommerce.com', 'woocommerce.com', 'squarespace.com',
|
|
159
|
+
'overstock.com', 'newegg.com', 'bhphotovideo.com', 'adorama.com',
|
|
160
|
+
'aliexpress.com', 'alibaba.com', 'wish.com', 'dhgate.com',
|
|
161
|
+
'zappos.com', 'nordstrom.com', 'macys.com', 'bloomingdales.com', 'gap.com',
|
|
162
|
+
'nike.com', 'adidas.com', 'reebok.com', 'underarmour.com', 'lululemon.com',
|
|
163
|
+
'ikea.com', 'crate.com', 'potterybarn.com', 'williams-sonoma.com',
|
|
164
|
+
'chewy.com', 'petco.com', 'petsmart.com',
|
|
165
|
+
'instacart.com', 'doordash.com', 'ubereats.com', 'grubhub.com',
|
|
166
|
+
'opentable.com', 'yelp.com', 'tripadvisor.com',
|
|
167
|
+
// ── Travel ────────────────────────────────────────────────────────────────
|
|
168
|
+
'booking.com', 'expedia.com', 'airbnb.com', 'vrbo.com', 'kayak.com',
|
|
169
|
+
'hotels.com', 'priceline.com', 'orbitz.com', 'travelocity.com',
|
|
170
|
+
'delta.com', 'united.com', 'aa.com', 'southwest.com', 'jetblue.com',
|
|
171
|
+
'marriott.com', 'hilton.com', 'hyatt.com', 'ihg.com', 'wyndham.com',
|
|
172
|
+
'uber.com', 'lyft.com', 'waymo.com',
|
|
173
|
+
// ── Education ─────────────────────────────────────────────────────────────
|
|
174
|
+
'coursera.org', 'edx.org', 'khanacademy.org', 'udemy.com', 'udacity.com',
|
|
175
|
+
'pluralsight.com', 'lynda.com', 'linkedin.com', 'skillshare.com',
|
|
176
|
+
'codecademy.com', 'freecodecamp.org', 'theodinproject.com',
|
|
177
|
+
'brilliant.org', 'duolingo.com', 'babbel.com', 'rosettastone.com',
|
|
178
|
+
'cambridgeinternational.org',
|
|
179
|
+
'britannica.com', 'encyclopedia.com',
|
|
180
|
+
// ── Reference / Knowledge ─────────────────────────────────────────────────
|
|
181
|
+
'wikipedia.org', 'wikimedia.org', 'wikihow.com', 'wikidata.org',
|
|
182
|
+
'imdb.com', 'rottentomatoes.com', 'metacritic.com', 'goodreads.com',
|
|
183
|
+
'nationalgeographic.com', 'smithsonianmag.com', 'history.com',
|
|
184
|
+
'wolframalpha.com', 'dictionary.com', 'merriam-webster.com',
|
|
185
|
+
'etymonline.com', 'thesaurus.com',
|
|
186
|
+
'archive.org', 'waybackmachine.org',
|
|
187
|
+
// ── Health ────────────────────────────────────────────────────────────────
|
|
188
|
+
'webmd.com', 'mayoclinic.org', 'clevelandclinic.org', 'healthline.com',
|
|
189
|
+
'medicalnewstoday.com', 'everydayhealth.com', 'drugs.com',
|
|
190
|
+
'rxlist.com', 'medscape.com', 'uptodate.com', 'emedicinehealth.com',
|
|
191
|
+
'psych.org', 'nami.org', 'betterhelp.com', 'talkspace.com',
|
|
192
|
+
// ── Legal ─────────────────────────────────────────────────────────────────
|
|
193
|
+
'law.cornell.edu', 'justia.com', 'findlaw.com', 'nolo.com', 'avvo.com',
|
|
194
|
+
'legalzoom.com', 'rocket lawyer.com',
|
|
195
|
+
// ── Government / Civic (beyond TLD) ──────────────────────────────────────
|
|
196
|
+
'gov.uk', 'gc.ca', 'australia.gov.au',
|
|
197
|
+
// ── Open Source / Misc Tech ──────────────────────────────────────────────
|
|
198
|
+
'linux.org', 'kernel.org', 'gnu.org', 'apache.org', 'mozilla.org',
|
|
199
|
+
'python.org', 'perl.org', 'haskell.org',
|
|
200
|
+
'jquery.com', 'reactjs.org', 'react.dev', 'vuejs.org', 'angular.io',
|
|
201
|
+
'svelte.dev', 'nextjs.org', 'nuxtjs.org', 'remix.run', 'astro.build',
|
|
202
|
+
'tailwindcss.com', 'getbootstrap.com', 'mui.com', 'chakra-ui.com',
|
|
203
|
+
'styled-components.com', 'emotion.sh',
|
|
204
|
+
'vitejs.dev', 'webpack.js.org', 'rollupjs.org', 'esbuild.github.io',
|
|
205
|
+
'babeljs.io', 'eslint.org', 'prettier.io', 'typescript.dev',
|
|
206
|
+
'typescriptlang.org', 'deno.com', 'deno.land', 'bun.sh',
|
|
207
|
+
'expressjs.com', 'fastify.io', 'nestjs.com', 'koajs.com', 'hapi.dev',
|
|
208
|
+
'graphql.org', 'apollographql.com', 'trpc.io', 'grpc.io',
|
|
209
|
+
'prisma.io', 'drizzle.team', 'typeorm.io', 'sequelize.org',
|
|
210
|
+
'socket.io', 'feathersjs.com',
|
|
211
|
+
'git-scm.com', 'gitkraken.com',
|
|
212
|
+
'homebrew.sh', 'brew.sh', 'chocolatey.org', 'scoop.sh', 'winget.run',
|
|
213
|
+
'ubuntu.com', 'debian.org', 'fedoraproject.org', 'archlinux.org',
|
|
214
|
+
'redhat.com', 'suse.com', 'centos.org',
|
|
215
|
+
// ── Security / Privacy ───────────────────────────────────────────────────
|
|
216
|
+
'haveibeenpwned.com', 'virustotal.com', '1password.com', 'bitwarden.com',
|
|
217
|
+
'lastpass.com', 'dashlane.com', 'nordvpn.com', 'expressvpn.com',
|
|
218
|
+
'protonmail.com', 'proton.me', 'tutanota.com', 'fastmail.com',
|
|
219
|
+
'letsencrypt.org', 'ssllabs.com', 'namecheap.com', 'godaddy.com',
|
|
220
|
+
'porkbun.com', 'cloudflare.com', 'dnschecker.org',
|
|
221
|
+
// ── Search ────────────────────────────────────────────────────────────────
|
|
222
|
+
'google.com', 'bing.com', 'yahoo.com', 'duckduckgo.com', 'brave.com',
|
|
223
|
+
'startpage.com', 'ecosia.org', 'kagi.com',
|
|
224
|
+
// ── Productivity ─────────────────────────────────────────────────────────
|
|
225
|
+
'gmail.com', 'outlook.com', 'office.com', 'office365.com',
|
|
226
|
+
'docs.google.com', 'drive.google.com', 'calendar.google.com',
|
|
227
|
+
'maps.google.com', 'translate.google.com',
|
|
228
|
+
'evernote.com', 'onenote.com', 'bear.app', 'obsidian.md',
|
|
229
|
+
'cal.com', 'calendly.com', 'doodle.com', 'when2meet.com',
|
|
230
|
+
'loom.com', 'screen.studio', 'cleanshot.com',
|
|
231
|
+
'canva.com', 'unsplash.com', 'pexels.com', 'pixabay.com',
|
|
232
|
+
'shutterstock.com', 'gettyimages.com', 'istockphoto.com',
|
|
233
|
+
'giphy.com', 'tenor.com',
|
|
234
|
+
// ── Music / Media ─────────────────────────────────────────────────────────
|
|
235
|
+
'soundcloud.com', 'bandcamp.com', 'last.fm', 'allmusic.com',
|
|
236
|
+
'discogs.com', 'genius.com', 'azlyrics.com', 'musixmatch.com',
|
|
237
|
+
'hulu.com', 'disneyplus.com', 'hbomax.com', 'max.com',
|
|
238
|
+
'peacocktv.com', 'paramount.com', 'crunchyroll.com', 'funimation.com',
|
|
239
|
+
'apple.com', 'music.apple.com',
|
|
240
|
+
// ── Gaming ───────────────────────────────────────────────────────────────
|
|
241
|
+
'steam.com', 'steampowered.com', 'epicgames.com', 'gog.com',
|
|
242
|
+
'itch.io', 'roblox.com', 'minecraft.net', 'ea.com',
|
|
243
|
+
'activision.com', 'blizzard.com', 'battle.net', 'ubisoft.com',
|
|
244
|
+
'nintendo.com', 'playstation.com', 'xbox.com',
|
|
245
|
+
'ign.com', 'gamespot.com', 'kotaku.com', 'polygon.com',
|
|
246
|
+
'pcgamer.com', 'rockpapershotgun.com',
|
|
247
|
+
// ── Science / Research ───────────────────────────────────────────────────
|
|
248
|
+
'nasa.gov', 'esa.int', 'noaa.gov', 'nist.gov', 'usgs.gov',
|
|
249
|
+
'epa.gov', 'energy.gov', 'nsf.gov',
|
|
250
|
+
'acs.org', 'aps.org', 'aip.org', 'ams.org',
|
|
251
|
+
'newsweek.com', 'scientificamerican.com', 'popularmechanics.com',
|
|
252
|
+
'livescience.com', 'space.com', 'phys.org', 'sciencedaily.com',
|
|
253
|
+
'technologyreview.com',
|
|
254
|
+
// ── Mapping / Location ────────────────────────────────────────────────────
|
|
255
|
+
'openstreetmap.org', 'mapbox.com', 'here.com', 'waze.com',
|
|
256
|
+
'zillow.com', 'redfin.com', 'realtor.com', 'trulia.com', 'apartments.com',
|
|
257
|
+
// ── HR / Recruiting ───────────────────────────────────────────────────────
|
|
258
|
+
'indeed.com', 'glassdoor.com', 'monster.com', 'ziprecruiter.com',
|
|
259
|
+
'careerbuilder.com', 'simplyhired.com', 'flexjobs.com', 'remote.com',
|
|
260
|
+
'levels.fyi', 'teamblind.com', 'angellist.com', 'wellfound.com',
|
|
261
|
+
// ── Misc established ─────────────────────────────────────────────────────
|
|
262
|
+
'hbr.org', 'mckinsey.com', 'bcg.com', 'bain.com', 'deloitte.com',
|
|
263
|
+
'pwc.com', 'kpmg.com', 'ey.com', 'accenture.com',
|
|
264
|
+
'gartner.com', 'idc.com', 'forrester.com',
|
|
265
|
+
'ted.com', 'masterclass.com',
|
|
266
|
+
'change.org', 'gofundme.com', 'kickstarter.com', 'indiegogo.com',
|
|
267
|
+
'patreon.com', 'ko-fi.com', 'buymeacoffee.com',
|
|
268
|
+
'webpeel.dev',
|
|
27
269
|
]);
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
'
|
|
33
|
-
'
|
|
34
|
-
'
|
|
35
|
-
|
|
36
|
-
'
|
|
37
|
-
'
|
|
38
|
-
|
|
39
|
-
'
|
|
40
|
-
'
|
|
41
|
-
|
|
42
|
-
'
|
|
43
|
-
|
|
44
|
-
'
|
|
270
|
+
// ---------------------------------------------------------------------------
|
|
271
|
+
// Community / content platforms — user content hosted on established infra
|
|
272
|
+
// ---------------------------------------------------------------------------
|
|
273
|
+
/**
 * Hostname → human-readable label for platforms that host user-generated
 * content. Keys are bare hostnames (no "www."); lookups use `.get(hostname)`.
 * Entry order is kept stable so iteration order matches the declaration order.
 */
const COMMUNITY_PLATFORMS = new Map(Object.entries({
    // Code hosting
    'github.com': 'Community Content on GitHub',
    'github.io': 'Personal Site on GitHub Pages',
    'gitlab.com': 'Community Content on GitLab',
    // Blogging / newsletters
    'medium.com': 'Article on Medium',
    'substack.com': 'Newsletter on Substack',
    'hashnode.com': 'Blog on Hashnode',
    'dev.to': 'Article on DEV Community',
    'wordpress.com': 'Blog on WordPress',
    'blogspot.com': 'Blog on Blogger',
    'blogger.com': 'Blog on Blogger',
    'tumblr.com': 'Blog on Tumblr',
    // Site builders
    'weebly.com': 'Site on Weebly',
    'wix.com': 'Site on Wix',
    'squarespace.com': 'Site on Squarespace',
    'webflow.io': 'Site on Webflow',
    // App / deployment hosting
    'vercel.app': 'Deployed Project on Vercel',
    'netlify.app': 'Deployed Project on Netlify',
    'pages.dev': 'Deployed Project on Cloudflare Pages',
    'web.app': 'Firebase Hosted App',
    'firebaseapp.com': 'Firebase Hosted App',
    'herokuapp.com': 'App on Heroku',
    // Sandboxes / playgrounds
    'replit.dev': 'Project on Replit',
    'glitch.me': 'Project on Glitch',
    'codesandbox.io': 'Sandbox on CodeSandbox',
    'stackblitz.com': 'Project on StackBlitz',
    'codepen.io': 'Pen on CodePen',
    'jsfiddle.net': 'Fiddle on JSFiddle',
    // Hosted docs / notes
    'notion.site': 'Notion Page',
    'gitbook.io': 'Docs on GitBook',
    'gitbook.com': 'Docs on GitBook',
    'readthedocs.io': 'Docs on Read the Docs',
    'readthedocs.org': 'Docs on Read the Docs',
    // Discussion / Q&A
    'reddit.com': 'Community Discussion on Reddit',
    'news.ycombinator.com': 'Discussion on Hacker News',
    'quora.com': 'Answer on Quora',
    'stackoverflow.com': 'Answer on Stack Overflow',
    'stackexchange.com': 'Answer on Stack Exchange',
    'producthunt.com': 'Launch on Product Hunt',
    'indiehackers.com': 'Post on Indie Hackers',
    'hackernoon.com': 'Article on HackerNoon',
    'lobste.rs': 'Discussion on Lobsters',
    'lobsters.rs': 'Discussion on Lobsters',
    // Social
    'twitter.com': 'Post on X (Twitter)',
    'x.com': 'Post on X (Twitter)',
    'linkedin.com': 'Post on LinkedIn',
    // Video / audio
    'youtube.com': 'Video on YouTube',
    'vimeo.com': 'Video on Vimeo',
    'twitch.tv': 'Stream on Twitch',
    'soundcloud.com': 'Audio on SoundCloud',
    'bandcamp.com': 'Music on Bandcamp',
    // Image / short-form
    'pinterest.com': 'Pin on Pinterest',
    'instagram.com': 'Post on Instagram',
    'tiktok.com': 'Video on TikTok',
}));
|
|
328
|
+
// ---------------------------------------------------------------------------
|
|
329
|
+
// Brand-category labels for established domains
|
|
330
|
+
// ---------------------------------------------------------------------------
|
|
331
|
+
/**
 * Brand-category label for selected well-known domains, keyed by bare
 * hostname (no "www.").
 *
 * The table is indexed with strings derived from arbitrary URLs, so it is
 * built on a prototype-less object: a lookup such as
 * `DOMAIN_CATEGORY['constructor']` or `DOMAIN_CATEGORY['toString']` yields
 * `undefined` instead of an inherited `Object.prototype` member.
 * Bracket lookups (`DOMAIN_CATEGORY[key] ?? fallback`) work as before.
 */
const DOMAIN_CATEGORY = Object.assign(Object.create(null), {
    // Tech
    'google.com': 'Established Technology Company',
    'apple.com': 'Established Technology Company',
    'microsoft.com': 'Established Technology Company',
    'amazon.com': 'Established E-commerce & Cloud Platform',
    'meta.com': 'Established Technology Company',
    'netflix.com': 'Established Streaming Service',
    'spotify.com': 'Established Music Streaming Service',
    'openai.com': 'Established AI Research Company',
    'anthropic.com': 'Established AI Research Company',
    'github.com': 'Established Developer Platform',
    'gitlab.com': 'Established Developer Platform',
    'stackoverflow.com': 'Established Developer Q&A Platform',
    'npmjs.com': 'Established Package Registry',
    'pypi.org': 'Established Package Registry',
    'docker.com': 'Established Container Platform',
    'vercel.com': 'Established Hosting Platform',
    'netlify.com': 'Established Hosting Platform',
    'cloudflare.com': 'Established CDN & Security Provider',
    'figma.com': 'Established Design Platform',
    'notion.so': 'Established Productivity Platform',
    'slack.com': 'Established Business Communication Platform',
    'zoom.us': 'Established Video Communication Platform',
    'adobe.com': 'Established Creative Software Company',
    // News
    'nytimes.com': 'Established News Organization',
    'washingtonpost.com': 'Established News Organization',
    'theguardian.com': 'Established News Organization',
    'bbc.com': 'Established News Organization',
    'bbc.co.uk': 'Established News Organization',
    'reuters.com': 'Established News Agency',
    'apnews.com': 'Established News Agency',
    'bloomberg.com': 'Established Financial News Organization',
    'economist.com': 'Established News Publication',
    'ft.com': 'Established Financial News Organization',
    'wsj.com': 'Established Financial News Organization',
    'cnn.com': 'Established News Organization',
    'npr.org': 'Established Public Radio',
    'techcrunch.com': 'Established Technology News Publication',
    'wired.com': 'Established Technology News Publication',
    'arstechnica.com': 'Established Technology News Publication',
    'theverge.com': 'Established Technology News Publication',
    // Finance
    'paypal.com': 'Established Payment Platform',
    'stripe.com': 'Established Payment Platform',
    'square.com': 'Established Payment Platform',
    'coinbase.com': 'Established Cryptocurrency Exchange',
    'chase.com': 'Established Financial Institution',
    'bankofamerica.com': 'Established Financial Institution',
    'wellsfargo.com': 'Established Financial Institution',
    // E-commerce
    'ebay.com': 'Established E-commerce Marketplace',
    'etsy.com': 'Established Handmade Marketplace',
    'walmart.com': 'Established Retail Company',
    'target.com': 'Established Retail Company',
    'bestbuy.com': 'Established Electronics Retailer',
    'shopify.com': 'Established E-commerce Platform',
    // Education
    'coursera.org': 'Established Online Education Platform',
    'edx.org': 'Established Online Education Platform',
    'khanacademy.org': 'Non-Profit Education Platform',
    'udemy.com': 'Established Online Learning Marketplace',
    'britannica.com': 'Established Reference Encyclopedia',
    'wikipedia.org': 'Open Encyclopedia (Community Edited)',
    // Reference
    'archive.org': 'Established Digital Archive',
    'wolframalpha.com': 'Established Computational Knowledge Engine',
    'imdb.com': 'Established Movie & TV Database',
});
|
|
401
|
+
// ---------------------------------------------------------------------------
|
|
402
|
+
// Helpers
|
|
403
|
+
// ---------------------------------------------------------------------------
|
|
404
|
+
/**
 * Return the top-level domain of a hostname, including its leading dot
 * (e.g. "www.example.com" → ".com").
 *
 * Returns the empty string when no TLD can be determined: single-label
 * hosts ("localhost"), bracketed IPv6 literals, and IPv4 literals.
 *
 * @param hostname Hostname as produced by `URL#hostname`.
 * @returns The last label prefixed with ".", or "".
 */
function extractTLD(hostname) {
    // The URL parser preserves a trailing root dot ("example.com."); without
    // stripping it the last label is empty and the TLD would be reported as ".".
    const labels = hostname.replace(/\.+$/, '').split('.');
    if (labels.length < 2)
        return '';
    const lastLabel = labels[labels.length - 1];
    // A purely numeric final label means an IPv4 literal, not a domain name.
    if (/^\d+$/.test(lastLabel))
        return '';
    return '.' + lastLabel;
}
|
|
410
|
+
// Two-label public suffixes under which names are registered at the third
// level. Without this list "news.bbc.co.uk" would resolve to the suffix
// "co.uk" rather than to the site "bbc.co.uk".
// Government-style suffixes ("gov.uk", "gc.ca", …) are deliberately absent:
// the domain lists in this file contain such suffixes as entries of their own,
// so hosts beneath them must keep resolving to the two-label form.
const COMPOUND_PUBLIC_SUFFIXES = new Set([
    'co.uk', 'org.uk', 'me.uk', 'ltd.uk', 'plc.uk',
    'com.au', 'net.au', 'org.au',
    'co.nz', 'org.nz',
    'co.jp', 'or.jp', 'ne.jp',
    'co.kr', 'co.in', 'co.za', 'co.il', 'co.id', 'co.th',
    'com.br', 'com.cn', 'com.mx', 'com.ar', 'com.tr',
    'com.sg', 'com.hk', 'com.tw', 'com.my',
]);
/** Number of trailing labels that make up the registrable domain (2 or 3). */
function registrableLabelCount(labels) {
    const underCompoundSuffix = labels.length >= 3
        && COMPOUND_PUBLIC_SUFFIXES.has(labels.slice(-2).join('.'));
    return underCompoundSuffix ? 3 : 2;
}
/**
 * Return the registrable domain of a hostname (e.g. "mail.google.com" →
 * "google.com", "news.bbc.co.uk" → "bbc.co.uk").
 *
 * A trailing root dot is ignored. Single-label hosts are returned as-is
 * (minus any trailing dot).
 *
 * @param hostname Hostname as produced by `URL#hostname`.
 */
function extractSLD(hostname) {
    const normalized = hostname.replace(/\.+$/, '');
    const labels = normalized.split('.');
    if (labels.length < 2)
        return normalized;
    return labels.slice(-registrableLabelCount(labels)).join('.');
}
/**
 * Count the subdomain labels in front of the registrable domain.
 * A leading "www." does not count: "www.example.com" → 0,
 * "a.b.example.com" → 2, "news.bbc.co.uk" → 1.
 *
 * @param hostname Hostname as produced by `URL#hostname`.
 */
function countSubdomains(hostname) {
    const labels = hostname
        .replace(/\.+$/, '')
        .replace(/^www\./, '')
        .split('.');
    return Math.max(0, labels.length - registrableLabelCount(labels));
}
|
|
423
|
+
// ---------------------------------------------------------------------------
|
|
424
|
+
// Main export
|
|
425
|
+
// ---------------------------------------------------------------------------
|
|
46
426
|
// Substrings typical of brand-impersonation / credential-phishing hostnames.
// Entries ending in "-" or "." are prefix-style (a brand or action word
// followed by a separator); entries starting with "-" are suffix-style.
const IMPERSONATION_KEYWORDS = [
    'paypal-', 'apple-', 'google-', 'microsoft-', 'amazon-',
    'bank-', 'login-', 'signin-', 'secure-', 'verify-', 'account-', 'update-',
    'support-', 'helpdesk-', '-login', '-signin', '-secure', '-verify', '-account',
    'paypal.', 'apple.', 'google.', 'microsoft.', 'amazon.',
];
/**
 * True when `hostname` contains an impersonation keyword at a word boundary.
 * Prefix-style keywords must start the hostname or follow "." or "-", so that
 * "pineapple.com" is not mistaken for an "apple." impersonation.
 */
function hasImpersonationKeyword(hostname) {
    return IMPERSONATION_KEYWORDS.some((keyword) => {
        // Suffix-style keywords already carry their own left boundary ("-").
        if (keyword.startsWith('-'))
            return hostname.includes(keyword);
        for (let at = hostname.indexOf(keyword); at !== -1; at = hostname.indexOf(keyword, at + 1)) {
            if (at === 0 || hostname[at - 1] === '.' || hostname[at - 1] === '-')
                return true;
        }
        return false;
    });
}
/** Result returned for inputs that cannot be assessed at all. */
function unassessableSource(reason) {
    return {
        tier: 'suspicious',
        score: 0,
        label: 'Invalid URL — Cannot Assess',
        signals: [],
        warnings: [reason],
    };
}
/**
 * Assess the credibility of a source URL.
 * Fully synchronous — no network calls.
 *
 * The score (0–100) is accumulated from:
 *   1. HTTPS (+10)
 *   2. TLD trust (`TLD_TRUST[tld]`, default 5)
 *   3. Official TLD / official domain → immediate 'official' result
 *   4. Domain structure (+15 / +10 / +5 / +0 for 0 / 1 / 2 / 3+ subdomains)
 *   5. Community platform (+15) or, failing that, established domain (+40)
 *   6. Caps: suspicious TLD (≤ 15), impersonation keywords (≤ 19)
 * and mapped to a tier: ≥90 official, ≥60 established, ≥40 community,
 * ≥20 new, otherwise suspicious.
 *
 * @param url Absolute URL to assess.
 * @returns `{ tier, score, label, signals, warnings }`. Unparseable URLs and
 *          URLs without a hostname (e.g. `javascript:`, `data:`, `mailto:`)
 *          yield tier 'suspicious' with score 0.
 */
export function getSourceCredibility(url) {
    const signals = [];
    const warnings = [];
    let score = 0;
    // ── Parse URL ─────────────────────────────────────────────────────────────
    let parsedUrl;
    try {
        parsedUrl = new URL(url);
    }
    catch {
        return unassessableSource('URL could not be parsed');
    }
    const protocol = parsedUrl.protocol; // 'https:' or 'http:'
    const rawHostname = parsedUrl.hostname.toLowerCase();
    const hostname = rawHostname.replace(/^www\./, '');
    // Host-less schemes parse successfully but have nothing to score; without
    // this guard they would be rated as an ordinary "new" website.
    if (hostname === '') {
        return unassessableSource('URL has no hostname');
    }
    const tld = extractTLD(hostname);
    const sld = extractSLD(hostname); // e.g. "google.com"
    const subdomainCount = countSubdomains(rawHostname);
    // ── 1. HTTPS check (0–10 pts) ─────────────────────────────────────────────
    if (protocol === 'https:') {
        score += 10;
        signals.push('HTTPS enforced');
    }
    else {
        warnings.push('HTTP only — no encryption');
    }
    // ── 2. TLD trust (0–20 pts) ───────────────────────────────────────────────
    const tldScore = TLD_TRUST[tld] ?? 5;
    score += tldScore;
    if (tldScore >= 18) {
        signals.push(`Trusted TLD (${tld})`);
    }
    else if (tldScore <= 3) {
        warnings.push(`High-risk TLD (${tld}) — commonly used in phishing`);
    }
    // ── 3. Official TLD shortcut ──────────────────────────────────────────────
    const hasOfficialTld = OFFICIAL_TLDS.has(tld);
    if (hasOfficialTld || OFFICIAL_DOMAINS.has(hostname) || OFFICIAL_DOMAINS.has(sld)) {
        const category = DOMAIN_CATEGORY[hostname] ?? DOMAIN_CATEGORY[sld] ?? 'Official Source';
        const officialSignals = [...signals, 'Official domain verified'];
        // Only claim a trusted TLD when the TLD itself is official; a TLD that
        // passed the step-2 threshold is already present in `signals`.
        const trustedTldSignal = `Trusted TLD (${tld})`;
        if (hasOfficialTld && !officialSignals.includes(trustedTldSignal)) {
            officialSignals.push(trustedTldSignal);
        }
        return {
            tier: 'official',
            // +40 +15: presumably the established-domain and clean-structure
            // bonuses that the skipped steps below would otherwise award.
            score: Math.min(100, score + 40 + 15),
            label: tld === '.gov' ? 'Official Government Source' :
                tld === '.edu' ? 'Official Educational Institution' :
                    tld === '.mil' ? 'Official Military Source' :
                        tld === '.int' ? 'International Organization' :
                            category,
            signals: officialSignals,
            warnings,
        };
    }
    // ── 4. Domain structure (0–15 pts) ────────────────────────────────────────
    if (subdomainCount === 0) {
        score += 15;
        signals.push('Clean domain structure');
    }
    else if (subdomainCount === 1) {
        score += 10;
        signals.push('Standard subdomain structure');
    }
    else if (subdomainCount === 2) {
        score += 5;
    }
    else {
        // 3+ subdomains — possible phishing pattern; no points awarded
        warnings.push(`Excessive subdomains (${subdomainCount}) — potential phishing indicator`);
    }
    // ── 5 & 6. Known domain + Community platform (mutually exclusive bonus) ──
    // A community platform gets the platform bonus (15 pts) but NOT the
    // established-domain bonus, even if it appears in both lists.
    const communityLabel = COMMUNITY_PLATFORMS.get(hostname) ?? COMMUNITY_PLATFORMS.get(sld);
    const isEstablished = ESTABLISHED_DOMAINS.has(hostname) || ESTABLISHED_DOMAINS.has(sld);
    if (communityLabel) {
        // Platform bonus only — user content hosted on verified infra
        score += 15;
        signals.push(`Hosted on verified platform (${sld})`);
    }
    else if (isEstablished) {
        // Full established domain bonus
        score += 40;
        signals.push('Recognized established domain');
    }
    // ── 7. Suspicious TLD ─────────────────────────────────────────────────────
    if (SUSPICIOUS_TLDS.has(tld)) {
        score = Math.min(score, 15); // Cap at suspicious tier
        warnings.push('Domain uses a free TLD associated with fraud');
    }
    // ── 8. Phishing keyword detection ─────────────────────────────────────────
    // Known domains are exempt, otherwise "paypal.com" would flag itself.
    const suspiciousPattern = !isEstablished && !communityLabel && hasImpersonationKeyword(hostname);
    if (suspiciousPattern) {
        score = Math.min(score, 19);
        warnings.push('Domain contains impersonation keywords — potential phishing');
    }
    // ── Clamp score ───────────────────────────────────────────────────────────
    score = Math.max(0, Math.min(100, score));
    // ── Tier assignment ───────────────────────────────────────────────────────
    let tier;
    if (score >= 90)
        tier = 'official';
    else if (score >= 60)
        tier = 'established';
    else if (score >= 40)
        tier = 'community';
    else if (score >= 20)
        tier = 'new';
    else
        tier = 'suspicious';
    // ── Label generation ──────────────────────────────────────────────────────
    let label;
    if (communityLabel) {
        label = communityLabel;
    }
    else if (isEstablished) {
        label = DOMAIN_CATEGORY[hostname] ?? DOMAIN_CATEGORY[sld] ?? labelFromTier(tier, hostname, tld);
    }
    else {
        label = labelFromTier(tier, hostname, tld);
    }
    return { tier, score, label, signals, warnings };
}
|
|
561
|
+
// ---------------------------------------------------------------------------
|
|
562
|
+
// Generate a useful fallback label based on tier + domain context
|
|
563
|
+
// ---------------------------------------------------------------------------
|
|
564
|
+
/**
 * Produce the generic fallback label for a credibility tier.
 *
 * @param tier      One of 'official' | 'established' | 'community' | 'new' | 'suspicious'.
 * @param _hostname Unused; kept so the call signature stays stable.
 * @param tld       TLD with leading dot (e.g. ".org"); refines the
 *                  'established' and 'suspicious' labels.
 * @returns A human-readable label; an "Unknown Domain" label for any other tier.
 */
function labelFromTier(tier, _hostname, tld) {
    if (tier === 'official') {
        return 'Official Source';
    }
    if (tier === 'established') {
        if (tld === '.org')
            return 'Established Organization';
        if (tld === '.net')
            return 'Established Network Service';
        if (tld === '.io')
            return 'Established Tech Service';
        return 'Established Website';
    }
    if (tier === 'community') {
        return 'Community or Independent Website';
    }
    if (tier === 'new') {
        return 'Small or Recently Established Website';
    }
    if (tier === 'suspicious') {
        const onFreeTld = SUSPICIOUS_TLDS.has(tld);
        if (onFreeTld) {
            return `Free Domain TLD (${tld}) — Exercise Caution`;
        }
        return 'Unrecognized Domain — Exercise Caution';
    }
    return 'Unknown Domain — Limited Verification Available';
}
|
package/dist/index.d.ts
CHANGED
|
@@ -42,6 +42,9 @@ export type SearchFallbackResult = {
|
|
|
42
42
|
};
|
|
43
43
|
export declare function searchFallback(..._args: any[]): Promise<SearchFallbackResult | null>;
|
|
44
44
|
export { peelTLSFetch, isPeelTLSAvailable, shutdownPeelTLS, type PeelTLSOptions, type PeelTLSResult } from './core/peel-tls.js';
|
|
45
|
+
export { sanitizeForLLM, type SanitizeResult } from './core/prompt-guard.js';
|
|
46
|
+
export { getSourceCredibility, type SourceCredibility } from './core/source-credibility.js';
|
|
47
|
+
export { checkUrlSafety, type SafeBrowsingResult } from './core/safe-browsing.js';
|
|
45
48
|
/**
|
|
46
49
|
* Fetch and extract content from a URL
|
|
47
50
|
*
|
package/dist/index.js
CHANGED
|
@@ -5,6 +5,7 @@
|
|
|
5
5
|
*/
|
|
6
6
|
import { cleanup, warmup, closePool, scrollAndWait, closeProfileBrowser } from './core/fetcher.js';
|
|
7
7
|
import { createContext, normalizeOptions, handleYouTube, fetchContent, detectContentType, parseContent, postProcess, finalize, buildResult, } from './core/pipeline.js';
|
|
8
|
+
import { checkUrlSafety } from './core/safe-browsing.js';
|
|
8
9
|
export * from './types.js';
|
|
9
10
|
export { getDomainExtractor, extractDomainData } from './core/domain-extractors.js';
|
|
10
11
|
export { crawl } from './core/crawler.js';
|
|
@@ -47,6 +48,9 @@ export async function searchFallback(..._args) {
|
|
|
47
48
|
}
|
|
48
49
|
}
|
|
49
50
|
export { peelTLSFetch, isPeelTLSAvailable, shutdownPeelTLS } from './core/peel-tls.js';
|
|
51
|
+
export { sanitizeForLLM } from './core/prompt-guard.js';
|
|
52
|
+
export { getSourceCredibility } from './core/source-credibility.js';
|
|
53
|
+
export { checkUrlSafety } from './core/safe-browsing.js';
|
|
50
54
|
/**
|
|
51
55
|
* Fetch and extract content from a URL
|
|
52
56
|
*
|
|
@@ -66,16 +70,34 @@ export { peelTLSFetch, isPeelTLSAvailable, shutdownPeelTLS } from './core/peel-t
|
|
|
66
70
|
export async function peel(url, options = {}) {
|
|
67
71
|
const ctx = createContext(url, options);
|
|
68
72
|
normalizeOptions(ctx);
|
|
73
|
+
// Safe Browsing check — runs before any HTTP request, non-blocking
|
|
74
|
+
const sbResult = await checkUrlSafety(url, process.env.SAFE_BROWSING_API_KEY);
|
|
75
|
+
ctx.safeBrowsingResult = sbResult;
|
|
76
|
+
if (!sbResult.safe) {
|
|
77
|
+
const threatList = sbResult.threats.join(', ');
|
|
78
|
+
ctx.warnings.push(`⚠️ URL flagged by Safe Browsing: ${threatList}`);
|
|
79
|
+
}
|
|
69
80
|
const ytResult = await handleYouTube(ctx);
|
|
70
|
-
if (ytResult)
|
|
71
|
-
|
|
81
|
+
if (ytResult) {
|
|
82
|
+
// Attach safe browsing to YouTube results too
|
|
83
|
+
return {
|
|
84
|
+
...ytResult,
|
|
85
|
+
safeBrowsing: sbResult,
|
|
86
|
+
...(ytResult.warnings || ctx.warnings.length > 0
|
|
87
|
+
? { warnings: [...(ytResult.warnings ?? []), ...ctx.warnings.filter(w => !ytResult.warnings?.includes(w))] }
|
|
88
|
+
: {}),
|
|
89
|
+
};
|
|
90
|
+
}
|
|
72
91
|
try {
|
|
73
92
|
await fetchContent(ctx);
|
|
74
93
|
detectContentType(ctx);
|
|
75
94
|
await parseContent(ctx);
|
|
76
95
|
await postProcess(ctx);
|
|
77
96
|
await finalize(ctx);
|
|
78
|
-
|
|
97
|
+
const result = buildResult(ctx);
|
|
98
|
+
// Attach safe browsing result
|
|
99
|
+
result.safeBrowsing = sbResult;
|
|
100
|
+
return result;
|
|
79
101
|
}
|
|
80
102
|
catch (error) {
|
|
81
103
|
// Clean up browser resources on error
|
|
@@ -212,16 +212,16 @@ export function createSearchRouter(authStore) {
|
|
|
212
212
|
}
|
|
213
213
|
}
|
|
214
214
|
// Add credibility scores and sort by trustworthiness
|
|
215
|
-
const tierOrder = { official: 0,
|
|
215
|
+
const tierOrder = { official: 0, established: 1, community: 2, new: 3, suspicious: 4 };
|
|
216
216
|
results = results
|
|
217
217
|
.map(r => {
|
|
218
218
|
const cred = getSourceCredibility(r.url);
|
|
219
219
|
return { ...r, credibility: cred };
|
|
220
220
|
})
|
|
221
221
|
.sort((a, b) => {
|
|
222
|
-
const aTier = tierOrder[a.credibility?.tier || '
|
|
223
|
-
const bTier = tierOrder[b.credibility?.tier || '
|
|
224
|
-
return aTier - bTier; // Official first, then
|
|
222
|
+
const aTier = tierOrder[a.credibility?.tier || 'new'] ?? 3;
|
|
223
|
+
const bTier = tierOrder[b.credibility?.tier || 'new'] ?? 3;
|
|
224
|
+
return aTier - bTier; // Official first, then established, community, new, suspicious
|
|
225
225
|
})
|
|
226
226
|
.map((r, i) => ({ ...r, rank: i + 1 }));
|
|
227
227
|
data.web = results;
|
package/dist/types.d.ts
CHANGED
|
@@ -339,6 +339,28 @@ export interface PeelResult {
|
|
|
339
339
|
rawTokenEstimate?: number;
|
|
340
340
|
/** Token savings percentage compared to raw HTML (how much cheaper WebPeel is) */
|
|
341
341
|
tokenSavingsPercent?: number;
|
|
342
|
+
/** Trust & safety assessment of the fetched content */
|
|
343
|
+
trust?: {
|
|
344
|
+
/** Source credibility tier */
|
|
345
|
+
source: {
|
|
346
|
+
tier: 'official' | 'established' | 'community' | 'new' | 'suspicious';
|
|
347
|
+
score: number;
|
|
348
|
+
label: string;
|
|
349
|
+
signals?: string[];
|
|
350
|
+
warnings?: string[];
|
|
351
|
+
};
|
|
352
|
+
/** Prompt injection scan result */
|
|
353
|
+
contentSafety: {
|
|
354
|
+
clean: boolean;
|
|
355
|
+
injectionDetected: boolean;
|
|
356
|
+
detectedPatterns: string[];
|
|
357
|
+
strippedCount: number;
|
|
358
|
+
};
|
|
359
|
+
/** Overall trust score 0-1 (composite of source + content safety) */
|
|
360
|
+
score: number;
|
|
361
|
+
/** Human-readable safety warnings */
|
|
362
|
+
warnings: string[];
|
|
363
|
+
};
|
|
342
364
|
/** Content chunks (when chunk option is enabled) */
|
|
343
365
|
chunks?: Array<{
|
|
344
366
|
index: number;
|
|
@@ -350,6 +372,12 @@ export interface PeelResult {
|
|
|
350
372
|
startOffset: number;
|
|
351
373
|
endOffset: number;
|
|
352
374
|
}>;
|
|
375
|
+
/** Safe Browsing check result */
|
|
376
|
+
safeBrowsing?: {
|
|
377
|
+
safe: boolean;
|
|
378
|
+
threats: string[];
|
|
379
|
+
source: 'google-api' | 'local-blocklist' | 'unchecked';
|
|
380
|
+
};
|
|
353
381
|
}
|
|
354
382
|
export interface PageMetadata {
|
|
355
383
|
/** Meta description */
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "webpeel",
|
|
3
|
-
"version": "0.21.
|
|
3
|
+
"version": "0.21.60",
|
|
4
4
|
"description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
|
|
5
5
|
"author": "Jake Liu",
|
|
6
6
|
"license": "AGPL-3.0-only",
|