@q32/signal-scanner 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/index.ts ADDED
@@ -0,0 +1,1366 @@
1
+ import { Parser } from "htmlparser2";
2
+ import { binaryRules, binaryStringRules, cssRules, decodedArtifactRules, htmlRules, htmlTechnologyRules, scriptCompositeRules, scriptRiskRules, sourceCodeRules, urlRules } from "./rules/packs";
3
+ import type { RuleDefinition, RuleScoreModel, ScoreTag } from "./rules/types";
4
+
5
/** Coarse classification of the scanned content, as returned by `detectContentKind`. */
export type ContentKind =
  | "html"
  | "javascript"
  | "css"
  | "json"
  | "svg"
  | "text"
  | "unknown"
  | "archive"
  | "executable";

/** Severity level carried by a `Finding`. */
export type Severity = "info" | "low" | "medium" | "high" | "critical";

/** Confidence level carried by a `Finding`. */
export type Confidence = "low" | "medium" | "high";

/** Overall verdict reported in `ScannerReport.disposition`. */
export type Disposition = "allow" | "warn" | "review" | "block";
9
+
10
/**
 * Metadata describing one fetched resource.
 *
 * NOTE(review): nothing in the visible part of this module reads or writes this
 * type; it is presumably consumed by callers — confirm before changing it.
 */
export interface FetchRecord {
  /** URL that was requested. */
  url: string;
  /** URL after redirects — presumably the last entry reached; confirm with the producer. */
  finalUrl: string;
  /** Response status code. */
  status: number;
  /** Declared content type of the response. */
  contentType: string;
  /** Declared content length, or `null` when not available. */
  contentLength: number | null;
  /** URLs visited while following redirects. */
  redirectChain: string[];
  /** Optional TLS details for the connection. */
  tls?: {
    protocol?: string;
    issuer?: string;
    subject?: string;
    validFrom?: string;
    validTo?: string;
  };
}
25
+
26
/**
 * Caller-supplied context about where the scanned bytes came from.
 * Every field is optional; `createScanner` falls back to an empty object.
 */
export interface ScannerSource {
  /** Requested URL; registered as an extracted URL and used as a filename fallback for content detection. */
  url?: string;
  /** Post-redirect URL; registered as an extracted URL when it differs from `url`. */
  finalUrl?: string;
  /** File name used for extension-based content detection and as a finding location. */
  filename?: string;
  /** Declared content type, if any. */
  contentType?: string | null;
  /** NOTE(review): not read in the visible part of this module. */
  originUrl?: string;
  /** TLS details of the connection the content arrived over. */
  tls?: TlsMetadata;
}
34
+
35
/**
 * TLS certificate/connection details attached to a `ScannerSource`.
 * All fields are optional; how they are evaluated lives outside the visible
 * part of this module (see `scanTlsContext`).
 */
export interface TlsMetadata {
  authorized?: boolean;
  authorizationError?: string | null;
  issuer?: string;
  subject?: string;
  validFrom?: string;
  validTo?: string;
  fingerprint256?: string;
  serialNumber?: string;
}
45
+
46
/** A piece of text recovered by decoding an encoded span of the input. */
export interface ArtifactRecord {
  /** Where the parent content came from (filename, URL, or "stream"). */
  source: string;
  /** Kind of encoding the artifact was decoded from (e.g. "base64_decoded_string"). */
  artifactType: string;
  /** Offset of the encoded span: scan offset of the parent text plus the match index. */
  parentOffset: number;
  /** Decode nesting level; artifacts decoded from the raw input have depth 1. */
  depth: number;
  /** Optional digest. NOTE(review): never populated in the visible part of this module. */
  sha256?: string;
  /** Decoded text (the decoder stores at most the first 4096 characters). */
  text: string;
}
54
+
55
/** One rule hit reported by the scanner. */
export interface Finding {
  id: string;
  severity: Severity;
  confidence: Confidence;
  score: number;
  /** Scoring parameters of the rule; `scoreFindings` reads this rather than `score`. */
  scoreModel: RuleScoreModel;
  title: string;
  description: string;
  /** Category of the place the finding refers to. */
  locationType:
    | "url"
    | "html"
    | "javascript"
    | "css"
    | "source"
    | "binary"
    | "decoded_artifact"
    | "aggregate";
  /** The location itself (a URL, a filename, or a fallback label such as "script"). */
  locationValue: string;
  /** Identifier of the rule that fired; findings are grouped by this when scoring. */
  ruleId: string;
  /** Rule-specific extra data. */
  metadata: Record<string, unknown>;
}
68
+
69
/** A URL found in (or supplied with) the scanned content, as produced by `normalizeUrl`. */
export interface ExtractedUrl {
  /** The URL text exactly as it was given. */
  raw: string;
  /** Absolute URL string with the fragment removed. */
  normalized: string;
  /** Registrable domain of the host, or `null` when none could be derived. */
  registrableDomain: string | null;
  /** Relationship of this URL's host to the base URL's host. */
  relation: "same-origin" | "same-site" | "subdomain" | "off-site" | "unknown";
  /** URL scheme without the trailing colon. */
  scheme: string;
  /** Broad class of the destination. */
  destinationType: "http" | "https" | "ip" | "private" | "localhost" | "url-shortener" | "other";
  /** Heuristic markers such as "punycode", "ip_literal" or "suspicious_tld". */
  flags: string[];
}
78
+
79
/** Final result returned by `Scanner.finish()`. */
export interface ScannerReport {
  /** Content classification in effect when the scan finished. */
  contentKind: ContentKind;
  /** Findings after de-duplication. */
  findings: Finding[];
  /** Every distinct URL collected during the scan. */
  urls: ExtractedUrl[];
  /** Text artifacts recovered by decoding encoded spans. */
  artifacts: ArtifactRecord[];
  /** Aggregate score; `scoreFindings` clamps it to the range 0–100. */
  score: number;
  /** Verdict derived from `score`. */
  disposition: Disposition;
  /** Snapshot of the scanner's named counters (e.g. `bytes_seen`). */
  counters: Record<string, number>;
}
88
+
89
/** Streaming scanner handle returned by `createScanner`. */
export interface Scanner {
  /** Scans the next chunk of bytes and returns the findings added while processing it. */
  feed(chunk: Uint8Array): Finding[];
  /** Runs the aggregate rules and produces the final report. */
  finish(): ScannerReport;
}
93
+
94
/** Options accepted by `createScanner`; every field has a default. */
interface ScannerOptions {
  /** Provenance of the content; defaults to an empty object. */
  source?: ScannerSource;
  /** Size limit for the rolling text window; defaults to `DEFAULT_WINDOW_CHARS`. */
  maxWindowChars?: number;
  /** Limit handed to the artifact decoder; defaults to `DEFAULT_MAX_DECODED_BYTES`. */
  maxDecodedBytes?: number;
  /** Maximum decode nesting; defaults to `DEFAULT_MAX_DECODE_DEPTH`. */
  maxDecodeDepth?: number;
}
100
+
101
/** Mutable state shared by all scan passes of a single scanner instance. */
interface ScannerState {
  /** Provenance supplied by the caller. */
  source: ScannerSource;
  /** Detected from metadata at creation, then re-detected using the first chunk's bytes. */
  contentKind: ContentKind;
  /** Rolling window of decoded text, bounded by the configured window size. */
  textWindow: string;
  /** Tail of the previous scan input, prepended to the next chunk so matches can span chunks. */
  scanCarry: string;
  /** Total number of bytes fed so far. */
  absoluteOffset: number;
  /** Current position; starts at 1 and is maintained by `updatePosition`. */
  line: number;
  column: number;
  /** Findings in the order they were added. */
  findings: Finding[];
  /** Presumably the keys used to suppress duplicate findings — confirm in `addRuleFinding`. */
  findingKeys: Set<string>;
  /** Collected URLs. */
  urls: Map<string, ExtractedUrl>;
  /** Decoded artifacts. */
  artifacts: ArtifactRecord[];
  /** Named counters incremented by the scan passes. */
  counters: Record<string, number>;
  /** One entry per `<form>` open tag seen. */
  forms: FormState[];
  /** Off-site script sources that are not ad/analytics hosts. */
  externalScripts: ExtractedUrl[];
  /** True while the HTML pass believes it is inside a `<script>` element. */
  inScript: boolean;
  /** NOTE(review): initialised to "" but not otherwise used in the visible code. */
  currentScript: string;
  /** Ensures the binary header check runs once. */
  binaryHeaderScanned: boolean;
}
120
+
121
/** What the HTML pass has learned about one `<form>` element. */
interface FormState {
  /** Raw `action` attribute, or `null` when absent. */
  action: string | null;
  /** Lower-cased `method` attribute; the HTML pass stores "get" when it is absent. */
  method: string | null;
  /** Set once a password-like input is attributed to this form. */
  hasPassword: boolean;
  /** Set once a payment-card-like input is attributed to this form. */
  hasPayment: boolean;
  /** True when the form's inline style hides it (display/visibility/opacity). */
  hiddenTarget: boolean;
}
128
+
129
/** Default size limit (in characters) for the rolling text window. */
const DEFAULT_WINDOW_CHARS = 64 * 1024;

/** Number of characters of the previous scan input carried into the next chunk. */
const DEFAULT_CARRY_CHARS = 4 * 1024;

/** Default limit handed to the artifact decoder. */
const DEFAULT_MAX_DECODED_BYTES = 128 * 1024;

/** Default maximum nesting of decode passes. */
const DEFAULT_MAX_DECODE_DEPTH = 2;
133
+
134
/**
 * Creates a streaming scanner.
 *
 * The source URLs are registered and the redirect/TLS context is evaluated
 * immediately. Callers then push bytes with `feed()` and call `finish()` for
 * the aggregate report.
 *
 * @param options Optional provenance and size/depth limits.
 * @returns A `Scanner` bound to its own private state.
 */
export function createScanner(options: ScannerOptions = {}): Scanner {
  const source = options.source ?? {};
  const windowLimit = options.maxWindowChars ?? DEFAULT_WINDOW_CHARS;
  const decodedByteLimit = options.maxDecodedBytes ?? DEFAULT_MAX_DECODED_BYTES;
  const decodeDepthLimit = options.maxDecodeDepth ?? DEFAULT_MAX_DECODE_DEPTH;

  const state: ScannerState = {
    source,
    // Metadata-only guess; refined once the first bytes arrive.
    contentKind: detectContentKind({
      contentType: source.contentType ?? null,
      filename: source.filename ?? source.url,
      firstBytes: new Uint8Array()
    }),
    textWindow: "",
    scanCarry: "",
    absoluteOffset: 0,
    line: 1,
    column: 1,
    findings: [],
    findingKeys: new Set(),
    urls: new Map(),
    artifacts: [],
    counters: {},
    forms: [],
    externalScripts: [],
    inScript: false,
    currentScript: "",
    binaryHeaderScanned: false
  };

  const { url: requestedUrl, finalUrl } = state.source;
  if (requestedUrl) addUrl(state, requestedUrl);
  if (finalUrl && finalUrl !== requestedUrl) addUrl(state, finalUrl);
  scanRedirectContext(state);
  scanTlsContext(state);

  /** Scans one chunk and returns only the findings added during this call. */
  function feed(chunk: Uint8Array): Finding[] {
    if (chunk.byteLength === 0) return [];

    if (state.absoluteOffset === 0) {
      // First bytes: re-detect the content kind with magic bytes available.
      state.contentKind = detectContentKind({
        contentType: state.source.contentType ?? null,
        filename: state.source.filename ?? state.source.url,
        firstBytes: chunk
      });
      scanBinaryHeader(state, chunk);
    }

    const firstNewFinding = state.findings.length;
    const decoded = decodeText(chunk);
    // Prepend the carried tail so patterns spanning a chunk boundary still match.
    const overlapped = state.scanCarry + decoded;
    const overlappedStart = state.absoluteOffset - byteLength(state.scanCarry);

    state.textWindow = trimWindow(state.textWindow + decoded, windowLimit);
    scanText(state, overlapped, overlappedStart, 0, decodedByteLimit, decodeDepthLimit);
    updatePosition(state, decoded);

    state.scanCarry = trimWindow(overlapped, DEFAULT_CARRY_CHARS);
    state.absoluteOffset += chunk.byteLength;
    state.counters.bytes_seen = state.absoluteOffset;
    return state.findings.slice(firstNewFinding);
  }

  /** Applies the aggregate rules and assembles the report. */
  function finish(): ScannerReport {
    finalizeAggregateRules(state);
    // NOTE(review): the score is computed from the raw finding list, while the
    // report carries the de-duplicated list — confirm this is intentional.
    const score = scoreFindings(state.findings);
    return {
      contentKind: state.contentKind,
      findings: dedupeFindings(state.findings),
      urls: [...state.urls.values()],
      artifacts: state.artifacts,
      score,
      disposition: dispositionForScore(score),
      counters: { ...state.counters }
    };
  }

  return { feed, finish };
}
203
+
204
/**
 * True when `bytes` starts with one of the ZIP record signatures:
 * local file header (PK\x03\x04), end of central directory of an empty
 * archive (PK\x05\x06), or spanned-archive marker (PK\x07\x08).
 */
function startsWithZipSignature(bytes: Uint8Array): boolean {
  if (bytes.length < 4 || bytes[0] !== 0x50 || bytes[1] !== 0x4b) return false;
  const third = bytes[2];
  const fourth = bytes[3];
  return (
    (third === 0x03 && fourth === 0x04) ||
    (third === 0x05 && fourth === 0x06) ||
    (third === 0x07 && fourth === 0x08)
  );
}

/**
 * Classifies content from whatever hints are available.
 *
 * Precedence: ELF magic bytes, then the declared content type, then the file
 * extension, then archive magic bytes, and finally sniffing the first 512
 * bytes as text.
 *
 * @param input.contentType Declared media type; parameters after `;` are ignored.
 * @param input.filename File name or URL; anything from the first `?` or `#` on is ignored.
 * @param input.firstBytes Leading bytes of the content, if already available.
 * @returns The detected kind; "unknown" when there is no hint and no text to sniff.
 */
export function detectContentKind(input: {
  contentType?: string | null;
  filename?: string | null;
  firstBytes?: Uint8Array;
}): ContentKind {
  const first = input.firstBytes ?? new Uint8Array();
  if (hasElfMagic(first)) return "executable";

  const contentType = (input.contentType ?? "").toLowerCase().split(";")[0].trim();
  if (contentType.includes("html")) return "html";
  if (contentType.includes("javascript") || contentType.includes("ecmascript")) return "javascript";
  if (contentType === "text/css") return "css";
  if (contentType.includes("json")) return "json";
  if (contentType.includes("svg")) return "svg";
  if (contentType.startsWith("text/")) return "text";
  if (contentType.includes("zip") || contentType.includes("tar") || contentType.includes("gzip") || contentType.includes("x-7z") || contentType.includes("rar")) return "archive";

  // The caller may pass a URL here, so drop the fragment as well as the query;
  // otherwise "page.html#top" would not match the extension checks below.
  const filename = (input.filename ?? "").toLowerCase().split(/[?#]/)[0];
  if (/\.(html?|xhtml)$/.test(filename)) return "html";
  if (/\.(mjs|cjs|js|jsx|ts|tsx)$/.test(filename)) return "javascript";
  if (/\.css$/.test(filename)) return "css";
  if (/\.json$/.test(filename)) return "json";
  if (/\.svg$/.test(filename)) return "svg";
  if (/\.(zip|jar|war|tar|tgz|gz|7z|rar)$/.test(filename)) return "archive";

  // Magic bytes: ZIP (full 4-byte signature, so plain text that merely starts
  // with "PK" is not mistaken for an archive), gzip, 7z.
  if (startsWithZipSignature(first)) return "archive";
  if (first.length >= 2 && first[0] === 0x1f && first[1] === 0x8b) return "archive";
  if (first.length >= 6 && first[0] === 0x37 && first[1] === 0x7a && first[2] === 0xbc && first[3] === 0xaf && first[4] === 0x27 && first[5] === 0x1c) return "archive";

  const text = decodeText(first.slice(0, 512)).trimStart();
  if (/^<!doctype html/i.test(text) || /^<html[\s>]/i.test(text)) return "html";
  if (/^<svg[\s>]/i.test(text)) return "svg";
  if (/^\s*(?:import|export|const|let|var|function)\b/.test(text)) return "javascript";
  if (/^\s*(?:@import|[.#]?[a-z0-9_-]+\s*\{[^}]+:)/i.test(text)) return "css";
  if (/^[\[{]/.test(text)) return "json";
  return text ? "text" : "unknown";
}
240
+
241
/**
 * Parses `raw` (optionally relative to `base`) and annotates it with heuristic flags.
 *
 * @param raw URL text as found in the content.
 * @param base Base URL used to resolve relative references and to compute `relation`.
 * @returns The annotated URL, or `null` when parsing (or any helper) throws.
 */
export function normalizeUrl(raw: string, base?: string): ExtractedUrl | null {
  try {
    const parsed = new URL(raw, base);
    parsed.hash = "";
    const normalized = parsed.toString();
    const host = parsed.hostname.toLowerCase();
    const path = parsed.pathname;
    const registrableDomain = registrableDomainFor(host);
    const baseHost = base ? new URL(base).hostname.toLowerCase() : "";
    const baseDomain = baseHost ? registrableDomainFor(baseHost) : null;

    // Each entry pairs a check result with the flag it raises; flag order
    // follows the order of this table.
    const checks: Array<[boolean, string]> = [
      [host.startsWith("xn--") || host.includes(".xn--"), "punycode"],
      [isIpLiteral(host), "ip_literal"],
      [isPrivateHost(host), "private_or_localhost"],
      [isUrlShortener(host), "url_shortener"],
      // Credential/account/banking lure terms in the path (multilingual plus a
      // few leetspeak spellings). On its own this flag is only a hint; it is
      // meant to matter when the host is suspicious too.
      [
        /(?:log[i1]n|sign[\s_-]?[i1]n|signon|account|verify|verif|wallet|checkout|payment|download|payload|secure|update|confirm|recover|unlock|billing|webscr|kunden|compte|cliente?s|conta|codigo|banking)/i.test(path),
        "suspicious_path_terms"
      ],
      [isSuspiciousTld(host), "suspicious_tld"],
      [
        /(?:\/|^)(?:payload|installer|setup|invoice|verify|wallet|checkout|payment)(?:[\/_.-]|$)|\.(?:exe|scr|msi|dmg|pkg|apk|zip)$/i.test(path),
        "download_like_path"
      ],
      [isMalwareDownloadLikePath(path), "malware_download_like_path"],
      [isSharedHostingSubdomain(host, registrableDomain), "shared_hosting_subdomain"],
      [isGeneratedHostLabel(host, registrableDomain), "generated_host_label"]
    ];
    const flags = checks.filter(([raised]) => raised).map(([, flag]) => flag);

    return {
      raw,
      normalized,
      registrableDomain,
      relation: relationFor(host, registrableDomain, baseHost, baseDomain),
      scheme: parsed.protocol.replace(":", ""),
      destinationType: destinationTypeFor(parsed, host),
      flags
    };
  } catch {
    return null;
  }
}
279
+
280
/**
 * Runs every applicable scan pass over one piece of text.
 *
 * URL collection and page-intent signals always run. The HTML, JavaScript, CSS,
 * binary-string and source passes run when the content kind or a cheap textual
 * hint says they apply. Encoded spans are decoded and rescanned while `depth`
 * is below `maxDecodeDepth`.
 *
 * @param offset Offset of `text` within the stream (used for artifact offsets).
 * @param depth Decode nesting level of `text`; 0 for raw input.
 */
function scanText(
  state: ScannerState,
  text: string,
  offset: number,
  depth: number,
  maxDecodedBytes: number,
  maxDecodeDepth: number
): void {
  collectUrls(state, text);
  scanPageIntentSignals(state, text);

  const htmlApplies = state.contentKind === "html" || /<html|<script|<form|<iframe/i.test(text);
  if (htmlApplies) scanHtml(state, text);

  // Evaluated after the HTML pass on purpose: that pass may toggle `inScript`.
  const scriptApplies = state.contentKind === "javascript" || state.inScript || /<script\b/i.test(text);
  if (scriptApplies) scanJavaScript(state, text);

  const cssApplies = state.contentKind === "css" || /(?:display\s*:\s*none|opacity\s*:\s*0|@import|url\()/i.test(text);
  if (cssApplies) scanCss(state, text);

  const binaryApplies = state.contentKind === "executable" || likelyBinaryStrings(text);
  if (binaryApplies) scanBinaryStrings(state, text);

  if (shouldScanSourceText(state)) scanSourceText(state, text);

  if (depth >= maxDecodeDepth) return;
  decodeAndRescan(state, text, offset, depth, maxDecodedBytes, maxDecodeDepth);
}
299
+
300
/**
 * HTML pass: tokenizes one window with htmlparser2, dispatches every open tag
 * to `handleOpenTag`, scans inline script bodies as JavaScript, and records
 * technology / brand / login-language signals for the window.
 *
 * Script-mode tracking: `state.inScript` is left `true` when the window ends
 * inside a `<script>` whose closing tag has not arrived yet, so the next chunk
 * keeps being scanned as JavaScript. It is cleared when the element is closed.
 */
function scanHtml(state: ScannerState, text: string): void {
  // Tokenize with htmlparser2 rather than hand-rolled regexes: it copes with
  // malformed markup, entity-encoded attribute values (e.g.
  // href="java&#115;cript:…"), quoting tricks, and oddly split tags. The
  // scanner streams in overlapping windows, so each window is parsed in one
  // pass; counts may be inflated by the carry overlap.
  const wasMidScript = state.inScript;
  let scriptBody = "";
  let scriptDepth = 0;
  let sawScriptOpen = false;
  // Set just before parser.end(). htmlparser2 reports every still-open element
  // through onclosetag when the input ends, so a "close" seen while finishing
  // means the real </script> lies beyond this window.
  let finishing = false;
  const parser = new Parser(
    {
      onopentag(name, attribs) {
        const attrs = new Map<string, string>();
        for (const key of Object.keys(attribs)) attrs.set(key.toLowerCase(), attribs[key]);
        if (name === "script") {
          sawScriptOpen = true;
          scriptDepth += 1;
          scriptBody = "";
        }
        handleOpenTag(state, name, attrs);
      },
      ontext(chunk) {
        if (scriptDepth > 0) scriptBody += chunk;
      },
      onclosetag(name) {
        if (name !== "script" || scriptDepth === 0) return;
        scriptDepth -= 1;
        // Explicit </script>: leave script mode. Synthesized close at end of
        // input: stay in script mode for the next chunk.
        state.inScript = finishing;
        // Either way, scan what was captured of the script body.
        if (scriptBody) scanJavaScript(state, scriptBody);
        scriptBody = "";
      }
    },
    { decodeEntities: true, lowerCaseTags: true, lowerCaseAttributeNames: true }
  );
  parser.write(text);
  finishing = true;
  parser.end();

  // Continuation of a script opened in an earlier window: the parser never saw
  // the opening tag, so it does not report the closing tag either. When the
  // closing tag shows up, scan the script's tail and leave script mode.
  if (wasMidScript && !sawScriptOpen) {
    const closing = /<\/script\s*>/i.exec(text);
    if (closing) {
      const tail = text.slice(0, closing.index);
      if (tail) scanJavaScript(state, tail);
      state.inScript = false;
    }
  }

  if (/wp-content|wp-includes/i.test(text)) {
    addRuleFinding(state, htmlTechnologyRules.wordpress_surface_reference, pageUrl(state) ?? "html", {});
  }
  scanTechnologyFingerprint(state, text, pageUrl(state) ?? "html");
  if (/(?:login|sign in|password|account|verify|checkout|payment)/i.test(text)) increment(state, "brand_login_or_payment_language");
  recordContentBrandMentions(state, text);
}
351
+
352
/**
 * Per-tag dispatch, called from the htmlparser2 open-tag callback.
 *
 * Updates counters, collected URLs, form state and findings according to the
 * tag that was opened.
 *
 * @param name Tag name, already lower-cased.
 * @param attrs Attributes with lower-cased keys and entity-decoded values.
 */
function handleOpenTag(state: ScannerState, name: string, attrs: Map<string, string>): void {
  if (name === "script") {
    const src = attrs.get("src");
    if (src) {
      increment(state, "html.script_src");
      addUrl(state, src);
      const normalized = normalizeUrl(src, pageUrl(state));
      // Ad/analytics/tag-manager scripts are expected on ordinary ad-funded
      // sites (news, blogs), so they don't count toward "suspicious external
      // scripts".
      if (normalized?.relation === "off-site" && !isAdOrAnalyticsHost(normalized.normalized)) state.externalScripts.push(normalized);
      if (pageUrl(state)?.startsWith("https://") && normalized?.scheme === "http") addRuleFinding(state, htmlRules.mixed_content_script, normalized.normalized, {});
      scanTechnologyFingerprint(state, src, normalized?.normalized ?? src);
    } else {
      increment(state, "inline_script");
    }
    state.inScript = true;
  }
  if (name === "form") {
    increment(state, "html.form");
    state.forms.push({
      action: attrs.get("action") ?? null,
      method: attrs.get("method")?.toLowerCase() ?? "get",
      hasPassword: false,
      hasPayment: false,
      hiddenTarget: /display\s*:\s*none|visibility\s*:\s*hidden|opacity\s*:\s*0/i.test(attrs.get("style") ?? "")
    });
  }
  if (name === "input") {
    const type = (attrs.get("type") ?? "").toLowerCase();
    const field = `${attrs.get("name") ?? ""} ${attrs.get("autocomplete") ?? ""}`.toLowerCase();
    const isPassword = type === "password" || field.includes("password");
    // A password field anywhere on the page is credential capture, even when
    // it isn't wrapped in a <form> — PIN/OTP grids and JS-submit kits routinely
    // place inputs outside any form and exfiltrate via fetch.
    if (isPassword) increment(state, "page_password_input");
    if (state.forms.length) {
      increment(state, "html.input");
      // Inputs are attributed to the most recently opened form.
      const form = state.forms[state.forms.length - 1];
      if (isPassword) form.hasPassword = true;
      if (/(?:cc-|card|cvv|cvc|expiry|payment)/.test(`${type} ${field}`)) form.hasPayment = true;
    }
  }
  if (["a", "link", "img", "iframe"].includes(name)) {
    increment(state, `html.${name}`);
    const src = attrs.get("href") ?? attrs.get("src");
    if (src) addUrl(state, src);
    if (name === "iframe" && src && hiddenAttrs(attrs)) {
      const normalized = normalizeUrl(src, pageUrl(state));
      if (normalized?.relation === "off-site" && hasRiskyUrlFlags(normalized)) addRuleFinding(state, htmlRules.hidden_iframe_off_origin, normalized.normalized, {});
    }
  }
  if (name === "base") {
    const href = attrs.get("href");
    if (href) {
      increment(state, "html.base_href");
      addUrl(state, href);
    }
  }
  if (name === "link" && /canonical/i.test(attrs.get("rel") ?? "")) {
    increment(state, "html.canonical");
  }
  if (name === "meta" && /generator/i.test(attrs.get("name") ?? "") && /wordpress/i.test(attrs.get("content") ?? "")) {
    addRuleFinding(state, htmlTechnologyRules.wordpress_surface_reference, pageUrl(state) ?? "html", { generator: attrs.get("content") ?? "" });
  }
  if (name === "meta" && /refresh/i.test(attrs.get("http-equiv") ?? "")) {
    increment(state, "html.meta_refresh");
    const content = attrs.get("content") ?? "";
    let target = content.match(/url\s*=\s*([^;]+)/i)?.[1]?.trim();
    // Browsers accept a quoted refresh target (content="0; url='https://…'")
    // and use the text between the quotes. Without unquoting, the value would
    // resolve as a relative path on the current origin and an off-site
    // redirect would go unnoticed.
    if (target && (target[0] === "'" || target[0] === '"')) {
      const closing = target.indexOf(target[0], 1);
      target = (closing === -1 ? target.slice(1) : target.slice(1, closing)).trim();
    }
    if (target) {
      const normalized = normalizeUrl(target, pageUrl(state));
      if (normalized?.relation === "off-site") addRuleFinding(state, htmlRules.meta_refresh_external, normalized.normalized, {});
    }
  }
}
429
+
430
/**
 * Tallies how often each known brand is named in `text`.
 *
 * For every brand in `PHISH_BRANDS`, whole-word, case-insensitive matches of
 * its keywords (keywords shorter than 4 characters are skipped) are added to
 * the `content_brand:<brand>` counter. If a keyword also appears in the
 * window's `<title>`, `title_brand:<brand>` is set to 1 — the page is claiming
 * that brand as its identity. The aggregate rules later combine these counters
 * with credential capture on a host that is not the brand's own.
 */
function recordContentBrandMentions(state: ScannerState, text: string): void {
  // Only the first 200 characters of the title are considered.
  const titleMatch = /<title\b[^>]*>([\s\S]{0,200}?)<\/title>/i.exec(text);
  const title = titleMatch?.[1] ?? "";

  for (const brand of PHISH_BRANDS) {
    const contentKey = "content_brand:" + brand.brand;
    let mentionCount = 0;

    for (const keyword of brand.keywords) {
      if (keyword.length < 4) continue;
      const escaped = keyword.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
      const wholeWord = new RegExp("\\b" + escaped + "\\b", "gi");
      mentionCount += text.match(wholeWord)?.length ?? 0;
      if (title && wholeWord.test(title)) state.counters["title_brand:" + brand.brand] = 1;
    }

    if (mentionCount) state.counters[contentKey] = (state.counters[contentKey] ?? 0) + mentionCount;
  }
}
450
+
451
/**
 * Bumps a counter for each page-intent heuristic that matches `text`.
 * JSON-style escaped slashes (`\/`) are unescaped before matching.
 */
function scanPageIntentSignals(state: ScannerState, text: string): void {
  const unescaped = text.replace(/\\\//g, "/");
  const signals: Array<[(input: string) => unknown, string]> = [
    [hasCryptoWalletLoginLanguage, "content.crypto_wallet_login_language"],
    [hasCryptoTradingLandingLanguage, "content.crypto_trading_landing_language"],
    [hasLoginUiImageReference, "content.login_ui_image_reference"],
    [hasSeoTrademarkStuffing, "content.seo_trademark_stuffing"]
  ];
  for (const [detect, counter] of signals) {
    if (detect(unescaped)) increment(state, counter);
  }
}
458
+
459
/**
 * JavaScript pass: applies the single-pattern script rules, then a set of
 * composite checks that require two signals to occur together.
 *
 * Rules listed by `isPrimitiveJavaScriptSignal` only bump their counter; all
 * other matching rules also produce a finding.
 */
function scanJavaScript(state: ScannerState, text: string): void {
  // Location for findings: the page URL when known, otherwise the given label.
  const locationOr = (fallback: string): string => pageUrl(state) ?? fallback;

  for (const rule of scriptRiskRules) {
    if (!rule.pattern.test(text)) continue;
    increment(state, rule.counter ?? rule.id);
    if (isPrimitiveJavaScriptSignal(rule.id)) continue;
    addRuleFinding(state, rule, locationOr("inline-script"), {});
  }

  const usesRequestApi = /\b(?:fetch|XMLHttpRequest|sendBeacon|WebSocket)\b/.test(text);

  // Network API plus an off-site URL close to credential/storage access.
  const sensitiveData = /(?:password|FormData|localStorage|sessionStorage|document\.cookie|navigator\.clipboard)/i;
  if (usesRequestApi && hasNearbyOffSiteUrlWith(text, pageUrl(state), sensitiveData)) {
    addRuleFinding(state, scriptCompositeRules.credential_exfil_candidate, locationOr("script"), {});
  }

  // Dynamic execution close to a decoder call.
  if (hasNearbyRegexPair(text, /(?:eval|Function)\s*\(/g, /\b(?:atob|String\.fromCharCode|unescape)\b/g, 320)) {
    addRuleFinding(state, scriptCompositeRules.decoded_dynamic_execution, locationOr("script"), {});
  }

  // Script rewrites a form's action.
  if (/\.action\s*=|setAttribute\s*\(\s*['"]action['"]/.test(text)) {
    addRuleFinding(state, scriptCompositeRules.form_action_changed_by_javascript, locationOr("script"), {});
  }

  // Wallet API usage plus a network API near an off-site URL.
  const walletOrBeacon = /\b(?:window\.ethereum|WalletConnect|ethereum\.request|sendBeacon|fetch|XMLHttpRequest|WebSocket)\b|\.(?:approve|permit)\s*\(|\bmethod\s*:\s*['"]eth_/i;
  if (hasWalletSignal(text) && usesRequestApi && hasNearbyOffSiteUrlWith(text, pageUrl(state), walletOrBeacon)) {
    addRuleFinding(state, scriptCompositeRules.wallet_api_plus_external_beacon, locationOr("script"), {});
  }

  // Payment-card field IDENTIFIERS only — bare "card"/"payment" would match UI
  // card components and nav links on ordinary sites, a major false-positive
  // source.
  const cardFieldIdentifier = /(?:cc-number|cc-exp|cc-csc|cardnumber|card-number|card_number|card-expiry|cardexpiry|cvv|cvc|security-?code)/i;
  const inputListener = /addEventListener\s*\(\s*['"](?:input|change|keyup|keydown)['"]/;
  if (cardFieldIdentifier.test(text) && inputListener.test(text)) {
    addRuleFinding(state, scriptCompositeRules.payment_input_event_hooks, locationOr("script"), {});
  }
}
486
+
487
/**
 * CSS pass: collects URLs referenced from CSS, flags risky off-site imports,
 * hidden-content styling, invisible overlays on sensitive forms, and
 * bidi-override tricks.
 */
function scanCss(state: ScannerState, text: string): void {
  if (/@import|url\(/i.test(text)) {
    for (const rawUrl of extractCssUrls(text)) {
      addUrl(state, rawUrl);
      const resolved = normalizeUrl(rawUrl, pageUrl(state));
      if (!resolved || resolved.relation !== "off-site" || !hasRiskyUrlFlags(resolved)) continue;
      addRuleFinding(state, cssRules.css_imports_suspicious_domain, resolved.normalized, {});
    }
  }

  const hidesContent = /(?:display\s*:\s*none|visibility\s*:\s*hidden|opacity\s*:\s*0|position\s*:\s*absolute[^}]+left\s*:\s*-\d+)/i;
  if (hidesContent.test(text)) {
    increment(state, "hidden_css");
    addRuleFinding(state, cssRules.hidden_link_cluster, pageUrl(state) ?? "css", {});
  }

  // Overlay check only matters when a password or payment form has been seen.
  const pageHasSensitiveForm = state.forms.some((form) => form.hasPassword || form.hasPayment);
  const mentionsFormTerms = /\b(?:form|input|password|card|cc-|checkout|payment)\b/i;
  const overlayStyling = /(?:position\s*:\s*(?:fixed|absolute)[^}]+(?:opacity\s*:\s*0|z-index\s*:\s*9\d{2,}|pointer-events\s*:\s*auto)|(?:opacity\s*:\s*0[^}]+position\s*:\s*(?:fixed|absolute)))/i;
  if (pageHasSensitiveForm && mentionsFormTerms.test(text) && overlayStyling.test(text)) {
    increment(state, "invisible_form_overlay");
  }

  if (/unicode-bidi\s*:\s*bidi-override/i.test(text)) {
    addRuleFinding(state, cssRules.unicode_bidi_trick, pageUrl(state) ?? "css", {});
  }
}
512
+
513
/**
 * Source-code pass: adds a finding for every source rule whose pattern matches
 * `text`, located at the filename, the URL, or the label "source".
 */
function scanSourceText(state: ScannerState, text: string): void {
  for (const rule of sourceCodeRules) {
    if (!rule.pattern.test(text)) continue;
    const location = state.source.filename ?? state.source.url ?? "source";
    addRuleFinding(state, rule, location, {});
  }
}
520
+
521
/**
 * Decides whether the source-code rules apply: always when a filename was
 * supplied, otherwise only for JavaScript, JSON or plain-text content.
 */
function shouldScanSourceText(state: ScannerState): boolean {
  if (state.source.filename) return true;
  switch (state.contentKind) {
    case "javascript":
    case "json":
    case "text":
      return true;
    default:
      return false;
  }
}
525
+
526
/**
 * Tells whether a script rule is a "primitive" signal — one that is only
 * counted by the JavaScript pass and never reported as a finding on its own.
 *
 * @param ruleId Identifier of the script rule (exact, case-sensitive match).
 */
function isPrimitiveJavaScriptSignal(ruleId: string): boolean {
  switch (ruleId) {
    case "document_write_script":
    case "innerhtml_script_injection":
    case "insert_adjacent_html":
    case "dynamic_script_src":
    case "script_src_assignment":
    case "append_child_script":
    case "external_request_api_seen":
    case "js_location_external":
    case "decoder_seen":
    case "charcodeat_decoder_loop":
    case "browser_storage_or_clipboard_seen":
      return true;
    default:
      return false;
  }
}
541
+
542
/**
 * One-time inspection of the first chunk for ELF executables.
 *
 * Reports the ELF magic itself, a mismatch with a declared non-executable
 * content type, and a writable+executable stack. Subsequent calls are no-ops.
 */
function scanBinaryHeader(state: ScannerState, chunk: Uint8Array): void {
  if (state.binaryHeaderScanned) return;
  state.binaryHeaderScanned = true;
  if (!hasElfMagic(chunk)) return;

  const location = state.source.url ?? state.source.filename ?? "stream";
  const declaredType = state.source.contentType;

  addRuleFinding(state, binaryRules.elf_executable_magic, location, {});
  if (declaredNonExecutableBinary(declaredType)) {
    addRuleFinding(state, binaryRules.content_type_magic_mismatch, location, {
      content_type: declaredType ?? ""
    });
  }
  if (elfHasWritableExecutableStack(chunk)) {
    addRuleFinding(state, binaryRules.elf_writable_executable_stack, location, {});
  }
}
556
+
557
/**
 * Binary-strings pass: for every matching binary string rule, bumps the rule's
 * counter (or its id when it has none) and records a finding.
 */
function scanBinaryStrings(state: ScannerState, text: string): void {
  for (const rule of binaryStringRules) {
    if (!rule.pattern.test(text)) continue;
    increment(state, rule.counter ?? rule.id);
    const location = state.source.url ?? state.source.filename ?? "binary";
    addRuleFinding(state, rule, location, {});
  }
}
565
+
566
/**
 * Finds encoded spans in `text`, decodes them, records each as an artifact with
 * a finding, and scans the decoded text.
 *
 * Candidates: base64 runs (only when a decode call such as `atob(` appears
 * within 80 characters), runs of `\xNN` escapes, runs of `\uNNNN` escapes, and
 * `String.fromCharCode(...)` argument lists. At most the first 8 candidates are
 * processed per call, and decoded strings shorter than 8 characters are ignored.
 *
 * @param offset Offset of `text` within the stream; artifact offsets are relative to it.
 * @param depth Decode nesting level of `text`; produced artifacts get `depth + 1`.
 * @param maxDecodedBytes Limit passed to the decoder.
 * @param maxDecodeDepth Nesting limit; enforced by `scanText`, which stops
 *   decoding once its text is at this depth.
 */
function decodeAndRescan(state: ScannerState, text: string, offset: number, depth: number, maxDecodedBytes: number, maxDecodeDepth: number): void {
  const candidates: Array<[string, string, number]> = [];
  for (const match of text.matchAll(/[A-Za-z0-9+/]{32,}={0,2}/g)) {
    const index = match.index ?? 0;
    // Long alphanumeric runs are common; only treat one as base64 when a
    // decode call appears close to it.
    const context = text.slice(Math.max(0, index - 80), Math.min(text.length, index + match[0].length + 80));
    if (/\batob\s*\(|fromBase64|Buffer\.from\s*\([^)]*base64/i.test(context)) candidates.push(["base64_decoded_string", match[0], index]);
  }
  for (const match of text.matchAll(/(?:\\x[0-9a-fA-F]{2}){8,}/g)) candidates.push(["javascript_hex_escapes", match[0], match.index ?? 0]);
  for (const match of text.matchAll(/(?:\\u[0-9a-fA-F]{4}){6,}/g)) candidates.push(["javascript_unicode_escapes", match[0], match.index ?? 0]);
  for (const match of text.matchAll(/String\.fromCharCode\s*\(([\d,\s]+)\)/g)) candidates.push(["fromcharcode_decoded_string", match[1], match.index ?? 0]);

  for (const [artifactType, value, index] of candidates.slice(0, 8)) {
    const decoded = decodeCandidate(artifactType, value, maxDecodedBytes);
    if (!decoded || decoded.length < 8) continue;
    state.artifacts.push({
      source: state.source.filename ?? state.source.url ?? "stream",
      artifactType,
      parentOffset: offset + index,
      depth: depth + 1,
      text: decoded.slice(0, 4096)
    });
    increment(state, artifactType);
    const rule = decodedArtifactRules[artifactType === "base64_decoded_string" ? "large_base64_blob" : artifactType as keyof typeof decodedArtifactRules];
    addRuleFinding(state, rule, state.source.filename ?? state.source.url ?? "stream", { depth: depth + 1 });
    // Always scan what was decoded. Previously the deepest decoded layer was
    // recorded but never inspected (and with maxDecodeDepth = 1 nothing decoded
    // was ever rescanned). Recursion stays bounded because scanText only
    // decodes further while its depth is below maxDecodeDepth.
    scanText(state, decoded, offset + index, depth + 1, maxDecodedBytes, maxDecodeDepth);
  }
}
593
+
594
/**
 * Applies the rules that need whole-page context. Called once from `finish()`,
 * after all chunks were scanned.
 *
 * Covers: per-form credential/payment rules, formless credential capture on a
 * suspicious host, brand impersonation in content, external scripts on
 * sensitive pages, punycode links on login-like pages, login UI rendered as an
 * image, crypto lure language (suspicious hosts only) and SEO trademark
 * stuffing.
 */
function finalizeAggregateRules(state: ScannerState): void {
  for (const form of state.forms) {
    const action = form.action ? normalizeUrl(form.action, pageUrl(state)) : null;
    if (form.hasPassword && pageUrl(state)?.startsWith("http://")) {
      addRuleFinding(state, htmlRules.password_form_without_https, pageUrl(state) ?? "form", {});
    }
    if (form.hasPassword && action?.relation === "off-site") {
      addRuleFinding(state, htmlRules.credential_form_posts_off_origin, action.normalized, {});
    }
    // NOTE(review): this fires on ANY collected off-site URL, not specifically
    // on external scripts as the rule name suggests — confirm intent.
    if (form.hasPayment && [...state.urls.values()].some((url) => url.relation === "off-site")) {
      addRuleFinding(state, htmlRules.card_fields_plus_external_script, pageUrl(state) ?? "payment-form", {});
    }
    if (form.hasPassword && hasSuspiciousTargetContext(state)) {
      addRuleFinding(state, htmlRules.credential_form_on_suspicious_host, pageUrl(state) ?? "form", {});
    }
  }
  // Formless credential capture (PIN/OTP grid, JS-submit) on a suspicious host.
  if (incremented(state, "page_password_input") && hasSuspiciousTargetContext(state)) {
    addRuleFinding(state, htmlRules.credential_form_on_suspicious_host, pageUrl(state) ?? "form", {});
  }
  // Brand impersonation in CONTENT: the page prominently names a brand and
  // captures credentials, but is not served from that brand's own domain. This
  // signal doesn't depend on the URL or on where the form posts.
  if (state.forms.some((form) => form.hasPassword) || incremented(state, "page_password_input")) {
    const host = pageHost(state);
    const pageFlags = host ? normalizeUrl(pageUrl(state)!)?.flags ?? [] : [];
    const throwawayHost = pageFlags.some((flag) => ["shared_hosting_subdomain", "generated_host_label", "suspicious_tld", "punycode", "ip_literal"].includes(flag));
    if (host) {
      for (const brand of PHISH_BRANDS) {
        if (brand.allowed.test(host)) continue; // the brand's own domain — not impersonation
        const inTitle = (state.counters["title_brand:" + brand.brand] ?? 0) > 0;
        const mentions = state.counters["content_brand:" + brand.brand] ?? 0;
        // Convict when the page CLAIMS to be the brand (brand in <title>), or
        // the brand dominates the content on a throwaway host. Reputable hosts
        // that merely reference other brands don't qualify. Only the first
        // matching brand is reported.
        if (inTitle || (mentions >= 3 && throwawayHost)) {
          addRuleFinding(state, htmlRules.brand_impersonation_content, pageUrl(state) ?? "site", { brand: brand.brand, mentions, in_title: inTitle });
          break;
        }
      }
    }
  }
  // External scripts only matter on pages that capture passwords or payment
  // data. (An earlier count of already-reported findings was dropped: it was
  // only consulted on non-sensitive pages, where the threshold rule below can
  // never fire.)
  const hasSensitivePageContext = state.forms.some((form) => form.hasPassword || form.hasPayment);
  if (hasSensitivePageContext) {
    for (const script of state.externalScripts) {
      addRuleFinding(state, htmlRules.external_script_from_unrelated_domain, script.normalized, { relation: script.relation });
    }
    if (state.externalScripts.length >= 5) {
      addRuleFinding(state, htmlRules.excessive_external_scripts_on_login_page, pageUrl(state) ?? "site", { external_scripts: state.externalScripts.length });
    }
  }
  if ([...state.urls.values()].some((url) => url.flags.includes("punycode")) && incremented(state, "brand_login_or_payment_language")) {
    addRuleFinding(state, htmlRules.login_page_with_punycode_links, pageUrl(state) ?? "site", {});
  }
  if (incremented(state, "content.login_ui_image_reference")) {
    addRuleFinding(state, htmlRules.credential_ui_rendered_as_image, pageUrl(state) ?? "site", {});
  }
  // Crypto trigger-word signals only count on an already-suspicious host. On
  // reputable hosts (e.g. a login page that merely contains "wallet"/"swap" in
  // bundled JS) they are pure noise.
  if (hasSuspiciousTargetContext(state)) {
    if (incremented(state, "content.crypto_wallet_login_language")) {
      addRuleFinding(state, htmlRules.crypto_wallet_login_language, pageUrl(state) ?? "site", {});
    }
    if (incremented(state, "content.crypto_trading_landing_language")) {
      addRuleFinding(state, htmlRules.crypto_trading_landing_language, pageUrl(state) ?? "site", {});
    }
  }
  if (incremented(state, "content.seo_trademark_stuffing")) {
    addRuleFinding(state, htmlRules.seo_trademark_stuffing, pageUrl(state) ?? "site", {});
  }
}
671
+
672
/**
 * Reduce a list of findings to one risk score clamped to the range 0–100.
 *
 * Findings are bucketed by rule id. A rule contributes its base score plus a
 * bonus for repeated hits (capped by `maxRepeats`, scaled by
 * `repeatMultiplier`). Rules that share a `maxGroup` observe the same
 * behaviour in different ways (eval / new Function / runtime eval), so only
 * the strongest rule of each group counts; otherwise a legitimate JS-heavy
 * page would be inflated. The subtotal is finally scaled by `scoreMultiplier`
 * over every tag seen and rounded.
 *
 * @param findings Findings to score; may be empty.
 * @returns Integer score between 0 and 100 inclusive.
 */
export function scoreFindings(findings: Finding[]): number {
  const byRule = new Map<string, Finding[]>();
  const seenTags = new Set<ScoreTag>();
  findings.forEach((finding) => {
    const bucket = byRule.get(finding.ruleId);
    if (bucket === undefined) byRule.set(finding.ruleId, [finding]);
    else bucket.push(finding);
    for (const tag of finding.scoreModel.tags) seenTags.add(tag);
  });

  let subtotal = 0;
  const strongestPerGroup = new Map<string, number>();
  byRule.forEach((hits) => {
    const model = hits[0].scoreModel;
    const extraHits = Math.min(hits.length - 1, model.maxRepeats ?? 0);
    const contribution = model.base + extraHits * model.base * (model.repeatMultiplier ?? 0);
    if (!model.maxGroup) {
      subtotal += contribution;
      return;
    }
    const previousBest = strongestPerGroup.get(model.maxGroup) ?? 0;
    strongestPerGroup.set(model.maxGroup, Math.max(previousBest, contribution));
  });
  strongestPerGroup.forEach((groupScore) => {
    subtotal += groupScore;
  });

  const scaled = Math.round(subtotal * scoreMultiplier(seenTags));
  return Math.max(0, Math.min(100, scaled));
}
700
+
701
/**
 * Compute the multiplier applied to the summed rule scores when certain tag
 * combinations co-occur. Each matching combination multiplies the result; with
 * no matching combination the multiplier is 1.
 *
 * @param tags Every score tag present across the findings being scored.
 */
function scoreMultiplier(tags: Set<ScoreTag>): number {
  const hasAny = (...wanted: ScoreTag[]): boolean => wanted.some((tag) => tags.has(tag));
  const boosts: Array<[boolean, number]> = [
    [tags.has("credential") && hasAny("hosting", "redirect", "url"), 1.2],
    [hasAny("payment", "wallet") && hasAny("exfiltration", "redirect"), 1.15],
    [tags.has("decoded") && hasAny("script", "exfiltration"), 1.15],
    [tags.has("binary") && tags.has("url"), 1.1]
  ];
  return boosts.reduce((running, [applies, factor]) => (applies ? running * factor : running), 1);
}
709
+
710
/**
 * Map a numeric risk score onto a disposition.
 *
 * Thresholds: 75 and above is "block", 50 and above is "review", 25 and above
 * is "warn"; anything lower (including NaN) is "allow".
 */
export function dispositionForScore(score: number): Disposition {
  const ladder: Array<[number, Disposition]> = [
    [75, "block"],
    [50, "review"],
    [25, "warn"]
  ];
  for (const [floor, disposition] of ladder) {
    if (score >= floor) return disposition;
  }
  return "allow";
}
716
+
717
/**
 * Find every absolute http(s) URL in `text` and hand it to `addUrl`, after
 * trimming trailing sentence punctuation (`.`, `,`, `;`, `:`).
 */
function collectUrls(state: ScannerState, text: string): void {
  const absoluteUrl = /\bhttps?:\/\/[^\s"'<>`\\)]+/gi;
  let hit: RegExpExecArray | null = absoluteUrl.exec(text);
  while (hit !== null) {
    addUrl(state, hit[0].replace(/[.,;:]+$/, ""));
    hit = absoluteUrl.exec(text);
  }
}
720
+
721
/**
 * Extract and normalize every absolute http(s) URL found in `text`.
 *
 * Trailing sentence punctuation is trimmed before normalization, and
 * candidates that `normalizeUrl` rejects are dropped.
 *
 * @param text Arbitrary text to scan.
 * @param base Optional base URL forwarded to `normalizeUrl`.
 */
function urlsInText(text: string, base?: string): ExtractedUrl[] {
  const candidates = text.match(/\bhttps?:\/\/[^\s"'<>`\\)]+/gi) ?? [];
  const collected: ExtractedUrl[] = [];
  candidates.forEach((candidate) => {
    const normalized = normalizeUrl(candidate.replace(/[.,;:]+$/, ""), base);
    if (normalized) collected.push(normalized);
  });
  return collected;
}
729
+
730
/**
 * Report whether `text` contains an off-site URL that sits close to both a
 * network-sending API (fetch / XMLHttpRequest / sendBeacon / WebSocket) and a
 * caller-supplied signal.
 *
 * A URL qualifies when its relation is "off-site", or "unknown" with a
 * registrable domain. "Close" means within a window of 160 characters on each
 * side of the URL.
 *
 * @param text   Script or markup text to scan.
 * @param base   Optional base URL forwarded to `normalizeUrl`.
 * @param signal Pattern that must also match inside the window. It may carry
 *               the `g` or `y` flag: `lastIndex` is reset around each test so
 *               the result does not depend on earlier use of the same regex.
 */
function hasNearbyOffSiteUrlWith(text: string, base: string | undefined, signal: RegExp): boolean {
  for (const match of text.matchAll(/\bhttps?:\/\/[^\s"'<>`\\)]+/gi)) {
    const normalized = normalizeUrl(match[0].replace(/[.,;:]+$/, ""), base);
    if (!normalized) continue;
    const leavesSite =
      normalized.relation === "off-site" ||
      (normalized.relation === "unknown" && !!normalized.registrableDomain);
    if (!leavesSite) continue;
    const index = match.index ?? 0;
    const context = text.slice(Math.max(0, index - 160), Math.min(text.length, index + match[0].length + 160));
    if (!/\b(?:fetch|XMLHttpRequest|sendBeacon|WebSocket)\b/.test(context)) continue;
    // RegExp.prototype.test advances lastIndex on global/sticky patterns; start
    // from 0 and leave it at 0 so repeated calls give the same answer.
    signal.lastIndex = 0;
    const signalled = signal.test(context);
    signal.lastIndex = 0;
    if (signalled) return true;
  }
  return false;
}
740
+
741
/**
 * Detect web3 wallet interaction in script text: a provider reference
 * (`window.ethereum`, `WalletConnect`, `ethereum.request`), an `.approve(` /
 * `.permit(` call, or a JSON-RPC `method: "eth_…"` argument.
 */
function hasWalletSignal(text: string): boolean {
  const walletPatterns = [
    /\b(?:window\.ethereum|WalletConnect|ethereum\.request)\b/i,
    /\.(?:approve|permit)\s*\(/i,
    /\bmethod\s*:\s*['"]eth_/i
  ];
  return walletPatterns.some((pattern) => pattern.test(text));
}
744
+
745
/**
 * Report whether any match of `left` starts within `distance` characters of
 * the start of any match of `right`.
 *
 * Both patterns are re-compiled as fresh global copies before scanning.
 * `String.prototype.matchAll` throws a TypeError for a non-global regex and
 * starts from the pattern's current `lastIndex`; working on copies means the
 * caller may pass a pattern with or without `g`, and earlier use of the same
 * regex object cannot hide matches.
 *
 * @param text     Text to scan.
 * @param left     First pattern.
 * @param right    Second pattern.
 * @param distance Maximum absolute difference between match start offsets.
 */
function hasNearbyRegexPair(text: string, left: RegExp, right: RegExp, distance: number): boolean {
  const startOffsets = (pattern: RegExp): number[] => {
    const flags = pattern.flags.includes("g") ? pattern.flags : `${pattern.flags}g`;
    return Array.from(text.matchAll(new RegExp(pattern.source, flags)), (match) => match.index ?? 0);
  };
  const leftPositions = startOffsets(left);
  if (leftPositions.length === 0) return false;
  const rightPositions = startOffsets(right);
  return leftPositions.some((leftIndex) => rightPositions.some((rightIndex) => Math.abs(leftIndex - rightIndex) <= distance));
}
750
+
751
/**
 * True when the URL carries at least one flag from the risky set
 * (punycode, IP literal, private/localhost, shortener, suspicious TLD,
 * suspicious path terms, malware-download-like path).
 */
function hasRiskyUrlFlags(url: ExtractedUrl): boolean {
  const risky = new Set<string>([
    "punycode",
    "ip_literal",
    "private_or_localhost",
    "url_shortener",
    "suspicious_tld",
    "suspicious_path_terms",
    "malware_download_like_path"
  ]);
  for (const flag of url.flags) {
    if (risky.has(flag)) return true;
  }
  return false;
}
754
+
755
/**
 * Detect crypto-wallet login wording: a strong crypto-specific term together
 * with credential or wallet-connect intent anywhere in the text.
 *
 * Bare "crypto" and "ledger" are deliberately not in the term list because
 * they collide with ordinary sites.
 */
function hasCryptoWalletLoginLanguage(text: string): boolean {
  const strongCryptoTerm = /\b(?:metamask|walletconnect|usdt|tether|trust\s+wallet|seed\s+phrase|connect\s+wallet|coinbase|binance|web3)\b/i;
  const credentialIntent = /\b(?:login|log\s*in|sign\s*in|connect|password|securely|access|restore|import)\b/i;
  if (!strongCryptoTerm.test(text)) return false;
  return credentialIntent.test(text);
}
761
+
762
/**
 * Detect crypto trading/landing-page wording: at least two DISTINCT
 * crypto-native vocabulary items appear in the text.
 *
 * Generic finance words (token, exchange, trade, market, liquidity) are
 * excluded because they appear on ordinary sites and in minified JS (CSRF/OAuth
 * "token"). The emitted finding is additionally gated on a suspicious host (see
 * finalizeAggregateRules) so reputable sites that merely mention crypto don't
 * trip it.
 *
 * Matches are canonicalised before counting so spelling variants of one item
 * are not counted twice: case is folded, internal whitespace is collapsed
 * ("seed  phrase" equals "seed phrase") and the plural "dexs" folds into "dex".
 */
function hasCryptoTradingLandingLanguage(text: string): boolean {
  const vocabulary = /\b(?:crypto|defi|dexs?|solana|swap|blockchain|wallet|web3|metamask|walletconnect|usdt|tether|coinbase|binance|jupiter|airdrop|staking|seed\s+phrase)\b/gi;
  const distinctTerms = new Set<string>();
  for (const match of text.matchAll(vocabulary)) {
    const term = match[0].toLowerCase().replace(/\s+/g, " ");
    distinctTerms.add(term === "dexs" ? "dex" : term);
    if (distinctTerms.size >= 2) return true;
  }
  return false;
}
771
+
772
/**
 * Detect a page title stuffed with trademark symbols: some title-like value
 * contains two or more `®` / `™` characters.
 *
 * Title-like values are read from `<title>` elements, JSON `"title"` /
 * `"children"` string properties, and serialized `og:title` / `twitter:title`
 * meta entries; each value is limited to 240 characters by the patterns.
 */
function hasSeoTrademarkStuffing(text: string): boolean {
  const titleSources = [
    /<title[^>]*>([^<]{0,240})<\/title>/gis,
    /"(?:title|children)"\s*:\s*"([^"]{0,240})"/gis,
    /"(?:og:title|twitter:title)"\s*,\s*"content"\s*:\s*"([^"]{0,240})"/gis
  ];
  for (const source of titleSources) {
    for (const match of text.matchAll(source)) {
      const marks = match[1].match(/[®™]/g);
      if (marks !== null && marks.length >= 2) return true;
    }
  }
  return false;
}
780
+
781
/**
 * Detect references to a screenshot / screen capture that suggest a login UI
 * rendered as an image. Any one of three shapes is enough:
 *   1. an image-ish key (imageData, alt, name, src, media, filename, url)
 *      followed shortly by a screenshot word;
 *   2. a screenshot word followed within 160 characters by a login/account word;
 *   3. a login/account word followed within 160 characters by a screenshot word.
 */
function hasLoginUiImageReference(text: string): boolean {
  const shapes = [
    /(?:imageData|alt|name|src|media|filename|fileName|url)["':\s{,[\]\\]*(?:[^"'<>]{0,160})?(?:screencapture|screenshot|screen[-_ ]?capture)/i,
    /(?:screencapture|screenshot|screen[-_ ]?capture)[^"'<>]{0,160}\b(?:login|log[-_ ]?in|signin|sign[-_ ]?in|password|account)\b/i,
    /\b(?:login|log[-_ ]?in|signin|sign[-_ ]?in|password|account)\b[^"'<>]{0,160}(?:screencapture|screenshot|screen[-_ ]?capture)/i
  ];
  return shapes.some((shape) => shape.test(text));
}
786
+
787
/**
 * Decide whether the scanned target itself is already suspicious: either the
 * off-site-redirect counter was incremented, or the source/final URL carries a
 * host-level suspicion flag.
 *
 * Only genuine HOST suspicion counts. A login-intent path ("/login",
 * "/account") is benign (every legitimate login page has one), so
 * `suspicious_path_terms` is deliberately not in the flag set.
 */
function hasSuspiciousTargetContext(state: ScannerState): boolean {
  if (incremented(state, "redirect.final_url_offsite")) return true;
  const hostSuspicionFlags = ["shared_hosting_subdomain", "generated_host_label", "suspicious_tld", "punycode", "ip_literal"];
  for (const url of state.urls.values()) {
    if (!isSourceOrFinalUrl(state, url.normalized)) continue;
    if (url.flags.some((flag) => hostSuspicionFlags.includes(flag))) return true;
  }
  return false;
}
797
+
798
/**
 * Record an off-site redirect when the requested URL and the final URL resolve
 * to different registrable domains. Does nothing when either URL is missing,
 * the two are identical, or either lacks a registrable domain.
 */
function scanRedirectContext(state: ScannerState): void {
  const { url: requestedUrl, finalUrl } = state.source;
  if (!requestedUrl || !finalUrl || requestedUrl === finalUrl) return;
  const source = normalizeUrl(requestedUrl);
  const final = normalizeUrl(finalUrl, requestedUrl);
  if (!source?.registrableDomain || !final?.registrableDomain) return;
  if (source.registrableDomain === final.registrableDomain) return;
  increment(state, "redirect.final_url_offsite");
  addRuleFinding(state, urlRules.final_url_offsite_redirect, final.normalized, { source_url: source.normalized });
}
807
+
808
/**
 * Turn the TLS metadata of the scanned source into counters:
 *   - `tls.unauthorized_certificate` when `authorized` is explicitly false;
 *   - `tls.free_dv_certificate` when the issuer names a free/DV CA;
 *   - `tls.organization_validated_certificate` when the subject has an `O=`
 *     value that does not start with a known CDN/hosting provider name;
 *   - `tls.issuer.<slug>` for any non-empty issuer (slug capped at 80 chars).
 * Does nothing when no TLS metadata is attached.
 */
function scanTlsContext(state: ScannerState): void {
  const certificate = state.source.tls;
  if (!certificate) return;
  const issuer = certificate.issuer ?? "";
  const subject = certificate.subject ?? "";

  const counters: string[] = [];
  if (certificate.authorized === false) counters.push("tls.unauthorized_certificate");
  if (/(?:let'?s encrypt|zerossl|buypass|ssl\.com)/i.test(issuer)) counters.push("tls.free_dv_certificate");

  const organizationMatch = subject.match(/(?:^|,\s*)O\s*=\s*([^,]+)/i);
  const organization = organizationMatch?.[1]?.trim();
  const isHostingProvider = /^(?:cloudflare|google trust services|amazon|fastly|akamai|wix|netlify|vercel)\b/i;
  if (organization && !isHostingProvider.test(organization)) {
    counters.push("tls.organization_validated_certificate");
  }

  if (issuer) {
    const slug = issuer.toLowerCase().replace(/[^a-z0-9]+/g, "_").replace(/^_|_$/g, "").slice(0, 80);
    counters.push(`tls.issuer.${slug}`);
  }
  for (const counter of counters) increment(state, counter);
}
821
+
822
/**
 * Pull absolute http(s) URLs out of CSS text, from both `@import` rules and
 * `url(...)` references. Relative references are ignored; trailing `.,;:` is
 * trimmed from each candidate.
 */
function extractCssUrls(text: string): string[] {
  const cssReference = /@import\s+(?:url\(\s*)?["']?([^"')\s;]+)|url\(\s*["']?([^"')]+)["']?\s*\)/gi;
  return Array.from(text.matchAll(cssReference), (match) => (match[1] ?? match[2] ?? "").trim().replace(/[.,;:]+$/, ""))
    .filter((candidate) => /^https?:\/\//i.test(candidate));
}
830
+
831
/**
 * Normalize one raw URL against the page URL, register it in `state.urls`,
 * bump a `url.<flag>` counter for each of its flags, and emit the URL-level
 * rule findings that apply. Unparseable URLs are ignored.
 *
 * Several rules fire only when the URL is the scanned source or its final
 * destination (not merely a link found in content); those are the checks that
 * call `onScannedPage()`.
 */
function addUrl(state: ScannerState, raw: string): void {
  const normalized = normalizeUrl(raw, pageUrl(state));
  if (!normalized) return;
  const href = normalized.normalized;
  state.urls.set(href, normalized);
  for (const flag of normalized.flags) increment(state, `url.${flag}`);

  const flagged = (wanted: string): boolean => normalized.flags.some((flag) => flag === wanted);
  const onScannedPage = (): boolean => isSourceOrFinalUrl(state, href);
  const report = (rule: RuleDefinition, metadata: Record<string, unknown> = {}): void => {
    addRuleFinding(state, rule, href, metadata);
  };

  if (flagged("punycode") && /login|signin|account|verify/i.test(href)) {
    report(urlRules.punycode_login_url);
  }
  // Only when the scanned page itself IS, or redirects through, a shortener
  // (cloaking), not when its content merely links to one. Search engines,
  // social, news and forums are full of bit.ly links in content.
  if (normalized.destinationType === "url-shortener" && onScannedPage()) {
    report(urlRules.redirect_to_url_shortener);
  }
  if (flagged("private_or_localhost") && onScannedPage()) {
    report(urlRules.private_ip_url);
  }
  if (flagged("ip_literal") && !flagged("private_or_localhost")) {
    report(urlRules.ip_literal_url);
  }
  if (flagged("suspicious_tld")) {
    report(urlRules.suspicious_tld_url);
  }
  if (flagged("download_like_path") && normalized.relation === "off-site") {
    report(urlRules.download_like_external_url);
  }
  if (flagged("malware_download_like_path") && onScannedPage()) {
    report(urlRules.malware_download_like_url);
  }
  if (flagged("shared_hosting_subdomain") && onScannedPage()) {
    report(urlRules.shared_hosting_subdomain_url);
  }
  const brand = unrelatedBrandInUrl(normalized);
  if (brand && onScannedPage()) {
    report(urlRules.brand_impersonation_url, { brand });
  }
  if (onScannedPage() && isCredentialPathOnSuspiciousHost(normalized)) {
    report(urlRules.credential_path_on_suspicious_host);
  }
  if (onScannedPage() && isGeneratedSuspiciousLandingUrl(normalized)) {
    report(urlRules.generated_landing_url);
  }
}
874
+
875
/**
 * True when `normalizedUrl` equals the normalized form of the scanned source
 * URL or of its final (post-redirect) URL.
 */
function isSourceOrFinalUrl(state: ScannerState, normalizedUrl: string): boolean {
  const canonical = (value: string | undefined): string | null | undefined =>
    value ? normalizeUrl(value)?.normalized : null;
  const source = canonical(state.source.url);
  const final = canonical(state.source.finalUrl);
  return [source, final].some((candidate) => candidate === normalizedUrl);
}
880
+
881
/**
 * Record a finding for a rule definition at the given location, tagging the
 * metadata with the rule's pack under `rule_pack`.
 */
function addRuleFinding(state: ScannerState, rule: RuleDefinition, locationValue: string, metadata: Record<string, unknown>): void {
  const { id, severity, confidence, score, title, description, locationType, pack } = rule;
  const taggedMetadata = { ...metadata, rule_pack: pack };
  addFinding(state, id, severity, confidence, score, title, description, locationType, locationValue, taggedMetadata);
}
884
+
885
/**
 * Append a finding to `state.findings` unless one with the same rule id,
 * location type and location value was already recorded.
 *
 * The finding id is `<ruleId>:<index at insertion>`. Its `score` is the score
 * model's base value, and its metadata starts with the scanner's current
 * line/column, which caller-supplied metadata may override.
 */
function addFinding(
  state: ScannerState,
  ruleId: string,
  severity: Severity,
  confidence: Confidence,
  scoreModel: RuleScoreModel,
  title: string,
  description: string,
  locationType: Finding["locationType"],
  locationValue: string,
  metadata: Record<string, unknown>
): void {
  const dedupeKey = [ruleId, locationType, locationValue].join(":");
  if (state.findingKeys.has(dedupeKey)) return;
  state.findingKeys.add(dedupeKey);

  const position = { line: state.line, column: state.column };
  state.findings.push({
    id: `${ruleId}:${state.findings.length}`,
    ruleId,
    severity,
    confidence,
    score: scoreModel.base,
    scoreModel,
    title,
    description,
    locationType,
    locationValue,
    metadata: { ...position, ...metadata }
  });
}
914
+
915
/**
 * Decide whether an element's attributes make it effectively invisible.
 *
 * An element is hidden when it declares a `width` or `height` of at most 1, or
 * when its inline style sets `display: none`, `visibility: hidden` or an
 * opacity of exactly zero.
 *
 * A missing dimension attribute is NOT treated as hidden. The previous
 * default of "1" made every element without explicit width and height match.
 * Likewise `opacity: 0.9` no longer matches: only `0`, `0.` or `0.0…` do.
 *
 * @param attrs Attribute name to value map for the element (names as stored by
 *              the caller; this function looks up "width", "height", "style").
 */
function hiddenAttrs(attrs: Map<string, string>): boolean {
  const collapsed = (name: string): boolean => {
    const raw = attrs.get(name);
    // Non-numeric values such as "100%" become NaN and compare false.
    return raw !== undefined && Number(raw) <= 1;
  };
  if (collapsed("width") || collapsed("height")) return true;
  const style = attrs.get("style") ?? "";
  return /display\s*:\s*none|visibility\s*:\s*hidden|opacity\s*:\s*0(?:\.0*)?(?![\d.])/i.test(style);
}
920
+
921
/**
 * Decode an obfuscated string candidate according to its `kind`.
 *
 * - `base64_decoded_string`: base64 to bytes to text; rejected (null) when the
 *   decoder is unavailable, the payload exceeds `maxBytes`, or the text is not
 *   mostly printable.
 * - `javascript_hex_escapes` / `javascript_unicode_escapes`: `\xNN` / `\uNNNN`
 *   sequences are replaced by their characters.
 * - `fromcharcode_decoded_string`: a comma-separated list of char codes.
 * The three textual kinds truncate the result to `maxBytes` characters.
 *
 * @returns The decoded text, or null for an unknown kind or any decode error.
 */
function decodeCandidate(kind: string, value: string, maxBytes: number): string | null {
  const unescapeWith = (escape: RegExp): string =>
    value.replace(escape, (_whole, digits: string) => String.fromCharCode(Number.parseInt(digits, 16))).slice(0, maxBytes);
  try {
    switch (kind) {
      case "base64_decoded_string": {
        const bytes = base64Decode(value);
        if (!bytes || bytes.byteLength > maxBytes) return null;
        const decoded = decodeText(bytes);
        return isMostlyPrintable(decoded) ? decoded : null;
      }
      case "javascript_hex_escapes":
        return unescapeWith(/\\x([0-9a-fA-F]{2})/g);
      case "javascript_unicode_escapes":
        return unescapeWith(/\\u([0-9a-fA-F]{4})/g);
      case "fromcharcode_decoded_string":
        return value
          .split(",")
          .map((part) => String.fromCharCode(Number(part.trim())))
          .join("")
          .slice(0, maxBytes);
      default:
        return null;
    }
  } catch {
    return null;
  }
}
937
+
938
/**
 * Decode a base64 string to bytes using whichever primitive the runtime
 * offers: `atob` when it exists, otherwise a global `Buffer`.
 *
 * @returns The decoded bytes, or null when neither primitive is available.
 *          `atob` may throw on malformed input; the error is not caught here.
 */
function base64Decode(value: string): Uint8Array | null {
  if (typeof atob !== "function") {
    const nodeBuffer = (globalThis as unknown as { Buffer?: { from(value: string, encoding: string): Uint8Array } }).Buffer;
    return nodeBuffer ? nodeBuffer.from(value, "base64") : null;
  }
  const binary = atob(value);
  const bytes = new Uint8Array(binary.length);
  for (let position = 0; position < binary.length; position += 1) {
    bytes[position] = binary.charCodeAt(position);
  }
  return bytes;
}
946
+
947
/**
 * Drop findings that repeat an earlier (ruleId, locationValue) pair, keeping
 * the first occurrence and the original order.
 */
function dedupeFindings(findings: Finding[]): Finding[] {
  const alreadyKept = new Set<string>();
  const unique: Finding[] = [];
  for (const finding of findings) {
    const key = `${finding.ruleId}:${finding.locationValue}`;
    if (alreadyKept.has(key)) continue;
    alreadyKept.add(key);
    unique.push(finding);
  }
  return unique;
}
956
+
957
/**
 * Classify how a URL's host relates to the base (page) host.
 *
 * - "unknown" when the base host, base domain or the URL's domain is missing;
 * - "same-origin" when the hosts are identical;
 * - "subdomain" when both share a registrable domain and the host sits below
 *   the base host; "same-site" for any other host on that domain;
 * - "off-site" otherwise.
 */
function relationFor(host: string, domain: string | null, baseHost: string, baseDomain: string | null): ExtractedUrl["relation"] {
  const comparable = Boolean(baseHost) && Boolean(baseDomain) && Boolean(domain);
  if (!comparable) return "unknown";
  if (host === baseHost) return "same-origin";
  if (domain !== baseDomain) return "off-site";
  const belowBaseHost = host.endsWith(`.${baseHost}`);
  return belowBaseHost ? "subdomain" : "same-site";
}
963
+
964
/**
 * Classify a URL's destination. Host-based classes win over the scheme:
 * localhost / private address, then IP literal, then URL shortener; only then
 * is the result "http", "https" or "other" according to the protocol.
 */
function destinationTypeFor(url: URL, host: string): ExtractedUrl["destinationType"] {
  if (isPrivateHost(host)) {
    if (host === "localhost") return "localhost";
    return "private";
  }
  if (isIpLiteral(host)) return "ip";
  if (isUrlShortener(host)) return "url-shortener";
  switch (url.protocol) {
    case "http:":
      return "http";
    case "https:":
      return "https";
    default:
      return "other";
  }
}
972
+
973
/**
 * Approximate the registrable domain (eTLD+1) of a host name.
 *
 * This is a heuristic, not a Public Suffix List lookup: the last two labels
 * are used, or the last three when the last two look like a two-level country
 * suffix (`co|com|net|org|gov|ac` + two-letter ccTLD, e.g. `co.uk`).
 *
 * The host is lower-cased and stripped of trailing dots BEFORE any check, so
 * "LOCALHOST", "localhost." and "Intranet" are handled the same as their
 * canonical forms. Previously those checks ran on the raw input and a
 * single-label host was returned un-normalised.
 *
 * @param host Host name, with or without a trailing dot, in any case.
 * @returns The lower-cased registrable domain; the host itself for a
 *          single-label name; null for an empty host, an IP literal or
 *          localhost.
 */
export function registrableDomainFor(host: string): string | null {
  const normalized = host.toLowerCase().replace(/\.+$/, "");
  if (!normalized || isIpLiteral(normalized) || normalized === "localhost") return null;
  const parts = normalized.split(".").filter(Boolean);
  if (parts.length === 0) return null;
  if (parts.length < 2) return parts.join(".");
  const lastTwo = parts.slice(-2).join(".");
  const lastThree = parts.slice(-3).join(".");
  if (/^(?:co|com|net|org|gov|ac)\.[a-z]{2}$/.test(lastTwo) && parts.length >= 3) return lastThree;
  return lastTwo;
}
982
+
983
/**
 * True when the host looks like an IP address: a dotted quad of 1–3 digit
 * groups, or anything containing a colon (IPv6 form).
 */
function isIpLiteral(host: string): boolean {
  if (host.indexOf(":") !== -1) return true;
  const dottedQuad = /^(?:\d{1,3}\.){3}\d{1,3}$/;
  return dottedQuad.test(host);
}
986
+
987
/**
 * True when the host is `localhost` or an IPv4 literal in a loopback or
 * RFC 1918 private range (127.0.0.0/8, 10.0.0.0/8, 172.16.0.0/12,
 * 192.168.0.0/16).
 *
 * The range test applies only to a full dotted quad. Host names may start
 * with a numeric label, so a prefix-only match would misclassify public names
 * such as `10.example.com` or `127.evil.net` as private.
 */
function isPrivateHost(host: string): boolean {
  if (host === "localhost") return true;
  const quad = /^(\d{1,3})\.(\d{1,3})\.\d{1,3}\.\d{1,3}$/.exec(host);
  if (!quad) return false;
  const first = Number(quad[1]);
  const second = Number(quad[2]);
  if (first === 127 || first === 10) return true;
  if (first === 192) return second === 168;
  if (first === 172) return second >= 16 && second <= 31;
  return false;
}
990
+
991
/**
 * True when the host is exactly one of the known URL-shortener domains
 * (case-sensitive, no subdomains).
 */
function isUrlShortener(host: string): boolean {
  const shorteners = new Set(["bit.ly", "t.co", "tinyurl.com", "goo.gl", "ow.ly", "is.gd", "buff.ly", "cutt.ly"]);
  return shorteners.has(host);
}
994
+
995
/** Registrable domains of shared-hosting / site-builder platforms. */
const SHARED_HOSTING_DOMAINS: ReadonlySet<string> = new Set([
  "wixstudio.com",
  "wixsite.com",
  "webflow.io",
  "netlify.app",
  "vercel.app",
  "github.io",
  "pages.dev",
  "workers.dev",
  "edgeone.app",
  "edgeone.dev",
  "firebaseapp.com",
  "web.app",
  "herokuapp.com",
  "render.com",
  "glitch.me",
  "replit.app",
  "replit.dev",
  "wordpress.com",
  "blogspot.com",
  "weebly.com",
  "myshopify.com",
  "godaddysites.com",
  "zapier.app",
  "fwh.is",
  "infinityfreeapp.com",
  "000webhostapp.com",
  "fly.dev",
  "onrender.com",
  "surge.sh",
  "site.je"
]);

/**
 * True when `host` is a subdomain of a shared-hosting platform, i.e. its
 * registrable domain is on the platform list and the host is not that
 * registrable domain itself.
 */
function isSharedHostingSubdomain(host: string, registrableDomain: string | null): boolean {
  if (!registrableDomain) return false;
  if (host === registrableDomain) return false;
  return SHARED_HOSTING_DOMAINS.has(registrableDomain);
}
1030
+
1031
/**
 * Heuristically decide whether the first label of `host` looks machine
 * generated. Any one of these shapes is enough:
 *   - a credential-ish word followed by five or more digits (secure12345);
 *   - three or more hyphenated words ending in a four-plus digit number;
 *   - sixteen or more lowercase alphanumerics with nothing else;
 *   - a hex run of twelve or more characters anywhere in the label
 *     (e.g. pub-de59803496c8489585895b6917266e7c.r2.dev);
 *   - a label of seven or more characters that is all hex and has a digit
 *     (0efbd9f, 0ed8a96, 0c4d4e6);
 *   - a label of six or more characters of which at least 40% are digits:
 *     the bulk-phishing naming scheme (000p4en, 000ogwl, 000o5eh).
 * An empty label, or a label equal to `registrableDomain`, never qualifies.
 */
function isGeneratedHostLabel(host: string, registrableDomain: string | null): boolean {
  const label = host.split(".")[0] ?? "";
  if (label === "" || label === registrableDomain) return false;

  if (/(?:client|account|secure|manager|payment|support|verify|login|area)[-_]?\d{5,}/i.test(label)) return true;
  if (/^[a-z]+(?:-[a-z]+){2,}-\d{4,}$/.test(label)) return true;
  if (/^[a-z0-9]{16,}$/.test(label)) return true;
  if (/[a-f0-9]{12,}/i.test(label)) return true;

  const shortHexWithDigit = label.length >= 7 && /^[a-f0-9]+$/i.test(label) && /\d/.test(label);
  if (shortHexWithDigit) return true;

  const digitCount = label.replace(/[^0-9]/g, "").length;
  return label.length >= 6 && digitCount / label.length >= 0.4;
}
1045
+
1046
// Well-known ad, analytics, and tag-manager networks. Scripts from these are
// ubiquitous on legitimate ad-funded sites and are never phishing exfil
// endpoints, so they should not raise the external-script signals that target
// credential-harvest kits. Entries are domain suffixes: an entry matches the
// domain itself and every host below it.
const AD_ANALYTICS_DOMAINS = new Set([
  "doubleclick.net",
  "googlesyndication.com",
  "googletagmanager.com",
  "googletagservices.com",
  "google-analytics.com",
  "googleadservices.com",
  "adservice.google.com",
  "gstatic.com",
  "scorecardresearch.com",
  "quantserve.com",
  "quantcount.com",
  "criteo.com",
  "criteo.net",
  "taboola.com",
  "outbrain.com",
  "adnxs.com",
  "rubiconproject.com",
  "pubmatic.com",
  "casalemedia.com",
  "amazon-adsystem.com",
  "adsrvr.org",
  "moatads.com",
  "indexww.com",
  "openx.net",
  "3lift.com",
  "sharethrough.com",
  "permutive.com",
  "permutive.app",
  "cloudflareinsights.com",
  "newrelic.com",
  "nr-data.net",
  "segment.com",
  "segment.io",
  "optimizely.com",
  "hotjar.com",
  "chartbeat.com",
  "parsely.com",
  "branch.io",
  "onetrust.com",
  "cookielaw.org",
  "fbcdn.net",
  "facebook.net"
]);

/**
 * True when the URL's host is, or sits below, a known ad / analytics /
 * tag-manager domain.
 *
 * Every label-aligned suffix of the host is checked against the list. Looking
 * up only the registrable domain could never match an entry that is itself a
 * subdomain (`adservice.google.com` reduces to `google.com`), which left that
 * entry dead.
 *
 * @param normalizedUrl Absolute URL; an unparseable value yields false.
 */
export function isAdOrAnalyticsHost(normalizedUrl: string): boolean {
  let host: string;
  try {
    host = new URL(normalizedUrl).hostname.toLowerCase();
  } catch {
    return false;
  }
  const labels = host.split(".").filter(Boolean);
  for (let start = 0; start < labels.length; start += 1) {
    if (AD_ANALYTICS_DOMAINS.has(labels.slice(start).join("."))) return true;
  }
  return false;
}
1103
+
1104
/**
 * True when the host's last label is one of the TLDs treated as suspicious.
 * The comparison is case-sensitive and exact.
 */
function isSuspiciousTld(host: string): boolean {
  const suspicious = new Set([
    "zip", "mov", "top", "xyz", "click", "country", "gq", "tk", "ml",
    "cf", "ga", "work", "quest", "cam", "cfd", "icu", "buzz"
  ]);
  const labels = host.split(".");
  const tld = labels[labels.length - 1] ?? "";
  return suspicious.has(tld);
}
1108
+
1109
/**
 * True when a URL path looks like a malware/botnet payload download. Any one
 * of three shapes matches (case-insensitive):
 *   - a path segment starting with a dropper word (bin, bins, payload, update,
 *     loader, bot, mozi, mirai, gafgyt, boatnet, dvr);
 *   - a shell/ELF/architecture file extension at the end of the path;
 *   - a path segment that starts with a CPU architecture name.
 */
function isMalwareDownloadLikePath(pathname: string): boolean {
  const dropperSegment = /(?:\/|^)(?:bin|bins|payload|update|loader|bot|mozi|mirai|gafgyt|boatnet|dvr)(?:[./_-]|$)/i;
  const payloadExtension = /\.(?:sh|bash|elf|bin|mips|mpsl|arm\d?|x86|x86_64|i686|ppc|sparc)(?:$|[?#])/i;
  const architectureSegment = /(?:\/|^)(?:mips|arm\d?|x86|x86_64|i686|ppc|sparc)(?:$|[./_-])/i;
  return [dropperSegment, payloadExtension, architectureSegment].some((shape) => shape.test(pathname));
}
1112
+
1113
/** One impersonation target: its name, host-label keywords, and own domains. */
interface PhishBrand {
  /** Canonical brand name reported in findings. */
  brand: string;
  /** Lowercase keywords looked for in host labels. */
  keywords: string[];
  /** Matches hosts that legitimately belong to the brand. */
  allowed: RegExp;
}

// Brand keywords plus each brand's legitimate registrable domains. Keywords
// are matched against HOST LABELS only, never the path or query, so
// google.com/search?q=paypal is safe. A keyword matches as an exact label or,
// when it has six or more characters, as a label prefix; the prefix form
// catches concatenated lookalikes like "scotiawealthmanagement.com.evil.tld".
const PHISH_BRANDS: PhishBrand[] = [
  { brand: "google", keywords: ["google", "gmail"], allowed: /(?:^|\.)(?:google|gmail)\.(?:com|[a-z]{2})$/i },
  { brand: "microsoft", keywords: ["microsoft", "office365", "outlook", "onedrive"], allowed: /(?:^|\.)(?:microsoft|microsoftonline|live|office|outlook|sharepoint)\.com$/i },
  { brand: "apple", keywords: ["icloud", "appleid"], allowed: /(?:^|\.)(?:apple|icloud)\.com$/i },
  { brand: "paypal", keywords: ["paypal", "paypa1"], allowed: /(?:^|\.)paypal\.(?:com|[a-z]{2})$/i },
  { brand: "amazon", keywords: ["amazon"], allowed: /(?:^|\.)(?:amazon\.[a-z.]{2,6}|amazonaws\.com|aws\.amazon\.com)$/i },
  { brand: "netflix", keywords: ["netflix"], allowed: /(?:^|\.)netflix\.com$/i },
  { brand: "facebook", keywords: ["facebook"], allowed: /(?:^|\.)(?:facebook|meta)\.com$/i },
  { brand: "instagram", keywords: ["instagram"], allowed: /(?:^|\.)instagram\.com$/i },
  { brand: "whatsapp", keywords: ["whatsapp"], allowed: /(?:^|\.)whatsapp\.com$/i },
  { brand: "linkedin", keywords: ["linkedin"], allowed: /(?:^|\.)linkedin\.com$/i },
  { brand: "dropbox", keywords: ["dropbox"], allowed: /(?:^|\.)dropbox\.com$/i },
  { brand: "docusign", keywords: ["docusign"], allowed: /(?:^|\.)docusign\.(?:com|net)$/i },
  { brand: "wetransfer", keywords: ["wetransfer"], allowed: /(?:^|\.)wetransfer\.com$/i },
  { brand: "dhl", keywords: ["dhl"], allowed: /(?:^|\.)dhl\.(?:com|[a-z]{2})$/i },
  { brand: "fedex", keywords: ["fedex"], allowed: /(?:^|\.)fedex\.com$/i },
  { brand: "usps", keywords: ["usps"], allowed: /(?:^|\.)usps\.com$/i },
  { brand: "roblox", keywords: ["roblox"], allowed: /(?:^|\.)roblox\.com$/i },
  { brand: "steam", keywords: ["steamcommunity", "steampowered"], allowed: /(?:^|\.)steam(?:community|powered)\.com$/i },
  { brand: "scotiabank", keywords: ["scotiabank", "scotiawealth", "scotiaonline"], allowed: /(?:^|\.)scotiabank\.com$/i },
  { brand: "wellsfargo", keywords: ["wellsfargo"], allowed: /(?:^|\.)wellsfargo\.com$/i },
  { brand: "chase", keywords: ["chase"], allowed: /(?:^|\.)chase\.com$/i },
  { brand: "bankofamerica", keywords: ["bankofamerica"], allowed: /(?:^|\.)bankofamerica\.com$/i },
  { brand: "citi", keywords: ["citibank", "citigroup"], allowed: /(?:^|\.)citi\.com$/i },
  { brand: "coinbase", keywords: ["coinbase"], allowed: /(?:^|\.)coinbase\.com$/i },
  { brand: "binance", keywords: ["binance"], allowed: /(?:^|\.)binance\.(?:com|us)$/i },
  { brand: "kraken", keywords: ["kraken"], allowed: /(?:^|\.)kraken\.com$/i },
  { brand: "metamask", keywords: ["metamask"], allowed: /(?:^|\.)metamask\.io$/i },
  { brand: "ledger", keywords: ["ledger"], allowed: /(?:^|\.)ledger\.com$/i },
  { brand: "tangem", keywords: ["tangem"], allowed: /(?:^|\.)tangem\.com$/i },
  { brand: "etoro", keywords: ["etoro"], allowed: /(?:^|\.)etoro\.com$/i },
  { brand: "ionos", keywords: ["ionos"], allowed: /(?:^|\.)ionos\.(?:com|de|co\.uk)$/i },
  { brand: "allegro", keywords: ["allegro"], allowed: /(?:^|\.)allegro\.(?:pl|com)$/i }
];
1151
+
1152
/**
 * Undo leetspeak / look-alike substitutions in a host label so that g00gle,
 * paypa1, micr0s0ft, 0utlook and faceb00k collapse onto their brand spelling.
 *
 * "1" is ambiguous (it can stand for "i" or "l"), so the caller picks the
 * reading through `one` and normally checks both. After substitution every
 * character that is not a lowercase ASCII letter is dropped.
 *
 * @param label Host label, expected to be lowercase already.
 * @param one   Letter that replaces the digit "1".
 */
function deleet(label: string, one: "i" | "l"): string {
  const lookalikes: Record<string, string> = {
    "0": "o",
    "1": one,
    "3": "e",
    "4": "a",
    "5": "s",
    "7": "t",
    "8": "b",
    "9": "g",
    "$": "s",
    "@": "a",
    "!": "i"
  };
  let result = "";
  for (const char of label) {
    const letter = lookalikes[char] ?? char;
    if (letter >= "a" && letter <= "z" && letter.length === 1) result += letter;
  }
  return result;
}
1170
+
1171
/**
 * Return the first brand from PHISH_BRANDS that the URL's host impersonates,
 * or null when there is none or the URL cannot be parsed.
 *
 * Hosts matching a brand's `allowed` pattern are skipped for that brand. For
 * the rest a keyword counts as impersonation in two situations:
 *   - it appears in a SUBDOMAIN label (split on `.`, `-`, `_`), raw or
 *     de-leeted, as an exact match or, for keywords of six or more characters,
 *     as a prefix (paypal.com.evil.xyz, coinbase_v_login.godaddysites.com);
 *   - the apex label de-leets to the keyword without being the keyword itself
 *     (g00gle.com, paypa1.net). An exact brand apex label (google.com,
 *     google.co.uk) is treated as the brand's own domain, which keeps ccTLDs
 *     from reading as impersonation.
 */
function unrelatedBrandInUrl(url: ExtractedUrl): string | null {
  let host: string;
  try {
    host = new URL(url.normalized).hostname.toLowerCase();
  } catch {
    return null;
  }
  const registrable = registrableDomainFor(host) ?? host;

  // Everything left of the registrable domain, without the joining dot.
  const subPart = host.endsWith(registrable)
    ? host.slice(0, host.length - registrable.length).replace(/\.$/, "")
    : host;
  const subLabels = subPart ? subPart.split(/[.\-_]/).filter(Boolean) : [];
  const subVariants = [...new Set(subLabels.flatMap((label) => [label, deleet(label, "i"), deleet(label, "l")]))];

  const mainLabel = registrable.split(".")[0] ?? "";
  const mainVariants = [deleet(mainLabel, "i"), deleet(mainLabel, "l")];

  const inSubdomain = (keyword: string): boolean =>
    subVariants.some((label) => label === keyword || (keyword.length >= 6 && label.startsWith(keyword)));
  const apexTyposquat = (keyword: string): boolean =>
    mainLabel !== keyword && mainVariants.includes(keyword);

  const impersonated = PHISH_BRANDS.find(
    ({ keywords, allowed }) => !allowed.test(host) && keywords.some((keyword) => inSubdomain(keyword) || apexTyposquat(keyword))
  );
  return impersonated ? impersonated.brand : null;
}
1202
+
1203
/** URL flags that mark the HOST itself as suspicious. */
const SUSPICIOUS_HOST_FLAGS = ["shared_hosting_subdomain", "generated_host_label", "suspicious_tld", "punycode", "ip_literal", "url_shortener"];

/** Result of comparing a requested URL with the URL it finally resolved to. */
export interface RedirectAssessment {
  /** The redirect crossed to a different registrable domain (not just a subdomain hop). */
  offSite: boolean;
  /** The destination host itself looks suspicious (shortener, suspicious TLD, punycode, IP, shared/generated host). */
  destinationSuspicious: boolean;
  /** Registrable domain of the requested URL, or its hostname when none could be derived. */
  requestedRegistrable: string;
  /** Registrable domain of the final URL, or its hostname when none could be derived. */
  finalRegistrable: string;
  /** Flags of the final URL; empty when the redirect stayed on-site. */
  destinationFlags: string[];
}

/**
 * Single source of truth for "did this redirect leave the site, and is the
 * destination itself sketchy?". It is shared by every crawler (Worker stream
 * and Fly/CLI runner), so a redirect like google.com -> www.google.com (same
 * registrable domain) or google.com -> google.de (different domain, ordinary
 * host) is not convicted, while a hop to a shortener/punycode/IP/shared host
 * is flagged.
 *
 * @returns The assessment, or null when either URL cannot be parsed.
 */
export function assessRedirect(requestedUrl: string, finalUrl: string): RedirectAssessment | null {
  let requested: URL;
  let final: URL;
  try {
    requested = new URL(requestedUrl);
    final = new URL(finalUrl);
  } catch {
    return null;
  }
  const domainOf = (parsed: URL): string => registrableDomainFor(parsed.hostname) ?? parsed.hostname;
  const requestedRegistrable = domainOf(requested);
  const finalRegistrable = domainOf(final);
  const offSite = requestedRegistrable !== finalRegistrable;

  // Destination flags only matter once the redirect has left the site.
  let destinationFlags: string[] = [];
  if (offSite) {
    destinationFlags = normalizeUrl(final.href)?.flags ?? [];
  }
  const destinationSuspicious = destinationFlags.some((flag) => SUSPICIOUS_HOST_FLAGS.includes(flag));
  return { offSite, destinationSuspicious, requestedRegistrable, finalRegistrable, destinationFlags };
}
1236
+
1237
/**
 * Login/account/verify path served from a host that legitimate brands never
 * use for credentials. Needs no rendering: it fires on the URL's flags alone,
 * before any form loads. Requires `suspicious_path_terms` plus at least one
 * flag from SUSPICIOUS_HOST_FLAGS.
 */
function isCredentialPathOnSuspiciousHost(url: ExtractedUrl): boolean {
  if (!url.flags.includes("suspicious_path_terms")) return false;
  for (const flag of url.flags) {
    if (SUSPICIOUS_HOST_FLAGS.includes(flag)) return true;
  }
  return false;
}
1242
+
1243
/**
 * Detect a generated landing URL. All three conditions must hold:
 *   - the first host label looks generated (6–10 lowercase letters, or 8–18
 *     lowercase alphanumerics);
 *   - the path contains a UUID segment;
 *   - the host ends in one of a few cheap TLDs or contains a gambling keyword.
 *
 * `url.normalized` must be a parseable absolute URL; `new URL` throws
 * otherwise.
 */
function isGeneratedSuspiciousLandingUrl(url: ExtractedUrl): boolean {
  const parsed = new URL(url.normalized);
  const host = parsed.hostname.toLowerCase();
  const path = parsed.pathname.toLowerCase();
  const firstLabel = host.split(".")[0] ?? "";

  const looksGenerated = [/^[a-z]{6,10}$/, /^[a-z0-9]{8,18}$/].some((shape) => shape.test(firstLabel));
  if (!looksGenerated) return false;

  const hasUuidSegment = /\/[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}(?:\/|$)/i.test(path);
  if (!hasUuidSegment) return false;

  if (/\.(?:casino|sbs|xyz|top|click|app|co)$/.test(host)) return true;
  return /(?:bet|casino|poker|winx|winsport|perfectgame|parspoker|venusbet)/i.test(host);
}
1253
+
1254
/** True when the buffer starts with the ELF magic bytes 0x7F 'E' 'L' 'F'. */
function hasElfMagic(bytes: Uint8Array): boolean {
  const magic = [0x7f, 0x45, 0x4c, 0x46];
  if (bytes.length < magic.length) return false;
  return magic.every((expected, position) => bytes[position] === expected);
}
1257
+
1258
/**
 * True when a Content-Type is present and its media type does not look like an
 * executable or generic binary one (no "elf", "executable" or "octet-stream").
 * Parameters after `;` are ignored; a missing or empty type yields false.
 */
function declaredNonExecutableBinary(contentType?: string | null): boolean {
  const mediaType = (contentType ?? "").toLowerCase().split(";")[0].trim();
  if (mediaType === "") return false;
  const executableLike = /(?:elf|executable|x-executable|x-pie-executable|octet-stream)/;
  return !executableLike.test(mediaType);
}
1262
+
1263
/**
 * True when the text contains a string typical of IoT botnet binaries: shell
 * and system paths, firewall/busybox tooling, router config commands, or
 * family markers such as "[cnc]" and "Mozi." (case-insensitive).
 */
function likelyBinaryStrings(text: string): boolean {
  const markers = [
    /\/bin\/sh/i,
    /\/dev\/shm/i,
    /\/proc\/net\/route/i,
    /iptables/i,
    /busybox/i,
    /cfgtool/i,
    /sendcmd/i,
    /\[cnc\]/i,
    /1:q9:find_node/i,
    /Mozi\./i
  ];
  return markers.some((marker) => marker.test(text));
}
1266
+
1267
/**
 * Report whether an ELF image declares a stack that is both writable and
 * executable, i.e. it has a PT_GNU_STACK program header whose flags include
 * PF_W and PF_X.
 *
 * Every read is bounds-checked first, so a truncated or malformed file yields
 * false instead of a RangeError from DataView. The full ELF header is required
 * (52 bytes for ELF32, 64 for ELF64), and each program header must extend far
 * enough to hold `p_flags`, which sits at offset 24 in ELF32 and at offset 4
 * in ELF64.
 *
 * @param bytes Raw file contents (may be a view into a larger buffer).
 * @returns true only when a writable+executable PT_GNU_STACK entry is found.
 */
function elfHasWritableExecutableStack(bytes: Uint8Array): boolean {
  if (!hasElfMagic(bytes)) return false;
  // e_ident[EI_CLASS]: 2 means ELF64; anything else is read as ELF32.
  const is64Bit = bytes[4] === 2;
  const headerSize = is64Bit ? 64 : 52;
  if (bytes.length < headerSize) return false;

  const dataView = new DataView(bytes.buffer, bytes.byteOffset, bytes.byteLength);
  // e_ident[EI_DATA]: 2 means big-endian; anything else is read little-endian.
  const littleEndian = bytes[5] !== 2;
  const programHeaderOffset = is64Bit
    ? Number(dataView.getBigUint64(32, littleEndian))
    : dataView.getUint32(28, littleEndian);
  const programHeaderEntrySize = dataView.getUint16(is64Bit ? 54 : 42, littleEndian);
  const programHeaderCount = dataView.getUint16(is64Bit ? 56 : 44, littleEndian);
  if (!programHeaderOffset || !programHeaderEntrySize || !programHeaderCount) return false;

  const PT_GNU_STACK = 0x6474e551;
  const PF_X = 0x1;
  const PF_W = 0x2;
  const flagsFieldOffset = is64Bit ? 4 : 24;
  // p_type is the first 4 bytes; p_flags is the last field read here.
  const bytesReadPerEntry = flagsFieldOffset + 4;
  for (let index = 0; index < programHeaderCount; index += 1) {
    const offset = programHeaderOffset + index * programHeaderEntrySize;
    if (offset + bytesReadPerEntry > bytes.length) return false;
    const type = dataView.getUint32(offset, littleEndian);
    if (type !== PT_GNU_STACK) continue;
    const flags = dataView.getUint32(offset + flagsFieldOffset, littleEndian);
    if ((flags & PF_X) !== 0 && (flags & PF_W) !== 0) return true;
  }
  return false;
}
1292
+
1293
/**
 * Tests `text` against a fixed set of technology patterns and calls
 * `addRuleFinding` once for every pattern family that matches.
 *
 * Families checked, in order: jQuery 1.x, AngularJS, Bootstrap, lodash
 * 4.17.0–4.17.20, Drupal markers, and phpMyAdmin markers. The checks are
 * independent, so one call may report several findings.
 *
 * @param state Scanner state passed through to `addRuleFinding`.
 * @param text Text to inspect.
 * @param locationValue Location passed through to `addRuleFinding`.
 */
function scanTechnologyFingerprint(state: ScannerState, text: string, locationValue: string): void {
  if (/\bjquery[-.]1\.\d+(?:\.\d+)?(?:\.min)?\.js\b|jQuery v1\./i.test(text)) {
    addRuleFinding(state, htmlTechnologyRules.legacy_jquery_reference, locationValue, {});
  }
  if (/\bangular(?:\.min)?\.js\b|angularjs|AngularJS v1\.|angular\.version/i.test(text)) {
    addRuleFinding(state, htmlTechnologyRules.legacy_angularjs_reference, locationValue, {});
  }
  if (/\bbootstrap(?:\.min)?\.js\b|bootstrap[-.]3\.\d+(?:\.\d+)?(?:\.min)?\.js\b|Bootstrap v3\./i.test(text)) {
    addRuleFinding(state, htmlTechnologyRules.legacy_bootstrap_reference, locationValue, {});
  }
  // The trailing (?!\d) stops the single-digit patch alternative from matching
  // the first digit of a later release such as "lodash v4.17.21".
  if (/\blodash[-.]4\.17\.(?:[0-9]|1[0-9]|20)(?:\.min)?\.js\b|lodash v4\.17\.(?:[0-9]|1[0-9]|20)(?!\d)/i.test(text)) {
    addRuleFinding(state, htmlTechnologyRules.legacy_lodash_reference, locationValue, {});
  }
  if (/(?:sites\/default\/files|drupal-settings-json|Drupal\.settings|\/core\/misc\/drupal\.js)/i.test(text)) {
    addRuleFinding(state, htmlTechnologyRules.drupal_surface_reference, locationValue, {});
  }
  // Word boundaries apply only to the identifier alternatives. Around a "/"
  // they would require a word character on the outside, which excludes
  // root-relative paths such as "/pma/".
  if (/\b(?:phpMyAdmin|pma_navigation)\b|\/(?:phpmyadmin|pma)\//i.test(text)) {
    addRuleFinding(state, htmlTechnologyRules.phpmyadmin_surface_reference, locationValue, {});
  }
}
1313
+
1314
/**
 * Picks the URL to treat as the page address for the current scan.
 *
 * Preference order: `finalUrl`, then `url`, then `originUrl`. Only `null` and
 * `undefined` are skipped, so an empty string is returned as-is.
 *
 * @param state Scanner state whose `source` record is consulted.
 * @returns The first defined candidate, or `undefined` when none is set.
 */
function pageUrl(state: ScannerState): string | undefined {
  const { finalUrl, url, originUrl } = state.source;
  return finalUrl ?? url ?? originUrl;
}
1317
+
1318
/**
 * Derives the lowercased hostname of the page URL for the current scan.
 *
 * @param state Scanner state handed to `pageUrl`.
 * @returns The hostname, or `null` when no page URL is available or it
 *   cannot be parsed by the `URL` constructor.
 */
function pageHost(state: ScannerState): string | null {
  const candidate = pageUrl(state);
  if (candidate) {
    try {
      const { hostname } = new URL(candidate);
      return hostname.toLowerCase();
    } catch {
      // Unparseable URL: fall through to the "no host" result.
    }
  }
  return null;
}
1327
+
1328
/**
 * Decodes bytes as UTF-8 in non-fatal mode, so malformed sequences become
 * U+FFFD replacement characters rather than raising an error.
 *
 * @param bytes Bytes to decode.
 * @returns The decoded string.
 */
function decodeText(bytes: Uint8Array): string {
  const lenientUtf8 = new TextDecoder("utf-8", { fatal: false });
  return lenientUtf8.decode(bytes);
}
1331
+
1332
/**
 * Keeps at most the last `max` UTF-16 code units of a string.
 *
 * @param value String to trim.
 * @param max Maximum length to keep.
 * @returns `value` unchanged when it already fits; otherwise its tail
 *   starting at index `value.length - max`.
 */
function trimWindow(value: string, max: number): string {
  if (value.length <= max) {
    return value;
  }
  const start = value.length - max;
  return value.slice(start);
}
1335
+
1336
/**
 * Advances the line/column cursor in `state` past `text` and refreshes the
 * `lines_seen` and `bytes_seen` counters.
 *
 * Each "\n" adds one to `line` and resets `column` to 1; every other code
 * point adds one to `column`. `bytes_seen` is copied from
 * `state.absoluteOffset`, which this function does not modify.
 *
 * @param state Scanner state to mutate.
 * @param text Text that was just consumed.
 */
function updatePosition(state: ScannerState, text: string): void {
  const segments = text.split("\n");
  const newlineCount = segments.length - 1;
  const tail = segments[newlineCount] ?? "";
  if (newlineCount > 0) {
    state.line += newlineCount;
    state.column = 1;
  }
  // Spread iterates by code point, so a surrogate pair counts as one column.
  state.column += [...tail].length;
  state.counters.lines_seen = state.line;
  state.counters.bytes_seen = state.absoluteOffset;
}
1348
+
1349
/**
 * Measures how many bytes a string occupies once encoded with `TextEncoder`
 * (UTF-8).
 *
 * @param text String to measure.
 * @returns The encoded size in bytes.
 */
function byteLength(text: string): number {
  const encoded = new TextEncoder().encode(text);
  return encoded.byteLength;
}
1352
+
1353
/**
 * Estimates whether text is predominantly printable.
 *
 * Inspects the first 4096 UTF-16 code units, split into code points. A code
 * point counts as printable when it is a newline, carriage return, tab, or
 * anything at or above the space character other than U+FFFD.
 *
 * The ratio is computed over code points on both sides. Dividing by the
 * UTF-16 length would score each astral character (e.g. an emoji) as half
 * printable.
 *
 * @param text Text to sample.
 * @returns `true` when at least 85% of the sampled code points are printable;
 *   `false` for empty input.
 */
function isMostlyPrintable(text: string): boolean {
  if (!text) return false;
  const sample = [...text.slice(0, 4096)];
  const printable = sample.filter(
    (char) => char === "\n" || char === "\r" || char === "\t" || (char >= " " && char !== "\uFFFD"),
  ).length;
  return printable / sample.length >= 0.85;
}
1359
+
1360
/**
 * Adds one to the named counter in `state.counters`, treating a missing
 * counter as zero.
 *
 * @param state Scanner state holding the counters.
 * @param key Counter name.
 */
function increment(state: ScannerState, key: string): void {
  const { counters } = state;
  const previous = counters[key] ?? 0;
  counters[key] = previous + 1;
}
1363
+
1364
/**
 * Tells whether the named counter in `state.counters` holds a value greater
 * than zero; a missing counter is treated as zero.
 *
 * @param state Scanner state holding the counters.
 * @param key Counter name.
 * @returns `true` when the counter is above zero.
 */
function incremented(state: ScannerState, key: string): boolean {
  const current = state.counters[key] ?? 0;
  return current > 0;
}