@q32/signal-scanner 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/dynamic.ts ADDED
@@ -0,0 +1,273 @@
1
+ // Dynamic JavaScript behavior analysis.
2
+ //
3
+ // Runs a page's inline scripts against an instrumented window/document where the
4
+ // dangerous primitives (eval/Function, fetch/XHR/sendBeacon/WebSocket, location,
5
+ // document.write/innerHTML, form.action, atob, cookies) are NEUTERED and
6
+ // RECORDED rather than executed. What the script *tries* to do is the signal:
7
+ // JS redirects (decloaking), exfil endpoints, injected credential forms,
8
+ // decoded payloads. Recorded markup and eval'd code are re-fed through the
9
+ // static scanner so existing rules light up on runtime-produced content.
10
+ //
11
+ // This module is isolate-agnostic on purpose. It exposes:
12
+ // - RECORDER_SOURCE: the recorder as self-contained source, for a caller to
13
+ // run inside whatever isolate it has (a CF Dynamic Worker, node isolated-vm,
14
+ // a node:vm context, ...). The lib knows nothing about those mechanisms.
15
+ // - runInstrumented(): an in-process default (compiles RECORDER_SOURCE here)
16
+ // for when no isolate boundary is needed.
17
+ // - analyzeDynamicWith(html, opts, evaluate): the generic seam — the caller
18
+ // passes an `evaluate` that produces a BehaviorReport however it likes.
19
+
20
+ import { assessRedirect, createScanner, isAdOrAnalyticsHost, registrableDomainFor, type Finding, type Severity, type Confidence } from "./index";
21
+ import type { RuleScoreModel } from "./rules/types";
22
+
23
/** One outbound request a script attempted; the recorder stubs log it instead of sending it. */
export interface NetworkAttempt {
  /** Which instrumented primitive or element property produced the attempt. */
  kind: "fetch" | "xhr" | "beacon" | "websocket" | "script" | "image" | "form";
  /** Target as recorded; the recorder resolves it against the page URL when one was supplied. */
  url: string;
}
27
+
28
/** Everything the instrumented run observed the page's scripts trying to do. */
export interface BehaviorReport {
  /** Navigation targets (location.assign / location.replace / location.href writes). */
  redirects: string[];
  /** Request attempts captured by the network and element stubs. */
  network: NetworkAttempt[];
  /** Markup passed to document.write/writeln, innerHTML, outerHTML or insertAdjacentHTML. */
  writes: string[];
  /** Code strings handed to eval, Function, or string-form setTimeout/setInterval. */
  evals: string[];
  /** Results of atob calls (the raw input when decoding threw). */
  decoded: string[];
  /** Values assigned to document.cookie. */
  cookies: string[];
  /** "script skipped" markers and messages of errors thrown by scripts or by the recorder. */
  errors: string[];
}
37
+
38
/** Options shared by the in-process and caller-supplied evaluators. */
export interface DynamicAnalysisOptions {
  /** Base/page URL, used to resolve relative targets and classify off-origin. */
  url?: string;
}
42
+
43
/**
 * The caller supplies one of these: "run these scripts in an isolate, give me what they attempted."
 * May return the report synchronously or as a promise; analyzeDynamicWith awaits the result.
 */
export type IsolatedEvaluator = (scripts: string[], options: DynamicAnalysisOptions) => BehaviorReport | Promise<BehaviorReport>;
45
+
46
// Report with nothing recorded. This is a module-level singleton: its arrays are
// shared by reference wherever the object is returned or spread, so treat it as
// read-only.
const EMPTY_REPORT: BehaviorReport = { redirects: [], network: [], writes: [], evals: [], decoded: [], cookies: [], errors: [] };
47
+
48
// Self-contained recorder. Evaluating this source defines `recordBehavior(scripts, url)`
// which returns a BehaviorReport. It references only standard globals (URL, atob,
// btoa, Proxy) available in any JS isolate — no imports, no transport, no
// assumptions about how it is hosted. Inline scripts run via `new Function` with
// the dangerous globals shadowed by recorder stubs (sloppy mode so `eval` /
// `Function` can be shadowed as parameters).
//
// Three details matter for fidelity and containment:
//  - Each script is invoked with `this` bound to the shim window. Without that,
//    sloppy-mode `this` is the HOST global, so `this.location...` is missed and
//    `(function (root) { root.x = 1 })(this)` writes into the host.
//  - `window.location = url` / `document.location = url` are the most common
//    redirect spellings; `location` is an accessor on both shims so plain
//    assignment is recorded instead of silently replacing the proxy.
//  - `top` is shadowed (frame-busting `top.location.href = ...`). `parent` is
//    only exposed as `window.parent`, because shadowing it as a parameter would
//    turn a page-level `const parent = ...` into a SyntaxError.
//
// NOTE(review): shadowing is a recording aid, not a security boundary — a script
// can still reach the real Function constructor via `(function(){}).constructor`.
// Run this source inside a real isolate when the input is hostile.
export const RECORDER_SOURCE = String.raw`
function recordBehavior(scripts, url) {
  var report = { redirects: [], network: [], writes: [], evals: [], decoded: [], cookies: [], errors: [] };
  var noop = function () {};
  var resolve = function (v) {
    var raw = String(v == null ? "" : v);
    try { return url ? new URL(raw, url).toString() : raw; } catch (e) { return raw; }
  };
  var recordRedirect = function (u) { report.redirects.push(resolve(u)); };
  var recordEval = function (c) { report.evals.push(String(c)); return undefined; };
  var FunctionStub = function () {
    var a = arguments;
    report.evals.push(String(a[a.length - 1] == null ? "" : a[a.length - 1]));
    return function () {};
  };
  var locationProxy = new Proxy(
    { href: url || "", assign: recordRedirect, replace: recordRedirect, reload: noop, toString: function () { return url || ""; } },
    { set: function (t, p, val) { if (p === "href") recordRedirect(val); t[p] = val; return true; } }
  );
  // Reading .location yields the proxy; assigning a URL to it is a navigation.
  var locationAccessor = { get: function () { return locationProxy; }, set: recordRedirect, enumerable: true, configurable: true };
  var makeElement = function (tag) {
    var el = { tagName: String(tag).toUpperCase(), style: {}, children: [], attributes: {}, dataset: {}, classList: { add: noop, remove: noop, toggle: noop, contains: function () { return false; } } };
    var s = "", a = "";
    Object.defineProperty(el, "src", { get: function () { return s; }, set: function (v) { s = String(v); report.network.push({ kind: el.tagName === "IMG" ? "image" : "script", url: resolve(v) }); } });
    Object.defineProperty(el, "action", { get: function () { return a; }, set: function (v) { a = String(v); report.network.push({ kind: "form", url: resolve(v) }); } });
    Object.defineProperty(el, "innerHTML", { get: function () { return ""; }, set: function (v) { report.writes.push(String(v)); } });
    Object.defineProperty(el, "outerHTML", { get: function () { return ""; }, set: function (v) { report.writes.push(String(v)); } });
    el.insertAdjacentHTML = function (pos, html) { report.writes.push(String(html)); };
    el.setAttribute = function (k, v) { if (k === "src") el.src = v; else if (k === "action") el.action = v; else el.attributes[k] = v; };
    el.getAttribute = function (k) { return el.attributes[k] != null ? el.attributes[k] : null; };
    el.appendChild = function (c) { return c; }; el.removeChild = function (c) { return c; }; el.remove = noop; el.addEventListener = noop; el.removeEventListener = noop;
    // DOM queries on an element return instrumented elements too, so a chained
    // injection (container.querySelector('.x').innerHTML = ...) still records.
    el.querySelector = function () { return makeElement("div"); };
    el.querySelectorAll = function () { return [makeElement("div")]; };
    el.getElementsByTagName = function (t) { return [makeElement(t)]; };
    el.getElementsByClassName = function () { return [makeElement("div")]; };
    return el;
  };
  // getElementById/querySelector return INSTRUMENTED elements (not null), so the
  // most common injection pattern — getElementById('x').innerHTML = '<form>...' —
  // is recorded instead of throwing on null and aborting the whole script.
  var documentShim = {
    write: function () { report.writes.push(Array.prototype.map.call(arguments, String).join("")); },
    writeln: function () { report.writes.push(Array.prototype.map.call(arguments, String).join("")); },
    createElement: function (t) { return makeElement(String(t)); },
    getElementById: function () { return makeElement("div"); },
    getElementsByTagName: function (t) { return [makeElement(String(t))]; },
    getElementsByClassName: function () { return [makeElement("div")]; },
    querySelector: function () { return makeElement("div"); },
    querySelectorAll: function () { return [makeElement("div")]; },
    addEventListener: noop,
    body: makeElement("body"), head: makeElement("head"), documentElement: makeElement("html")
  };
  Object.defineProperty(documentShim, "location", locationAccessor);
  Object.defineProperty(documentShim, "cookie", { get: function () { return ""; }, set: function (v) { report.cookies.push(String(v)); } });
  var safeAtob = function (v) { var out; try { out = atob(String(v)); } catch (e) { out = String(v); } report.decoded.push(out); return out; };
  var safeBtoa = function (v) { try { return btoa(String(v)); } catch (e) { return String(v); } };
  var win = {
    document: documentShim,
    navigator: { userAgent: "Mozilla/5.0", platform: "Win32", language: "en-US", sendBeacon: function (u) { report.network.push({ kind: "beacon", url: resolve(u) }); return true; } },
    screen: { width: 1920, height: 1080 },
    localStorage: { getItem: function () { return null; }, setItem: function () {}, removeItem: function () {} },
    sessionStorage: { getItem: function () { return null; }, setItem: function () {}, removeItem: function () {} },
    atob: safeAtob, btoa: safeBtoa,
    fetch: function (u) { report.network.push({ kind: "fetch", url: resolve(u) }); return Promise.resolve({ ok: true, status: 200, json: function () { return Promise.resolve({}); }, text: function () { return Promise.resolve(""); } }); },
    XMLHttpRequest: function () { return { open: function (m, u) { report.network.push({ kind: "xhr", url: resolve(u) }); }, send: function () {}, setRequestHeader: function () {}, addEventListener: function () {} }; },
    WebSocket: function (u) { report.network.push({ kind: "websocket", url: resolve(u) }); return { send: function () {}, close: function () {} }; },
    eval: recordEval, Function: FunctionStub,
    setTimeout: function (fn) { if (typeof fn === "string") report.evals.push(fn); return 0; },
    setInterval: function (fn) { if (typeof fn === "string") report.evals.push(fn); return 0; },
    addEventListener: function () {}, console: { log: function () {}, warn: function () {}, error: function () {} }
  };
  Object.defineProperty(win, "location", locationAccessor);
  win.window = win; win.self = win; win.globalThis = win; win.top = win; win.parent = win;
  var params = { window: win, self: win, globalThis: win, top: win, document: documentShim, location: locationProxy, navigator: win.navigator, fetch: win.fetch, XMLHttpRequest: win.XMLHttpRequest, WebSocket: win.WebSocket, eval: recordEval, Function: FunctionStub, atob: safeAtob, btoa: safeBtoa, setTimeout: win.setTimeout, setInterval: win.setInterval, localStorage: win.localStorage, console: win.console };
  var keys = Object.keys(params), vals = keys.map(function (k) { return params[k]; });
  var list = Array.isArray(scripts) ? scripts.slice(0, 64) : [];
  for (var i = 0; i < list.length; i++) {
    var script = list[i];
    if (typeof script !== "string" || script.length > 262144) { report.errors.push("script skipped"); continue; }
    // Bind this to the shim window so top-level this never reaches the host global.
    try { Function.apply(null, keys.concat([script])).apply(win, vals); }
    catch (e) { report.errors.push(e && e.message ? e.message : "script error"); }
  }
  return report;
}
`;
116
+
117
// Memoized result of compiling RECORDER_SOURCE in this realm (null until first use).
let compiledRecorder: ((scripts: string[], url?: string) => BehaviorReport) | null = null;

/** Compile RECORDER_SOURCE once, on first call, and hand back the same recorder afterwards. */
function inProcessRecorder(): (scripts: string[], url?: string) => BehaviorReport {
  if (compiledRecorder) return compiledRecorder;
  // eslint-disable-next-line no-new-func
  const factory = new Function(`${RECORDER_SOURCE}\nreturn recordBehavior;`);
  compiledRecorder = factory() as (scripts: string[], url?: string) => BehaviorReport;
  return compiledRecorder;
}
125
+
126
/**
 * Extract inline <script> bodies (no src) from HTML.
 *
 * Skips external scripts (they are fetched and scanned separately) and the
 * data-block types application/json, application/ld+json and text/template.
 * Bodies are trimmed; empty ones are dropped.
 *
 * Attribute names are matched at an attribute boundary so that `data-src` /
 * `data-type` cannot be used to make an inline script look external or
 * data-only, and the end tag tolerates `</script >` as HTML parsers do.
 */
export function extractInlineScripts(html: string): string[] {
  const scripts: string[] = [];
  for (const match of html.matchAll(/<script\b([^>]*)>([\s\S]*?)<\/script\b[^>]*>/gi)) {
    const attrs = match[1] ?? "";
    // A real `src` attribute starts the attribute list or follows whitespace, a quote or "/".
    if (/(?:^|[\s"'\/])src\s*=/i.test(attrs)) continue; // external scripts are fetched + scanned separately
    if (/(?:^|[\s"'\/])type\s*=\s*["']?(?:application\/json|application\/ld\+json|text\/template)/i.test(attrs)) continue;
    const body = match[2]?.trim();
    if (body) scripts.push(body);
  }
  return scripts;
}
138
+
139
/**
 * Source URLs of external scripts the page loads (<script src=...>). A renderer
 * fetches and runs these against the DOM, where externally-injected forms appear.
 *
 * Accepts double-quoted, single-quoted and unquoted values, ignores look-alike
 * attributes such as `data-src`, and returns each distinct value once in
 * first-seen order. Values are returned as written (not resolved against a base).
 */
export function extractScriptSources(html: string): string[] {
  const sources = new Set<string>();
  // `src` must follow whitespace, a closing quote or "/" — never "-" as in data-src.
  const pattern = /<script\b[^>]*?[\s"'\/]src\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'>]+))/gi;
  for (const match of html.matchAll(pattern)) {
    const src = (match[1] ?? match[2] ?? match[3] ?? "").trim();
    if (src) sources.add(src);
  }
  return [...sources];
}
148
+
149
/**
 * In-process default evaluator. Runs the recorder in THIS isolate (no boundary).
 * Use analyzeDynamicWith with a caller-supplied evaluator when isolation matters.
 *
 * Never throws: if compiling or running the recorder fails, the failure message
 * is returned in `errors` of an otherwise empty report. That report is built
 * fresh on every call so callers may mutate it without affecting later results.
 */
export function runInstrumented(scripts: string[], options: DynamicAnalysisOptions = {}): BehaviorReport {
  try {
    return inProcessRecorder()(scripts, options.url);
  } catch (error) {
    const message = error instanceof Error ? error.message : "recorder failed";
    return { redirects: [], network: [], writes: [], evals: [], decoded: [], cookies: [], errors: [message] };
  }
}
160
+
161
/**
 * One-call in-process pipeline: pull the inline scripts out of `html`, run them
 * through the instrumented recorder, and derive findings from what was recorded.
 */
export function analyzeDynamic(html: string, options: DynamicAnalysisOptions = {}): { report: BehaviorReport; findings: Finding[] } {
  const inlineScripts = extractInlineScripts(html);
  const report = runInstrumented(inlineScripts, options);
  const findings = behaviorFindings(report, options.url);
  return { report, findings };
}
166
+
167
/**
 * Generic seam: the caller supplies how scripts are evaluated (in whatever isolate it has).
 *
 * `evaluate` is only invoked when the page has at least one inline script;
 * otherwise a fresh empty report is returned (never a shared object, so callers
 * may mutate the result). Rejections from `evaluate` propagate to the caller.
 */
export async function analyzeDynamicWith(
  html: string,
  options: DynamicAnalysisOptions,
  evaluate: IsolatedEvaluator
): Promise<{ report: BehaviorReport; findings: Finding[] }> {
  const scripts = extractInlineScripts(html);
  const report: BehaviorReport =
    scripts.length > 0
      ? await evaluate(scripts, options)
      : { redirects: [], network: [], writes: [], evals: [], decoded: [], cookies: [], errors: [] };
  return { report, findings: behaviorFindings(report, options.url) };
}
177
+
178
// Score models attached to the runtime findings emitted by behaviorFindings.
// Request to a destination assessed as suspicious.
const EXFIL_SCORE: RuleScoreModel = { base: 72, tags: ["exfiltration", "script"] };
// Request to an unrelated but otherwise ordinary domain (low-signal note).
const OFFSITE_REQUEST_SCORE: RuleScoreModel = { base: 8, tags: ["script"] };
// Script-driven navigation to an unrelated domain.
const REDIRECT_SCORE: RuleScoreModel = { base: 45, tags: ["redirect", "script"] };
// eval/Function/string-timer use is ubiquitous in legitimate bundles — weak
// alone. The re-scan of what they produce (below) is where real convictions come from.
const EVAL_SCORE: RuleScoreModel = { base: 12, tags: ["obfuscation", "script"], maxGroup: "dynamic-code" };
184
+
185
/**
 * Turn a recorded BehaviorReport into scanner findings.
 *
 * Emits, in order: one finding per third-party network attempt (high when the
 * destination is assessed as suspicious, informational otherwise), one per
 * third-party redirect, a single summary finding when any dynamic code was
 * recorded, and finally every finding the static scanner produces when fed the
 * runtime-produced content (writes, decoded blobs, eval'd code). Ids carry a
 * running counter shared by all of these so they stay unique within one call.
 *
 * @param report  What the scripts attempted.
 * @param baseUrl Page URL; without it no destination is assessed as suspicious.
 */
export function behaviorFindings(report: BehaviorReport, baseUrl?: string): Finding[] {
  const findings: Finding[] = [];
  let sequence = 0;

  const add = (
    severity: Severity,
    confidence: Confidence,
    model: RuleScoreModel,
    ruleId: string,
    title: string,
    description: string,
    location: string,
    metadata: Record<string, unknown>
  ): void => {
    const id = `${ruleId}:${sequence}`;
    sequence += 1;
    findings.push({ id, ruleId, severity, confidence, score: model.base, scoreModel: model, title, description, locationType: "javascript", locationValue: location, metadata });
  };

  // An off-site request alone is not exfiltration (own subdomains, CDNs,
  // payment processors, analytics, APIs). Only a destination that itself looks
  // suspicious is convicted; an ordinary off-site domain is a low-signal note.
  for (const { kind, url } of report.network) {
    if (!isThirdPartyTarget(url, baseUrl)) continue;
    const verdict = baseUrl ? assessRedirect(baseUrl, url) : undefined;
    const suspicious = verdict?.destinationSuspicious ?? false;
    if (suspicious) {
      add("high", "high", EXFIL_SCORE, "runtime_offsite_exfil", `Runtime ${kind} to a suspicious off-site endpoint`, `Page JavaScript issued a ${kind} request to a suspicious unrelated host at runtime — a common credential/data exfiltration pattern.`, url, { kind });
    } else {
      add("info", "low", OFFSITE_REQUEST_SCORE, "runtime_offsite_request", `Runtime ${kind} to an unrelated domain`, `Page JavaScript issued a ${kind} request to an unrelated (but not obviously suspicious) domain at runtime.`, url, { kind });
    }
  }

  for (const destination of report.redirects) {
    if (!isThirdPartyTarget(destination, baseUrl)) continue;
    add("medium", "high", REDIRECT_SCORE, "runtime_offsite_redirect", "JavaScript navigated to an unrelated domain at runtime", "Page JavaScript set location to an unrelated domain — used to cloak content from scanners and route victims onward.", destination, {});
  }

  const evalCount = report.evals.length;
  if (evalCount > 0) {
    add("low", "medium", EVAL_SCORE, "runtime_dynamic_code", "Runtime dynamic code execution", `Page JavaScript invoked eval/Function/string-timer ${evalCount} time(s) at runtime.`, "eval", { count: evalCount });
  }

  // Feed everything the scripts produced back through the static scanner so the
  // existing rules apply to it; chunks shorter than 8 characters are ignored.
  const encoder = new TextEncoder();
  const produced = report.writes.concat(report.decoded, report.evals);
  for (const chunk of produced.filter((text) => Boolean(text) && text.length >= 8)) {
    const scanner = createScanner({ source: { url: baseUrl, contentType: "text/html" } });
    scanner.feed(encoder.encode(chunk));
    const rescanned = scanner.finish().findings;
    for (const finding of rescanned) {
      const id = `dyn.${finding.ruleId}:${sequence}`;
      sequence += 1;
      findings.push({ ...finding, id, metadata: { ...finding.metadata, via: "dynamic_analysis" } });
    }
  }
  return findings;
}
230
+
231
/**
 * URLs a page's JavaScript surfaced at runtime: redirect targets, recorded
 * request endpoints, and the URLs the static scanner extracts from markup the
 * scripts injected (document.write / innerHTML). Per the original note, the
 * crawler merges the same-origin ones into its frontier so JS-revealed pages
 * are followed rather than treated as a dead end.
 *
 * Returns distinct values in first-seen order; injected chunks shorter than
 * 4 characters are not scanned.
 */
export function discoveredUrlsFromBehavior(report: BehaviorReport, baseUrl?: string): string[] {
  const seen = new Set<string>();

  report.redirects.forEach((destination) => {
    if (destination) seen.add(destination);
  });
  report.network.forEach(({ url }) => {
    if (url) seen.add(url);
  });

  const encoder = new TextEncoder();
  for (const markup of report.writes) {
    if (!markup || markup.length < 4) continue;
    const scanner = createScanner({ source: { url: baseUrl, contentType: "text/html" } });
    scanner.feed(encoder.encode(markup));
    for (const found of scanner.finish().urls) seen.add(found.normalized);
  }
  return Array.from(seen);
}
248
+
249
/**
 * A runtime target counts only if it's a different registrable domain than the
 * page AND not a known ad/analytics/CDN host — so a site's own subdomains and
 * mainstream third parties (gstatic, analytics) don't read as exfil.
 *
 * Returns false for targets that cannot be resolved or are not http(s).
 * When there is no usable page URL (absent, empty or unparsable) an absolute
 * http(s) target is treated as third-party, since same-site cannot be shown.
 * The base is parsed separately because `new URL(url, base)` throws on an
 * invalid base even when `url` is absolute, which would otherwise classify
 * every target as first-party.
 */
function isThirdPartyTarget(url: string, baseUrl?: string): boolean {
  let base: URL | undefined;
  if (baseUrl) {
    try {
      base = new URL(baseUrl);
    } catch {
      base = undefined;
    }
  }

  let resolved: URL;
  try {
    resolved = new URL(url, base);
  } catch {
    return false; // relative target with nothing to resolve against, or malformed
  }
  if (resolved.protocol !== "http:" && resolved.protocol !== "https:") return false;
  if (isAdOrAnalyticsHost(resolved.toString())) return false;
  if (!base) return true;

  try {
    const targetHost = resolved.hostname.toLowerCase();
    const baseHost = base.hostname.toLowerCase();
    const targetReg = registrableDomainFor(targetHost) ?? targetHost;
    const baseReg = registrableDomainFor(baseHost) ?? baseHost;
    return targetReg !== baseReg;
  } catch {
    return true;
  }
}
package/src/feeds.ts ADDED
@@ -0,0 +1,334 @@
1
+ // Cached blocklist-feed index for the signal scanner.
2
+ //
3
+ // Runtime-agnostic: all persistence goes through an injected `IntelStorage`
4
+ // (R2 in a Worker, the filesystem in a CLI, an in-memory map in tests). The
5
+ // scanner never knows where bytes live.
6
+ //
7
+ // Feeds can be millions of entries, far too large to hold in a Worker, so the
8
+ // index is sharded by a stable host bucket (`shardOf`) into small files. The
9
+ // match path fetches only the few shards a scan actually needs.
10
+ //
11
+ // Evidence strength is a numeric score (the lib's native currency), decided at
12
+ // ingest and encoded by which score-band shard family a host lands in — so
13
+ // shard files stay compact host arrays. Recent/active/evidence-backed entries
14
+ // get a high score; aged/weak entries a lower one. Matching returns the highest
15
+ // band a host appears in; the caller turns that score into a finding severity
16
+ // via the usual scoring helpers.
17
+
18
/** Injected persistence seam; every read and write in this module goes through it. */
export interface IntelStorage {
  /** Bytes stored at `key`, or null when nothing is stored (readers treat null as "missing"). */
  get(key: string): Promise<Uint8Array | null>;
  /** Store `value` at `key`. */
  put(key: string, value: Uint8Array): Promise<void>;
  /** Keys under `prefix` — presumably a starts-with match; confirm per backend. */
  list(prefix: string): Promise<string[]>;
  /** Optional. When a backend omits it, sweeping staging/old versions is skipped. */
  delete?(key: string): Promise<void>;
}
24
+
25
/** One parsed blocklist entry. */
export interface FeedEntry {
  /** Host name or IP literal as parsed from the feed. */
  host: string;
  /** Evidence score; its string form names the score band the host is sharded under. */
  score: number;
}
29
+
30
/** Optional descriptive metadata recorded with a published feed version. */
export interface FeedMeta {
  /** Free-form origin label; copied onto every match from this feed. */
  source?: string;
  /** Build timestamp; finalizeFeed defaults it to the current time as an ISO string. */
  generatedAt?: string;
}
34
+
35
/** Manifest entry describing the currently published version of one feed. */
export interface FeedRecord extends FeedMeta {
  /** Version identifier; it is a path segment of every shard key for this feed. */
  version: string;
  /** Distinct score bands present in this version, and the host count in each. */
  bands: Record<string, number>;
}
40
+
41
/** Top-level manifest: the published record for every feed, keyed by feed id. */
export interface GlobalFeedManifest {
  feeds: Record<string, FeedRecord>;
}
44
+
45
/** A candidate host found in a published feed. */
export interface CachedFeedMatch {
  /** Feed the host was found in. */
  feedId: string;
  /** The matched host, lower-cased. */
  host: string;
  /** The highest score band of that feed containing the host. */
  score: number;
  /** `source` from the feed's record, when one was recorded. */
  source?: string;
}
51
+
52
/** Default score bands. Callers may use any integer band; these are conventions. */
export const FEED_SCORE_ACTIVE = 90; // live / recent / evidence-backed
export const FEED_SCORE_AGED = 55; // historical / weak / unverified

// Every storage key written by this module lives under this root segment.
const ROOT = "feeds";
// Key of the global manifest listing the published version of each feed.
const GLOBAL_MANIFEST_KEY = `${ROOT}/manifest.json`;

// Shared codecs for the JSON <-> bytes round trip done by putJson / readJson.
const encoder = new TextEncoder();
const decoder = new TextDecoder();
61
+
62
/** Every possible shard prefix, "00" through "ff", in ascending order (a staged build finalizes one per job). */
export const SHARD_PREFIXES: string[] = [...Array(256).keys()].map((byte) => byte.toString(16).padStart(2, "0"));
64
+
65
/**
 * Map a host to its two-hex-digit shard bucket.
 *
 * The host is lower-cased, hashed with 32-bit FNV-1a over its UTF-16 code
 * units, and the low byte of the hash is rendered as hex. Loader and matcher
 * must both use this function so they agree on where a host lives.
 */
export function shardOf(host: string): string {
  const FNV_OFFSET_BASIS = 0x811c9dc5;
  const FNV_PRIME = 0x01000193;
  let state = FNV_OFFSET_BASIS;
  for (const unit of host.toLowerCase().split("")) {
    state = Math.imul(state ^ unit.charCodeAt(0), FNV_PRIME);
  }
  const lowByte = state & 0xff;
  return lowByte.toString(16).padStart(2, "0");
}
75
+
76
/**
 * Pick a score for a feed entry from its evidence.
 *
 * An entry gets `activeScore` (default FEED_SCORE_ACTIVE) when it is not
 * explicitly inactive AND is recent; otherwise `agedScore` (default
 * FEED_SCORE_AGED). "Recent" means `addedAt` is at most `recencyDays`
 * (default 90) before `now` (default: the current time). A missing `addedAt`
 * counts as recent; an unparsable one does not.
 */
export function scoreFor(input: {
  active?: boolean;
  addedAt?: string | null;
  recencyDays?: number;
  now?: number;
  activeScore?: number;
  agedScore?: number;
}): number {
  const windowMs = (input.recencyDays ?? 90) * 86_400_000;
  const now = input.now ?? Date.now();
  const isRecent = input.addedAt ? now - Date.parse(input.addedAt) <= windowMs : true;
  if (input.active !== false && isRecent) {
    return input.activeScore ?? FEED_SCORE_ACTIVE;
  }
  return input.agedScore ?? FEED_SCORE_AGED;
}
90
+
91
+ // ---- Small feeds: full rebuild in one pass --------------------------------
92
+
93
/**
 * Rebuild a feed's whole shard index from a complete in-memory entry set, then
 * publish it.
 *
 * Entries are de-duplicated (strongest score per host), grouped by score band
 * and shard prefix, and written one shard file at a time. The per-band host
 * counts collected on the way are handed to finalizeFeed, whose record is
 * returned.
 */
export async function rebuildFeed(
  storage: IntelStorage,
  feedId: string,
  version: string,
  entries: FeedEntry[],
  meta: FeedMeta = {}
): Promise<FeedRecord> {
  const bands: Record<string, number> = {};
  const grouped = bucketEntries(dedupeEntries(entries));
  for (const [band, shards] of grouped.entries()) {
    for (const [prefix, hosts] of shards.entries()) {
      const key = shardKey(feedId, version, band, prefix);
      await putJson(storage, key, Array.from(hosts));
      bands[band] = hosts.size + (bands[band] ?? 0);
    }
  }
  return finalizeFeed(storage, feedId, version, bands, meta);
}
111
+
112
+ // ---- Huge feeds: staged chunk + per-shard merge ---------------------------
113
+
114
/**
 * Stage one download chunk: group its entries by score band and shard prefix
 * and write each group to that shard's staging area under `chunkId`.
 * Each chunk writes only keys containing its own `chunkId`, which is what
 * makes it safe to run many chunks in parallel (given distinct ids).
 */
export async function writeFeedChunk(
  storage: IntelStorage,
  feedId: string,
  version: string,
  chunkId: string,
  entries: FeedEntry[]
): Promise<void> {
  const grouped = bucketEntries(entries);
  for (const [band, shards] of grouped.entries()) {
    for (const [prefix, hosts] of shards.entries()) {
      const key = stagingKey(feedId, version, chunkId, band, prefix);
      await putJson(storage, key, Array.from(hosts));
    }
  }
}
129
+
130
/**
 * Merge every staged chunk for one (band, prefix) pair into the final shard
 * file. Intended to run as one job per shard.
 *
 * @returns The number of distinct hosts in the shard; 0 means nothing was
 *          staged and no shard file was written.
 */
export async function finalizeFeedShard(
  storage: IntelStorage,
  feedId: string,
  version: string,
  band: number,
  prefix: string
): Promise<number> {
  // Staging keys are laid out band/prefix/chunk, so this prefix lists only the
  // chunks of this one shard instead of walking the whole staging tree.
  const stagingPrefix = `${ROOT}/${feedId}/${version}/staging/${band}/${prefix}/`;
  const chunkKeys = await storage.list(stagingPrefix);

  const merged = new Set<string>();
  for (const chunkKey of chunkKeys) {
    const hosts = await readArray(storage, chunkKey);
    hosts.forEach((host) => merged.add(host));
  }

  if (merged.size === 0) return 0;
  await putJson(storage, shardKey(feedId, version, String(band), prefix), Array.from(merged));
  return merged.size;
}
148
+
149
/**
 * Publish a feed version.
 *
 * Steps, in order: write the version's own manifest; point the global manifest
 * at it (read-modify-write of the shared manifest key); delete this version's
 * staging files; and, if a different version was published before, delete
 * everything under that old version. Deletions are skipped on storage without
 * `delete`.
 *
 * @returns The record that was published.
 */
export async function finalizeFeed(
  storage: IntelStorage,
  feedId: string,
  version: string,
  bands: Record<string, number>,
  meta: FeedMeta = {}
): Promise<FeedRecord> {
  const feedRoot = `${ROOT}/${feedId}`;
  const generatedAt = meta.generatedAt ?? new Date().toISOString();
  const record: FeedRecord = { version, bands, source: meta.source, generatedAt };
  await putJson(storage, `${feedRoot}/${version}/manifest.json`, record);

  const stored = await readJson<GlobalFeedManifest>(storage, GLOBAL_MANIFEST_KEY);
  const manifest = stored ?? { feeds: {} };
  const previous = manifest.feeds[feedId]?.version;
  manifest.feeds[feedId] = record;
  await putJson(storage, GLOBAL_MANIFEST_KEY, manifest);

  await sweep(storage, `${feedRoot}/${version}/staging/`);
  const superseded = Boolean(previous) && previous !== version;
  if (superseded) await sweep(storage, `${feedRoot}/${previous}/`);
  return record;
}
174
+
175
+ // ---- Match path -----------------------------------------------------------
176
+
177
/**
 * Look candidate hosts up in every published feed.
 *
 * Hosts are lower-cased and de-duplicated first. For each feed and host, the
 * feed's score bands are probed from highest to lowest and the first band whose
 * shard contains the host produces the match, so at most one match per
 * (feed, host) is returned. Shards are read lazily and cached for the duration
 * of the call. Returns an empty list when no global manifest is readable.
 */
export async function matchCachedFeeds(storage: IntelStorage, hosts: string[]): Promise<CachedFeedMatch[]> {
  const manifest = await readJson<GlobalFeedManifest>(storage, GLOBAL_MANIFEST_KEY);
  if (!manifest) return [];

  const candidates = new Set<string>();
  for (const raw of hosts) {
    const lowered = raw.toLowerCase();
    if (lowered) candidates.add(lowered);
  }

  const shardCache = new Map<string, Set<string>>();
  const loadShard = async (key: string): Promise<Set<string>> => {
    const cached = shardCache.get(key);
    if (cached) return cached;
    const loaded = new Set(await readArray(storage, key));
    shardCache.set(key, loaded);
    return loaded;
  };

  const matches: CachedFeedMatch[] = [];
  for (const [feedId, record] of Object.entries(manifest.feeds)) {
    // Highest score first, so the first hit is the strongest band.
    const descendingBands = Object.keys(record.bands)
      .map(Number)
      .sort((a, b) => b - a);
    for (const host of candidates) {
      const prefix = shardOf(host);
      for (const band of descendingBands) {
        const shard = await loadShard(shardKey(feedId, record.version, String(band), prefix));
        if (!shard.has(host)) continue;
        matches.push({ feedId, host, score: band, source: record.source });
        break;
      }
    }
  }
  return matches;
}
207
+
208
+ // ---- Parsers (lib owns the format; the app handles byte/line chunking) -----
209
+
210
/**
 * Extract a lowercase host from a URL or bare host line; null for comments/blanks.
 *
 * Accepted shapes:
 *  - a full URL (`scheme://...`): its hostname;
 *  - a bare host, optionally followed by a path or trailing text: the first
 *    whitespace-separated token is parsed as `http://<token>`, so
 *    `bad.example/redir?to=http://x/` still yields `bad.example`;
 *  - a hosts-file row (`0.0.0.0 bad.example` / `127.0.0.1 bad.example`): the
 *    blocked name rather than the sink address.
 * Lines starting with `#` or `!`, unparsable lines, and single-label hosts
 * (no dot) return null.
 */
export function hostFromLine(line: string): string | null {
  const trimmed = line.trim();
  if (!trimmed || trimmed.startsWith("#") || trimmed.startsWith("!")) return null;

  let candidate: string;
  // Only a leading scheme makes the line a URL; "://" inside a query string does not.
  if (/^[a-z][a-z0-9+.-]*:\/\//i.test(trimmed)) {
    candidate = trimmed;
  } else {
    const tokens = trimmed.split(/\s+/);
    const first = tokens[0] ?? "";
    const second = tokens[1];
    const isSinkAddress = first === "0.0.0.0" || first === "127.0.0.1";
    const token = isSinkAddress && second && !second.startsWith("#") ? second : first;
    candidate = `http://${token}`;
  }

  try {
    const host = new URL(candidate).hostname.toLowerCase();
    // Real domains/IPv4 always contain a dot; reject single-label junk lines.
    return host && host.includes(".") ? host : null;
  } catch {
    return null;
  }
}
223
+
224
/**
 * Parse an OpenPhish-style feed (one phishing URL per line) into entries.
 * Every host that can be extracted gets `score` (default FEED_SCORE_ACTIVE);
 * unusable lines are dropped and repeated hosts collapsed.
 */
export function parseOpenPhishFeed(text: string, score = FEED_SCORE_ACTIVE): FeedEntry[] {
  const entries: FeedEntry[] = [];
  for (const line of linesOf(text)) {
    const host = hostFromLine(line);
    if (isHost(host)) entries.push({ host, score });
  }
  return dedupeEntries(entries);
}
228
+
229
/**
 * Parse a plain domain/host blocklist (e.g. Phishing.Database lists), assigning
 * the caller-chosen `score` to every host. Unusable lines are dropped and
 * repeated hosts collapsed.
 */
export function parseHostList(text: string, score: number): FeedEntry[] {
  const entries: FeedEntry[] = [];
  for (const line of linesOf(text)) {
    const host = hostFromLine(line);
    if (isHost(host)) entries.push({ host, score });
  }
  return dedupeEntries(entries);
}
233
+
234
/**
 * Parse a URLhaus CSV dump (columns: id, dateadded, url, url_status, ...).
 *
 * Blank lines, lines starting with `#`, rows with fewer than four columns and
 * rows whose URL yields no host are skipped. A row scores as active when its
 * status column is exactly "online" and its date is within the recency window
 * (see scoreFor); otherwise it gets the aged score. Repeated hosts keep their
 * strongest score.
 */
export function parseUrlhausCsv(text: string, opts: { recencyDays?: number; now?: number } = {}): FeedEntry[] {
  const entries: FeedEntry[] = [];
  const dataLines = linesOf(text).filter((line) => line !== "" && !line.startsWith("#"));
  for (const line of dataLines) {
    const cols = parseCsvRow(line);
    if (cols.length >= 4) {
      const [, addedAt, url, status] = cols;
      const host = hostFromLine(url);
      if (host) {
        const score = scoreFor({ active: status === "online", addedAt, recencyDays: opts.recencyDays, now: opts.now });
        entries.push({ host, score });
      }
    }
  }
  return dedupeEntries(entries);
}
247
+
248
+ // ---- internals ------------------------------------------------------------
249
+
250
/** Storage key of the final shard file for one (feed, version, band, prefix). */
function shardKey(feedId: string, version: string, band: string, prefix: string): string {
  return [ROOT, feedId, version, band, `${prefix}.json`].join("/");
}
253
+
254
/**
 * Storage key of one chunk's staged partial for a shard. Band and prefix come
 * before the chunk id so all chunks of a shard share a narrow list prefix.
 */
function stagingKey(feedId: string, version: string, chunkId: string, band: string, prefix: string): string {
  return [ROOT, feedId, version, "staging", band, prefix, `${chunkId}.json`].join("/");
}
258
+
259
/**
 * Group entries as: score band (string) -> shard prefix -> set of hosts.
 * Hosts are lower-cased before hashing and storing; entries with an empty host
 * are ignored.
 */
function bucketEntries(entries: FeedEntry[]): Map<string, Map<string, Set<string>>> {
  const byBand = new Map<string, Map<string, Set<string>>>();
  for (const { host: rawHost, score } of entries) {
    const host = rawHost.toLowerCase();
    if (host === "") continue;

    const band = String(score);
    const byPrefix = byBand.get(band) ?? new Map<string, Set<string>>();
    if (!byBand.has(band)) byBand.set(band, byPrefix);

    const prefix = shardOf(host);
    const hosts = byPrefix.get(prefix) ?? new Set<string>();
    if (!byPrefix.has(prefix)) byPrefix.set(prefix, hosts);

    hosts.add(host);
  }
  return byBand;
}
275
+
276
/**
 * Collapse repeated hosts, keeping the strongest score for each.
 *
 * Hosts are compared and returned lower-cased, matching how bucketEntries and
 * the matcher key them — otherwise `A.com` and `a.com` would survive as two
 * entries and could land in two score bands. The first score seen for a host
 * is taken as-is (no implicit floor of 0), and a NaN score is replaced by any
 * later real number. Output order is first-seen order.
 */
function dedupeEntries(entries: FeedEntry[]): FeedEntry[] {
  const scoreByHost = new Map<string, number>();
  for (const { host, score } of entries) {
    const key = host.toLowerCase();
    const seen = scoreByHost.get(key);
    if (seen === undefined || Number.isNaN(seen) || score > seen) scoreByHost.set(key, score);
  }
  return Array.from(scoreByHost, ([host, score]) => ({ host, score }));
}
284
+
285
/** Type guard: true only for a non-empty string (filters out the nulls hostFromLine returns). */
function isHost(value: string | null): value is string {
  if (typeof value !== "string") return false;
  return value !== "";
}
288
+
289
/** Split text into lines on LF or CRLF. A lone trailing CR with no LF after it is kept. */
function linesOf(text: string): string[] {
  const segments = text.split("\n");
  const lastIndex = segments.length - 1;
  // Only a CR that directly precedes an LF is part of the line break.
  return segments.map((segment, index) => (index < lastIndex && segment.endsWith("\r") ? segment.slice(0, -1) : segment));
}
292
+
293
/**
 * Split one CSV line into fields.
 *
 * Commas inside double quotes do not split, a doubled quote inside a quoted
 * section is a literal quote, and the quotes themselves are dropped. Always
 * returns at least one field (an empty line yields [""]).
 */
function parseCsvRow(line: string): string[] {
  const fields: string[] = [];
  let field = "";
  let quoted = false;
  let pos = 0;
  while (pos < line.length) {
    const ch = line[pos];
    if (ch === '"') {
      if (quoted && line[pos + 1] === '"') {
        // Escaped quote: emit one and step over both characters.
        field += '"';
        pos += 2;
        continue;
      }
      quoted = !quoted;
    } else if (ch === "," && !quoted) {
      fields.push(field);
      field = "";
    } else {
      field += ch;
    }
    pos += 1;
  }
  fields.push(field);
  return fields;
}
311
+
312
/** Serialize `value` as JSON and store its encoded bytes at `key`. */
async function putJson(storage: IntelStorage, key: string, value: unknown): Promise<void> {
  const bytes = encoder.encode(JSON.stringify(value));
  await storage.put(key, bytes);
}
315
+
316
/**
 * Load and parse the JSON stored at `key`.
 * Returns null both when nothing is stored and when the bytes are not valid
 * JSON. The parsed value is cast to T without validation.
 */
async function readJson<T>(storage: IntelStorage, key: string): Promise<T | null> {
  const bytes = await storage.get(key);
  let parsed: T | null = null;
  if (bytes) {
    try {
      parsed = JSON.parse(decoder.decode(bytes)) as T;
    } catch {
      parsed = null; // corrupt content is treated the same as missing
    }
  }
  return parsed;
}
325
+
326
/** Read the JSON at `key`, returning it if it is an array and an empty array otherwise (missing, corrupt or another shape). */
async function readArray(storage: IntelStorage, key: string): Promise<string[]> {
  const parsed = await readJson<string[]>(storage, key);
  if (!Array.isArray(parsed)) return [];
  return parsed;
}
330
+
331
/** Delete every key listed under `prefix`, one at a time; a no-op when the storage has no `delete`. */
async function sweep(storage: IntelStorage, prefix: string): Promise<void> {
  if (storage.delete) {
    const doomed = await storage.list(prefix);
    for (const key of doomed) {
      await storage.delete(key);
    }
  }
}