@q32/signal-scanner 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +201 -0
- package/package.json +62 -0
- package/scripts/check-coverage.ts +33 -0
- package/scripts/eval.ts +311 -0
- package/scripts/render-isolate/entry.ts +2 -0
- package/scripts/render-isolate/polyfills.ts +33 -0
- package/scripts/render-isolate/run.ts +63 -0
- package/scripts/scan.ts +612 -0
- package/src/dynamic.ts +273 -0
- package/src/feeds.ts +334 -0
- package/src/index.ts +1366 -0
- package/src/intel.ts +561 -0
- package/src/node-tls.ts +55 -0
- package/src/render.ts +233 -0
- package/src/rules/packs/binary.ts +103 -0
- package/src/rules/packs/css.ts +44 -0
- package/src/rules/packs/decoders.ts +47 -0
- package/src/rules/packs/html.ts +255 -0
- package/src/rules/packs/index.ts +76 -0
- package/src/rules/packs/script-risk.ts +236 -0
- package/src/rules/packs/source-code.ts +180 -0
- package/src/rules/packs/urls.ts +138 -0
- package/src/rules/types.ts +56 -0
package/README.md
ADDED
|
@@ -0,0 +1,201 @@
|
|
|
1
|
+
# @q32/signal-scanner
|
|
2
|
+
|
|
3
|
+
Static web signal scanner with bounded streaming analyzers, URL extraction,
|
|
4
|
+
rule packs, scoring, and normalized JSON reports.
|
|
5
|
+
|
|
6
|
+
`@q32/signal-scanner` is a library first. It scans HTML, JavaScript, CSS, text,
|
|
7
|
+
SVG, JSON, archives, and selected binary content without requiring a browser or a
|
|
8
|
+
complete file in memory. Applications provide bytes and optional source metadata;
|
|
9
|
+
the scanner returns findings, extracted URLs, decoded artifacts, counters, a
|
|
10
|
+
score, and a disposition.
|
|
11
|
+
|
|
12
|
+
## Install
|
|
13
|
+
|
|
14
|
+
```bash
|
|
15
|
+
npm install @q32/signal-scanner
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
## Library Usage
|
|
19
|
+
|
|
20
|
+
```ts
|
|
21
|
+
import { createScanner } from "@q32/signal-scanner";
|
|
22
|
+
|
|
23
|
+
const scanner = createScanner({
|
|
24
|
+
source: {
|
|
25
|
+
url: "https://example.com/login",
|
|
26
|
+
contentType: "text/html"
|
|
27
|
+
}
|
|
28
|
+
});
|
|
29
|
+
|
|
30
|
+
scanner.feed(chunk);
|
|
31
|
+
const report = scanner.finish();
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
The scanner keeps bounded state: rolling text windows, line/column tracking,
|
|
35
|
+
tag/script context, URL/domain inventory, decoded artifact lineage, entropy
|
|
36
|
+
windows, and signal counters.
|
|
37
|
+
|
|
38
|
+
The default scanner does not submit forms, set cookies, open network
|
|
39
|
+
connections, or import Node-only modules.
|
|
40
|
+
|
|
41
|
+
## Report Shape
|
|
42
|
+
|
|
43
|
+
`scanner.finish()` returns:
|
|
44
|
+
|
|
45
|
+
```ts
|
|
46
|
+
interface ScannerReport {
|
|
47
|
+
contentKind: "html" | "javascript" | "css" | "json" | "svg" | "text" | "unknown" | "archive" | "executable";
|
|
48
|
+
findings: Finding[];
|
|
49
|
+
urls: ExtractedUrl[];
|
|
50
|
+
artifacts: ArtifactRecord[];
|
|
51
|
+
score: number;
|
|
52
|
+
disposition: "allow" | "warn" | "review" | "block";
|
|
53
|
+
counters: Record<string, number>;
|
|
54
|
+
}
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
Findings include rule id, severity, confidence, score, title, description,
|
|
58
|
+
location, and rule metadata.
|
|
59
|
+
|
|
60
|
+
## Runtime Integration
|
|
61
|
+
|
|
62
|
+
The core library is runtime-portable. It is suitable for Node, Workers, queues,
|
|
63
|
+
crawlers, upload pipelines, and other systems that can feed bytes into the
|
|
64
|
+
scanner.
|
|
65
|
+
|
|
66
|
+
Cloudflare is only relevant if you choose to embed the library in a Cloudflare
|
|
67
|
+
runtime. The package does not require Cloudflare, and the Node CLI does not use
|
|
68
|
+
Cloudflare.
|
|
69
|
+
|
|
70
|
+
### TLS Metadata
|
|
71
|
+
|
|
72
|
+
TLS analysis belongs to the scanner, but TLS collection depends on the host
|
|
73
|
+
runtime. Pass collected metadata through `source.tls` when creating a scanner:
|
|
74
|
+
|
|
75
|
+
```ts
|
|
76
|
+
const scanner = createScanner({
|
|
77
|
+
source: {
|
|
78
|
+
url: "https://example.com",
|
|
79
|
+
contentType: "text/html",
|
|
80
|
+
tls: {
|
|
81
|
+
authorized: true,
|
|
82
|
+
issuer: "O=Google Trust Services, CN=WE1",
|
|
83
|
+
subject: "CN=example.com",
|
|
84
|
+
validFrom: "Jan 1 00:00:00 2026 GMT",
|
|
85
|
+
validTo: "Mar 31 23:59:59 2026 GMT"
|
|
86
|
+
}
|
|
87
|
+
}
|
|
88
|
+
});
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
Node-compatible runtimes can use the optional helper:
|
|
92
|
+
|
|
93
|
+
```ts
|
|
94
|
+
import { collectTlsMetadata } from "@q32/signal-scanner/node-tls";
|
|
95
|
+
|
|
96
|
+
const tls = await collectTlsMetadata("https://example.com", { timeoutMs: 5000 });
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
The default scanner export does not import `node:tls`, so streaming analysis
|
|
100
|
+
stays portable to runtimes that only provide fetch/body streams.
|
|
101
|
+
|
|
102
|
+
## Dynamic Rendering
|
|
103
|
+
|
|
104
|
+
The library includes optional dynamic analysis helpers for rendering HTML with
|
|
105
|
+
`linkedom`, instrumenting browser-like APIs, recording behavior, and rescanning
|
|
106
|
+
the rendered DOM.
|
|
107
|
+
|
|
108
|
+
```ts
|
|
109
|
+
import { renderAndScan } from "@q32/signal-scanner/render";
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
`renderAndScan` runs in-process by default and is intended for trusted or
|
|
113
|
+
synthetic inputs unless you provide an isolate. The included Node CLI runs this
|
|
114
|
+
render step in `isolated-vm`, so untrusted page JavaScript cannot reach host
|
|
115
|
+
`fetch`, `process`, or `fs`.
|
|
116
|
+
|
|
117
|
+
Cloudflare users can provide their own isolate or Worker-based invocation when
|
|
118
|
+
embedding the library, but that is separate from the CLI.
|
|
119
|
+
|
|
120
|
+
## Node CLI
|
|
121
|
+
|
|
122
|
+
The package includes a Node CLI for local URL checks, bounded crawling, and
|
|
123
|
+
artifact/file scanning.
|
|
124
|
+
|
|
125
|
+
```bash
|
|
126
|
+
npm run scan -- crawl https://example.com
|
|
127
|
+
npm run scan -- crawl --no-robots --parallel 10 --max-urls 50 --max-depth 2 https://example.com
|
|
128
|
+
npm run scan -- files ./samples
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
The CLI uses Node APIs for fetching and file IO. For dynamic rendering, it uses
|
|
132
|
+
`isolated-vm`.
|
|
133
|
+
|
|
134
|
+
Crawler behavior:
|
|
135
|
+
|
|
136
|
+
- GET requests only.
|
|
137
|
+
- Bounded redirects through `fetch`.
|
|
138
|
+
- Bounded bytes per response and bounded total bytes.
|
|
139
|
+
- Global URL dedupe.
|
|
140
|
+
- Registrable-domain crawl bounds for hostnames.
|
|
141
|
+
- Exact-host crawl bounds for IP-literal targets.
|
|
142
|
+
- Robots.txt and sitemap support by default.
|
|
143
|
+
|
|
144
|
+
CLI options:
|
|
145
|
+
|
|
146
|
+
- `--no-robots` skips fetching and obeying `robots.txt`. Root sitemap probes still run.
|
|
147
|
+
- `--parallel, -n <count>` sets bounded concurrent fetches. Default: `10`.
|
|
148
|
+
- `--max-urls <count>` caps globally deduped crawl URLs. Default: `128`.
|
|
149
|
+
- `--max-depth <count>` caps link-follow depth. Default: `2`.
|
|
150
|
+
- `--max-bytes <bytes>` caps bytes per response. Default: `524288`.
|
|
151
|
+
- `--max-total-bytes <bytes>` caps aggregate crawl bytes. Default: `33554432`.
|
|
152
|
+
- `--max-sitemap-urls <count>` caps accepted sitemap URLs. Default: `512`.
|
|
153
|
+
- `--timeout-ms <ms>` caps each request. Default: `10000`.
|
|
154
|
+
- `--user-agent <value>` sets the crawler user agent.
|
|
155
|
+
- `--max-file-bytes <bytes>` caps bytes per file in `files` mode.
|
|
156
|
+
|
|
157
|
+
The CLI prints a normalized JSON summary to stdout.
|
|
158
|
+
|
|
159
|
+
## Rule Coverage
|
|
160
|
+
|
|
161
|
+
- HTML signals for forms, password/payment fields, scripts, links, iframes,
|
|
162
|
+
meta refresh redirects, hidden iframe patterns, login/payment language,
|
|
163
|
+
page-model screenshot/login cues, crypto/DeFi landing language,
|
|
164
|
+
trademark-stuffed SEO titles, and technology/dependency fingerprints.
|
|
165
|
+
- JavaScript text signals for dynamic execution, DOM injection sinks, dynamic
|
|
166
|
+
script creation, decoder APIs, request APIs, redirect APIs, storage/cookie/
|
|
167
|
+
clipboard access, wallet APIs, payment input hooks, and exfiltration
|
|
168
|
+
candidates.
|
|
169
|
+
- CSS signals for remote imports/URLs, hidden/offscreen content, opacity tricks,
|
|
170
|
+
invisible overlays, and unicode-bidi tricks.
|
|
171
|
+
- URL signals for punycode login URLs, URL shorteners, private/local targets,
|
|
172
|
+
shared-hosting subdomains, suspicious TLDs, brand impersonation, generated
|
|
173
|
+
landing URLs, and download-like targets.
|
|
174
|
+
- Decoder signals for bounded base64, JavaScript hex/unicode escapes, and
|
|
175
|
+
`String.fromCharCode` literal artifacts, with recursive rescanning.
|
|
176
|
+
- Binary static signals for executable magic, declared content-type mismatch,
|
|
177
|
+
executable-stack ELF headers, IoT botnet strings, router exploit strings,
|
|
178
|
+
dropper commands, and DHT/CNC protocol strings.
|
|
179
|
+
|
|
180
|
+
## Scoring
|
|
181
|
+
|
|
182
|
+
Every rule has an explicit score model:
|
|
183
|
+
|
|
184
|
+
```ts
|
|
185
|
+
score: {
|
|
186
|
+
base: 34,
|
|
187
|
+
tags: ["credential", "phishing"],
|
|
188
|
+
repeatMultiplier: 0.25,
|
|
189
|
+
maxRepeats: 3
|
|
190
|
+
}
|
|
191
|
+
```
|
|
192
|
+
|
|
193
|
+
`severity` and `confidence` are report/display metadata. They do not drive
|
|
194
|
+
scoring. The scorer sums each rule's explicit `base`, applies each rule's repeat
|
|
195
|
+
policy, and then applies explicit tag-based context multipliers such as
|
|
196
|
+
credential plus suspicious hosting, wallet/payment plus exfiltration/redirect,
|
|
197
|
+
decoded artifact plus script behavior, or binary plus URL evidence.
|
|
198
|
+
|
|
199
|
+
## License
|
|
200
|
+
|
|
201
|
+
MIT
|
package/package.json
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@q32/signal-scanner",
|
|
3
|
+
"version": "0.1.0",
|
|
4
|
+
"type": "module",
|
|
5
|
+
"description": "Static web signal scanner with bounded streaming analyzers, URL extraction, rule packs, scoring, and normalized reports.",
|
|
6
|
+
"license": "MIT",
|
|
7
|
+
"repository": {
|
|
8
|
+
"type": "git",
|
|
9
|
+
"url": "git+https://github.com/q32llc/signal-scanner.git"
|
|
10
|
+
},
|
|
11
|
+
"files": [
|
|
12
|
+
"README.md",
|
|
13
|
+
"src/",
|
|
14
|
+
"scripts/"
|
|
15
|
+
],
|
|
16
|
+
"scripts": {
|
|
17
|
+
"test": "bun test",
|
|
18
|
+
"coverage": "bun test --coverage --coverage-reporter=lcov --coverage-dir=coverage && bun scripts/check-coverage.ts coverage/lcov.info 80",
|
|
19
|
+
"coverage:report": "bun test --coverage",
|
|
20
|
+
"scan": "tsx scripts/scan.ts",
|
|
21
|
+
"eval": "NODE_USE_ENV_PROXY=1 HTTP_PROXY=\"${EVAL_PROXY_URL-}\" HTTPS_PROXY=\"${EVAL_PROXY_URL-}\" tsx --env-file-if-exists=.env scripts/eval.ts"
|
|
22
|
+
},
|
|
23
|
+
"exports": {
|
|
24
|
+
".": {
|
|
25
|
+
"types": "./src/index.ts",
|
|
26
|
+
"default": "./src/index.ts"
|
|
27
|
+
},
|
|
28
|
+
"./node-tls": {
|
|
29
|
+
"types": "./src/node-tls.ts",
|
|
30
|
+
"default": "./src/node-tls.ts"
|
|
31
|
+
},
|
|
32
|
+
"./intel": {
|
|
33
|
+
"types": "./src/intel.ts",
|
|
34
|
+
"default": "./src/intel.ts"
|
|
35
|
+
},
|
|
36
|
+
"./feeds": {
|
|
37
|
+
"types": "./src/feeds.ts",
|
|
38
|
+
"default": "./src/feeds.ts"
|
|
39
|
+
},
|
|
40
|
+
"./dynamic": {
|
|
41
|
+
"types": "./src/dynamic.ts",
|
|
42
|
+
"default": "./src/dynamic.ts"
|
|
43
|
+
},
|
|
44
|
+
"./render": {
|
|
45
|
+
"types": "./src/render.ts",
|
|
46
|
+
"default": "./src/render.ts"
|
|
47
|
+
}
|
|
48
|
+
},
|
|
49
|
+
"sideEffects": false,
|
|
50
|
+
"dependencies": {
|
|
51
|
+
"htmlparser2": "^10.1.0",
|
|
52
|
+
"linkedom": "^0.18.12"
|
|
53
|
+
},
|
|
54
|
+
"devDependencies": {
|
|
55
|
+
"base-64": "^1.0.0",
|
|
56
|
+
"buffer": "^6.0.3",
|
|
57
|
+
"esbuild": "^0.28.0",
|
|
58
|
+
"fast-text-encoding": "^1.0.6",
|
|
59
|
+
"isolated-vm": "^6.1.2",
|
|
60
|
+
"whatwg-url-without-unicode": "^8.0.0-3"
|
|
61
|
+
}
|
|
62
|
+
}
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
import { readFileSync } from "node:fs";
|
|
2
|
+
|
|
3
|
+
// Coverage gate: sums the LF (lines found) and LH (lines hit) records of an
// lcov report and exits non-zero when the aggregate line coverage is below the
// requested percentage.
//
// Exit codes: 0 = threshold met, 1 = below threshold, 2 = usage error or the
// report contained no line-coverage records.
const lcovPath = process.argv[2];
const thresholdArg = process.argv[3];
const threshold = Number(thresholdArg ?? 80);

if (!lcovPath || !Number.isFinite(threshold)) {
  console.error("Usage: bun scripts/check-coverage.ts <lcov.info> <line-threshold-percent>");
  process.exit(2);
}

const lcov = readFileSync(lcovPath, "utf8");
let found = 0;
let hit = 0;

// Each source file contributes one "LF:<n>" and one "LH:<n>" record.
for (const record of lcov.split(/\r?\n/)) {
  const tag = record.slice(0, 3);
  if (tag === "LF:") {
    found += Number(record.slice(3));
  } else if (tag === "LH:") {
    hit += Number(record.slice(3));
  }
}

if (!found) {
  console.error(`No line coverage data found in ${lcovPath}`);
  process.exit(2);
}

const pct = (hit / found) * 100;
const display = pct.toFixed(2);
const belowThreshold = pct < threshold;

if (belowThreshold) {
  console.error(`Line coverage ${display}% is below required ${threshold}%`);
  process.exit(1);
}

console.log(`Line coverage ${display}% meets required ${threshold}%`);
|
package/scripts/eval.ts
ADDED
|
@@ -0,0 +1,311 @@
|
|
|
1
|
+
// Eval harness: run the homegrown scanner over a labeled corpus of known-good
|
|
2
|
+
// and known-bad sites and measure how well it separates them.
|
|
3
|
+
//
|
|
4
|
+
// npm run eval # reuse cached bad list if fresh (<6h)
|
|
5
|
+
// npm run eval -- --refresh # re-pull a fresh live bad list
|
|
6
|
+
//
|
|
7
|
+
// Known-good is the curated corpus/good.txt. Known-bad is pulled live from
|
|
8
|
+
// OpenPhish + URLhaus (they go offline fast), probed for reachability, and
|
|
9
|
+
// cached to corpus/.bad-cache.txt. The scan path is CLI heuristics only
|
|
10
|
+
// (structural + content + dynamic JS) — NO threat-intel feeds — so this measures
|
|
11
|
+
// the homegrown detector's own discriminative power, not feed lookups.
|
|
12
|
+
|
|
13
|
+
import { readFile, writeFile } from "node:fs/promises";
|
|
14
|
+
import { resolve } from "node:path";
|
|
15
|
+
import { crawlTargets, DEFAULT_CRAWL_OPTIONS, type CrawlOptions } from "./scan";
|
|
16
|
+
import { dispositionForScore } from "../src/index";
|
|
17
|
+
|
|
18
|
+
/** A site whose score is at or above this value counts as flagged (suspicious/malicious). */
const FLAG_THRESHOLD = 50;
/** Number of reachable known-bad URLs to collect when building the live corpus. */
const TARGET_BAD = 80;
/** How many sites are scanned concurrently. */
const SITE_CONCURRENCY = 6;
/** Cache file for the mixed known-bad list. */
const CACHE_PATH = resolve("corpus/.bad-cache.txt");
/** Cache file for the phishing-only known-bad list. */
const PHISHING_CACHE_PATH = resolve("corpus/.bad-phishing-cache.txt");
/** Cached bad lists older than this (6 hours, in milliseconds) are re-pulled. */
const CACHE_TTL_MS = 6 * 60 * 60 * 1000;
/** Gate: at most 5% of good sites may be flagged. */
const MAX_FP_RATE = 0.05;

// Bounded per-site crawl: the landing page plus one shallow hop is enough to
// judge a site, and keeps a 160-site sweep tractable.
const CRAWL: CrawlOptions = {
  ...DEFAULT_CRAWL_OPTIONS,
  maxUrls: 10,
  maxDepth: 1,
  parallel: 4,
  robots: false,
  timeoutMs: 8000
};

/** User agent sent by the reachability probe; same as the crawler default. */
const BROWSER_UA = DEFAULT_CRAWL_OPTIONS.userAgent;
|
|
38
|
+
|
|
39
|
+
/** Outcome of scanning one labeled site of the evaluation corpus. */
interface SiteResult {
  /** Seed URL that was crawled. */
  url: string;
  /** Ground-truth label of the corpus entry. */
  label: "good" | "bad";
  /** Score of the highest-scoring scanned page (0 when nothing could be scanned). */
  score: number;
  /** Disposition derived from `score`. */
  disposition: string;
  /** Number of pages that produced a usable report. */
  pagesScanned: number;
  /** Up to three highest-scoring findings of the worst page. */
  topFindings: { ruleId: string; score: number }[];
  /** True when no page could be scanned; such sites are excluded from the stats. */
  unreachable: boolean;
}
|
|
48
|
+
|
|
49
|
+
// The dynamic-analysis sandbox executes untrusted page JavaScript. A stray
// rejection or throw caused by a single site must never abort the whole sweep,
// and per-site scanning is best-effort anyway, so both process-level events
// are deliberately swallowed: rejections silently, exceptions with a log line.
const ignoreRejection = (): void => {};
process.on("unhandledRejection", ignoreRejection);
process.on("uncaughtException", (error) => {
  const detail = error instanceof Error ? error.message : error;
  console.error(" (ignored uncaught error from sandbox):", detail);
});
|
|
56
|
+
|
|
57
|
+
/**
 * Runs the evaluation: loads the good and bad corpora, scans every site with
 * bounded concurrency, and prints the report.
 *
 * Command-line flags:
 * - `--refresh`  re-pull the live bad list instead of using a fresh cache.
 * - `--phishing` use a phishing-only bad corpus (OpenPhish + Phishing.Database
 *   active links, no URLhaus malware binaries) to measure catch rate on
 *   malicious pages, where the web heuristics (credential forms, brand
 *   impersonation, cloaking) are expected to be strongest.
 * - `--live`     use the curated, hand-verified corpus/phishing-live.txt
 *   instead of a noisy feed.
 */
async function main(): Promise<void> {
  const hasFlag = (flag: string): boolean => process.argv.includes(flag);
  const refresh = hasFlag("--refresh");
  const phishingOnly = hasFlag("--phishing");
  const live = hasFlag("--live");

  // Egress: set EVAL_PROXY_URL (e.g. an unfiltered residential proxy) so the
  // crawl and the reachability probe leave via that proxy rather than the
  // local network. This matters when an ISP filter intercepts known-malicious
  // URLs and serves a block page, which would make every bad site look benign.
  // The npm script maps it onto HTTP(S)_PROXY with NODE_USE_ENV_PROXY=1.
  const proxy = process.env.EVAL_PROXY_URL || process.env.HTTPS_PROXY || "";
  const egress = proxy ? "proxy " + redactProxy(proxy) : "direct (local network)";
  console.error(`egress: ${egress}`);

  const good = await loadList("corpus/good.txt");
  let bad: string[];
  if (live) {
    bad = await loadList("corpus/phishing-live.txt");
  } else {
    bad = await loadBad(refresh, phishingOnly);
  }

  let corpusKind = "mixed feed";
  if (live) corpusKind = "curated live";
  else if (phishingOnly) corpusKind = "phishing feed";
  console.error(`corpus: ${good.length} good, ${bad.length} bad (${corpusKind})`);

  // Good sites first, then bad sites, each tagged with its ground-truth label.
  const labeled: Array<{ url: string; label: "good" | "bad" }> = [];
  for (const url of good) labeled.push({ url, label: "good" });
  for (const url of bad) labeled.push({ url, label: "bad" });

  const results: SiteResult[] = [];
  let done = 0;
  await pool(labeled, SITE_CONCURRENCY, async (site) => {
    results.push(await scanSite(site.url, site.label));
    done += 1;
    if (done % 10 === 0) console.error(` scanned ${done}/${labeled.length}`);
  });

  report(results);
}
|
|
96
|
+
|
|
97
|
+
/**
 * Crawls one site with the bounded CRAWL options and condenses the outcome
 * into a SiteResult based on the highest-scoring page.
 *
 * @param url   Seed URL to crawl.
 * @param label Ground-truth label carried through to the result.
 * @returns The worst-page score, disposition and top three findings, or an
 *          "unreachable" result when no page produced a report or the crawl
 *          threw. Never rejects: scanning is best-effort per site.
 */
async function scanSite(url: string, label: "good" | "bad"): Promise<SiteResult> {
  const unreachable: SiteResult = {
    url,
    label,
    score: 0,
    disposition: "allow",
    pagesScanned: 0,
    topFindings: [],
    unreachable: true
  };
  try {
    const reports = await crawlTargets([url], CRAWL);
    const scored = reports.filter((r) => !r.error && r.report);
    if (!scored.length) return unreachable;

    // A site is judged by its worst (highest-scoring) page.
    const worst = scored.reduce((a, b) => (b.report.score > a.report.score ? b : a));
    const score = worst.report.score;
    const topFindings = [...worst.report.findings]
      .sort((a, b) => (b.score ?? 0) - (a.score ?? 0))
      .slice(0, 3)
      .map((f) => ({ ruleId: f.ruleId, score: f.score ?? 0 }));
    return {
      url,
      label,
      score,
      disposition: dispositionForScore(score),
      pagesScanned: scored.length,
      topFindings,
      unreachable: false
    };
  } catch (error) {
    // Still best-effort, but say why: otherwise a scanner crash is
    // indistinguishable from a dead site and silently drops out of the stats.
    console.error(` scan failed for ${url}:`, error instanceof Error ? error.message : error);
    return unreachable;
  }
}
|
|
115
|
+
|
|
116
|
+
/**
 * Prints the evaluation summary to stdout and sets `process.exitCode = 1`
 * when the false-positive gate fails.
 *
 * Unreachable sites are excluded from every statistic. A site counts as
 * flagged when its score is at or above FLAG_THRESHOLD.
 *
 * @param results One entry per scanned corpus site.
 */
function report(results: SiteResult[]): void {
  const reachable = results.filter((r) => !r.unreachable);
  const good = reachable.filter((r) => r.label === "good");
  const bad = reachable.filter((r) => r.label === "bad");
  const flagged = (r: SiteResult): boolean => r.score >= FLAG_THRESHOLD;

  const fp = good.filter(flagged); // good, flagged => false positive
  const tn = good.filter((r) => !flagged(r));
  const tp = bad.filter(flagged); // bad, flagged => caught
  const fn = bad.filter((r) => !flagged(r)); // bad, missed

  const pct = (n: number, d: number): string => (d ? `${((100 * n) / d).toFixed(1)}%` : "n/a");

  const proxy = process.env.EVAL_PROXY_URL || process.env.HTTPS_PROXY || "";
  console.log("\n================ SCANNER EVAL ================");
  console.log(`egress: ${proxy ? "proxy " + redactProxy(proxy) : "direct (local network)"}`);
  console.log(`unreachable (excluded): ${results.filter((r) => r.unreachable).length} / ${results.length}`);
  console.log(`\nGood sites: ${good.length} reachable`);
  console.log(` flagged (FALSE POSITIVE): ${fp.length} [${pct(fp.length, good.length)}]`);
  console.log(` clean (true negative): ${tn.length}`);
  console.log(`\nBad sites: ${bad.length} reachable`);
  console.log(` flagged (caught): ${tp.length} [recall ${pct(tp.length, bad.length)}]`);
  console.log(` missed (false negative): ${fn.length}`);

  console.log("\nScore distribution (count by band):");
  console.log(` band good bad`);
  const bands: Array<[number, number]> = [[0, 9], [10, 24], [25, 49], [50, 74], [75, 100]];
  bands.forEach(([lo, hi], idx) => {
    // Half-open bands [lo, nextLo): a fractional score such as 9.5, or one
    // above 100, still lands in exactly one band instead of falling between
    // the integer labels. `hi` is only used for the printed label.
    const upper = idx + 1 < bands.length ? bands[idx + 1][0] : Number.POSITIVE_INFINITY;
    const inBand = (r: SiteResult): boolean => r.score >= lo && r.score < upper;
    const g = good.filter(inBand).length;
    const b = bad.filter(inBand).length;
    const mark = lo >= FLAG_THRESHOLD ? " <-flag" : "";
    console.log(` ${String(lo).padStart(3)}-${String(hi).padEnd(3)} ${String(g).padStart(5)} ${String(b).padStart(5)}${mark}`);
  });
  console.log(` good: median ${median(good.map((r) => r.score))}, p90 ${percentile(good.map((r) => r.score), 90)}`);
  console.log(` bad: median ${median(bad.map((r) => r.score))}, p90 ${percentile(bad.map((r) => r.score), 90)}`);

  if (fp.length) {
    console.log("\nFALSE POSITIVES (good sites flagged) — fix these:");
    for (const r of [...fp].sort((a, b) => b.score - a.score)) {
      console.log(` [${r.score}] ${r.url} ${r.topFindings.map((f) => `${f.ruleId}(${f.score})`).join(", ")}`);
    }
  }
  if (fn.length) {
    console.log("\nMISSED bad sites (score < flag threshold):");
    // Highest-scoring misses first (closest to the threshold); cap the list at 25.
    for (const r of [...fn].sort((a, b) => b.score - a.score).slice(0, 25)) {
      console.log(` [${r.score}] ${r.url} ${r.topFindings.map((f) => `${f.ruleId}(${f.score})`).join(", ") || "(no signal)"}`);
    }
    if (fn.length > 25) console.log(` ... and ${fn.length - 25} more`);
  }

  // Gate on the false-positive rate only; recall is reported but not enforced.
  const fpRate = good.length ? fp.length / good.length : 0;
  const pass = fpRate <= MAX_FP_RATE;
  console.log(`\nGATE: false-positive rate ${pct(fp.length, good.length)} (max ${MAX_FP_RATE * 100}%) => ${pass ? "PASS" : "FAIL"}`);
  console.log("=============================================\n");
  if (!pass) process.exitCode = 1;
}
|
|
171
|
+
|
|
172
|
+
// ---- known-bad corpus (live) --------------------------------------------
|
|
173
|
+
|
|
174
|
+
/**
 * Returns the known-bad URL list, from the on-disk cache when it is still
 * fresh, otherwise by pulling live feeds, probing reachability and rewriting
 * the cache.
 *
 * @param refresh      When true, skip the cache and always pull live.
 * @param phishingOnly Selects the phishing-only feed set and its own cache file.
 */
async function loadBad(refresh: boolean, phishingOnly: boolean): Promise<string[]> {
  const cachePath = phishingOnly ? PHISHING_CACHE_PATH : CACHE_PATH;

  const cached = refresh ? null : await readCacheIfFresh(cachePath);
  if (cached) {
    console.error(`using cached bad list (${cached.length} urls)`);
    return cached;
  }

  const mode = phishingOnly ? "phishing-only" : "mixed";
  console.error(`pulling live bad URLs (${mode}) ...`);
  const fetched = await fetchBadCandidates(phishingOnly);
  const candidates = shuffle(dedupe(fetched));
  console.error(` ${candidates.length} candidates; probing reachability ...`);
  const live = await probeReachable(candidates, TARGET_BAD);

  // The "# pulled <ISO timestamp>" header is what readCacheIfFresh checks.
  const header = `# pulled ${new Date().toISOString()}`;
  await writeFile(cachePath, `${header}\n${live.join("\n")}\n`, "utf8");
  return live;
}
|
|
190
|
+
|
|
191
|
+
/**
 * Reads a cached bad-URL list if it exists and is younger than CACHE_TTL_MS.
 *
 * The cache file carries a "# pulled <timestamp>" header line. The cache is
 * rejected when the file cannot be read, the header is missing, the timestamp
 * cannot be parsed, the timestamp is older than the TTL, or the list is empty.
 *
 * @param cachePath Path of the cache file.
 * @returns The cached URLs, or null when the cache must not be used.
 */
async function readCacheIfFresh(cachePath: string): Promise<string[] | null> {
  try {
    const text = await readFile(cachePath, "utf8");
    const stamp = text.match(/# pulled (.+)/)?.[1]?.trim();
    if (!stamp) return null;

    // Date.parse yields NaN for garbage, and `now - NaN > ttl` is false, so an
    // unparseable stamp must be rejected explicitly or it would count as fresh.
    const pulledAt = Date.parse(stamp);
    if (!Number.isFinite(pulledAt)) return null;
    if (Date.now() - pulledAt > CACHE_TTL_MS) return null;

    const urls = parseList(text);
    return urls.length ? urls : null;
  } catch {
    // Missing or unreadable cache file: fall back to a live pull.
    return null;
  }
}
|
|
202
|
+
|
|
203
|
+
/**
 * Downloads one feed as text. Failures are logged to stderr and reported as
 * null so that one broken feed never aborts the corpus pull.
 *
 * @param name      Short feed name used in log messages.
 * @param url       Feed URL.
 * @param timeoutMs Abort the request after this many milliseconds.
 * @param headers   Optional request headers.
 * @returns The response body, or null on a non-2xx status, timeout or network error.
 */
async function fetchFeedText(
  name: string,
  url: string,
  timeoutMs: number,
  headers: Record<string, string> = {}
): Promise<string | null> {
  try {
    const res = await fetch(url, { headers, signal: AbortSignal.timeout(timeoutMs) });
    if (!res.ok) {
      // Report HTTP failures (e.g. 401/403/429) instead of silently returning nothing.
      console.error(` ${name} fetch failed: HTTP ${res.status}`);
      await res.body?.cancel();
      return null;
    }
    return await res.text();
  } catch (error) {
    console.error(` ${name} fetch failed:`, error instanceof Error ? error.message : error);
    return null;
  }
}

/**
 * Collects candidate known-bad URLs from public feeds.
 *
 * Always pulls the OpenPhish community feed. With `phishingOnly` it adds up to
 * 4000 http(s) entries from Phishing.Database's active-links list; otherwise it
 * adds URLhaus entries whose status column is "online". Feeds that fail are
 * skipped, so the result may be empty.
 *
 * @param phishingOnly Restrict the corpus to phishing feeds (no URLhaus).
 * @returns Candidate URLs, not yet deduplicated or probed.
 */
async function fetchBadCandidates(phishingOnly: boolean): Promise<string[]> {
  const urls: string[] = [];

  // OpenPhish community feed (public, ~hundreds of fresh phishing URLs).
  const openphish = await fetchFeedText("openphish", "https://openphish.com/feed.txt", 15000);
  if (openphish !== null) {
    for (const url of parseList(openphish)) urls.push(url);
  }

  if (phishingOnly) {
    // Phishing.Database active links (public, large list of currently-active
    // phishing URLs) — sampled, no auth.
    const active = await fetchFeedText(
      "phishing.database",
      "https://raw.githubusercontent.com/mitchellkrogza/Phishing.Database/master/phishing-links-ACTIVE.txt",
      30000
    );
    if (active !== null) {
      const sampled = parseList(active).filter((u) => u.startsWith("http")).slice(0, 4000);
      for (const url of sampled) urls.push(url);
    }
    return urls;
  }

  // URLhaus online URLs (malware distribution). Auth-Key used if present.
  const headers: Record<string, string> = {};
  if (process.env.ABUSE_CH_AUTH_KEY) headers["Auth-Key"] = process.env.ABUSE_CH_AUTH_KEY;
  const csv = await fetchFeedText("urlhaus", "https://urlhaus.abuse.ch/downloads/csv_online/", 20000, headers);
  if (csv !== null) {
    for (const line of csv.split(/\r?\n/)) {
      if (line.startsWith("#") || !line.trim()) continue;
      // Rows are fully quoted CSV; column 2 is read as the URL and column 3 as its status.
      const fields = line.split('","').map((f) => f.replace(/^"|"$/g, ""));
      if (fields[3] === "online" && fields[2]?.startsWith("http")) urls.push(fields[2]);
    }
  }
  return urls;
}
|
|
240
|
+
|
|
241
|
+
/**
 * Probes candidate URLs (12 at a time) and keeps those that answer with a
 * plain HTTP 200 at the URL itself, stopping once `target` URLs are collected.
 *
 * @param candidates URLs to probe, in the order they should be tried.
 * @param target     Maximum number of live URLs to return.
 * @returns Up to `target` URLs that responded with status 200.
 */
async function probeReachable(candidates: string[], target: number): Promise<string[]> {
  const live: string[] = [];
  await pool(candidates, 12, async (url) => {
    // Enough collected: let the remaining queue drain without touching the network.
    if (live.length >= target) return;
    try {
      // A live phishing kit serves real content (200) at the URL itself. A
      // taken-down one 404s or 301/302s to a park/block page — exclude those
      // (status !== 200) so a dead corpus doesn't dilute recall. No body-size
      // gate: a single <script> tag can be a complete phishing page.
      const res = await fetch(url, {
        headers: { "user-agent": BROWSER_UA },
        redirect: "manual",
        signal: AbortSignal.timeout(8000)
      });
      if (res.status === 200 && live.length < target) live.push(url);
      // Only the status matters; cancel the unread body so the connection is
      // released right away instead of lingering until garbage collection.
      await res.body?.cancel();
    } catch {
      // dead/unreachable — skip
    }
  });
  return live.slice(0, target);
}
|
|
260
|
+
|
|
261
|
+
// ---- helpers -------------------------------------------------------------
|
|
262
|
+
|
|
263
|
+
/**
 * Reads a URL list file (resolved against the current working directory) and
 * returns its entries with blank lines and `#` comment lines removed.
 */
async function loadList(path: string): Promise<string[]> {
  const absolute = resolve(path);
  const contents = await readFile(absolute, "utf8");
  return parseList(contents);
}
|
|
266
|
+
/**
 * Splits text into trimmed lines, dropping empty lines and lines that start
 * with `#` after trimming.
 */
function parseList(text: string): string[] {
  const entries: string[] = [];
  for (const raw of text.split("\n")) {
    const line = raw.trim();
    if (line === "" || line.startsWith("#")) continue;
    entries.push(line);
  }
  return entries;
}
|
|
269
|
+
/** Returns the distinct values in first-occurrence order; the input is not modified. */
function dedupe(values: string[]): string[] {
  const seen = new Set<string>();
  const unique: string[] = [];
  for (const value of values) {
    if (seen.has(value)) continue;
    seen.add(value);
    unique.push(value);
  }
  return unique;
}
|
|
272
|
+
/**
 * Reduces a proxy URL to something safe to log: host and port only, with any
 * credentials, path and query dropped.
 *
 * @param url Proxy URL, possibly containing `user:password@`.
 * @returns `host:port`, or just `host` when the URL carries no explicit
 *          (non-default) port; the literal "set" when the value is not a
 *          parseable URL.
 */
function redactProxy(url: string): string {
  try {
    // `host` already omits the port (and its colon) when none is present,
    // which avoids printing a dangling "hostname:".
    return new URL(url).host;
  } catch {
    return "set";
  }
}
|
|
280
|
+
/**
 * Returns a deterministically re-ordered copy of `values`.
 *
 * Each element gets the sort key `(index * 2654435761) % length` and the copy
 * is sorted by that key. This is a repeatable mix of the input order, not a
 * random shuffle; the input array is left untouched.
 */
function shuffle<T>(values: T[]): T[] {
  const size = values.length;
  const keyed = values.map((value, position) => ({
    value,
    key: (position * 2654435761) % size
  }));
  keyed.sort((left, right) => left.key - right.key);
  return keyed.map((entry) => entry.value);
}
|
|
287
|
+
/**
 * Median of a list of numbers.
 *
 * @param values Numbers in any order; the array is not modified.
 * @returns The middle value for an odd count, the mean of the two middle
 *          values for an even count, and 0 for an empty list.
 */
function median(values: number[]): number {
  if (!values.length) return 0;
  const sorted = [...values].sort((a, b) => a - b);
  const mid = Math.floor(sorted.length / 2);
  if (sorted.length % 2 === 1) return sorted[mid];
  // Even count: there is no single middle element, so average the two around the centre.
  return (sorted[mid - 1] + sorted[mid]) / 2;
}
|
|
292
|
+
/**
 * Returns the element at rank `floor(p/100 * count)` of the ascending-sorted
 * values, clamped to the last element; 0 for an empty list. The input array
 * is not modified.
 */
function percentile(values: number[], p: number): number {
  const count = values.length;
  if (count === 0) return 0;
  const ascending = values.slice().sort((x, y) => x - y);
  const rank = Math.floor((p / 100) * count);
  const lastIndex = count - 1;
  return ascending[Math.min(lastIndex, rank)];
}
|
|
297
|
+
/**
 * Runs `worker` over every item with at most `concurrency` calls in flight.
 *
 * A shared cursor hands out items in order to a fixed set of runner loops.
 * Resolves once every runner has drained the queue and rejects as soon as a
 * worker call rejects.
 */
async function pool<T>(items: T[], concurrency: number, worker: (item: T) => Promise<void>): Promise<void> {
  let cursor = 0;

  const drain = async (): Promise<void> => {
    while (cursor < items.length) {
      const current = items[cursor];
      cursor += 1;
      await worker(current);
    }
  };

  const laneCount = Math.min(concurrency, items.length);
  const lanes = Array.from({ length: laneCount }, () => drain());
  await Promise.all(lanes);
}
|
|
307
|
+
|
|
308
|
+
// Script entry point: anything that escapes main() is fatal — print it and
// exit with a failing status.
main().catch((failure) => {
  console.error(failure);
  process.exit(1);
});
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
// Web globals a bare V8 isolate lacks, installed BEFORE linkedom/render load.
|
|
2
|
+
// (self/window are set by the host via context.eval before this bundle runs.)
|
|
3
|
+
import "fast-text-encoding";
|
|
4
|
+
import base64 from "base-64";
|
|
5
|
+
import { URL, URLSearchParams } from "whatwg-url-without-unicode";
|
|
6
|
+
// Install each global only when the runtime does not already provide it.
// eslint-disable-next-line @typescript-eslint/no-explicit-any -- globals are patched dynamically
const g = globalThis as any;
if (!g.atob) g.atob = (s: string) => base64.decode(String(s));
if (!g.btoa) g.btoa = (s: string) => base64.encode(String(s));
if (!g.URL) g.URL = URL;
if (!g.URLSearchParams) g.URLSearchParams = URLSearchParams;

// Minimal Buffer shim (linkedom's entity decoder + a few runtime paths use it).
// Built on the globals above; covers from(str|base64|bytes) + toString(enc).
if (!g.Buffer) {
  // Converts bytes to a "binary" string (one char per byte). Works in 8192-byte
  // chunks so String.fromCharCode.apply never exceeds the argument limit.
  const toBinary = (bytes: Uint8Array): string => {
    let out = "";
    for (let i = 0; i < bytes.length; i += 8192) {
      out += String.fromCharCode.apply(null, bytes.subarray(i, i + 8192) as unknown as number[]);
    }
    return out;
  };
  g.Buffer = {
    /**
     * Builds a byte view from a Uint8Array, a base64 string (`enc === "base64"`),
     * or any other value, which is stringified and UTF-8 encoded.
     * The returned array's `toString(enc)` supports "binary"/"latin1" and
     * "base64"; every other encoding decodes as UTF-8.
     */
    from(input: unknown, enc?: string): Uint8Array & { toString: (e?: string) => string } {
      let bytes: Uint8Array;
      // Copy byte input: the result gets its own toString below, and patching
      // the caller's array in place would change how the caller's value prints.
      if (input instanceof Uint8Array) bytes = new Uint8Array(input);
      else if (enc === "base64") bytes = Uint8Array.from(g.atob(String(input)), (c: string) => c.charCodeAt(0));
      else bytes = new TextEncoder().encode(String(input));
      const view = bytes as Uint8Array & { toString: (e?: string) => string };
      view.toString = (e?: string) => {
        if (e === "binary" || e === "latin1") return toBinary(bytes);
        if (e === "base64") return g.btoa(toBinary(bytes));
        return new TextDecoder().decode(bytes);
      };
      return view;
    },
    alloc: (n: number) => new Uint8Array(n),
    isBuffer: (x: unknown) => x instanceof Uint8Array
  };
}
|