@lateos/npm-scan 0.16.4 → 0.16.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96)
  1. package/.dockerignore +20 -20
  2. package/.husky/pre-commit +1 -1
  3. package/CHANGELOG.md +199 -199
  4. package/LICENSING.md +19 -19
  5. package/README.de.md +708 -708
  6. package/README.fr.md +707 -707
  7. package/README.ja.md +704 -704
  8. package/README.md +826 -826
  9. package/README.zh.md +708 -708
  10. package/SECURITY.md +72 -72
  11. package/backend/cra.js +68 -68
  12. package/backend/db/schema.sql +32 -32
  13. package/backend/db.js +88 -88
  14. package/backend/detectors/atk-001-lifecycle.js +17 -17
  15. package/backend/detectors/atk-002-obfusc.js +261 -261
  16. package/backend/detectors/atk-003-creds.js +13 -13
  17. package/backend/detectors/atk-004-persist.js +13 -13
  18. package/backend/detectors/atk-005-exfil.js +13 -13
  19. package/backend/detectors/atk-006-depconf.js +14 -14
  20. package/backend/detectors/atk-007-typosquat.js +34 -34
  21. package/backend/detectors/atk-008-tarball-tamper.js +91 -91
  22. package/backend/detectors/atk-009-dormant-trigger.js +62 -62
  23. package/backend/detectors/atk-010-sandbox-evasion.js +50 -50
  24. package/backend/detectors/atk-011-transitive-prop.js +76 -76
  25. package/backend/detectors/cve-2026-48710-badhost/codePattern.js +99 -99
  26. package/backend/detectors/cve-2026-48710-badhost/findings.js +105 -105
  27. package/backend/detectors/cve-2026-48710-badhost/index.js +15 -15
  28. package/backend/detectors/cve-2026-48710-badhost/manifest.js +305 -305
  29. package/backend/detectors/cve-2026-48710-badhost/transitive.js +189 -189
  30. package/backend/detectors/hf-impersonation/index.js +396 -396
  31. package/backend/detectors/hf-impersonation/jaro-winkler.js +44 -44
  32. package/backend/detectors/hf-impersonation/known-orgs.js +5 -5
  33. package/backend/detectors/hf-impersonation/simhash.js +46 -46
  34. package/backend/detectors/index.js +75 -44
  35. package/backend/detectors/megalodon/d1-workflow-scan.js +147 -147
  36. package/backend/detectors/megalodon/d2-credential-harvest.js +61 -61
  37. package/backend/detectors/megalodon/d3-publish-velocity.js +67 -67
  38. package/backend/detectors/megalodon/d4-publisher-drift.js +124 -124
  39. package/backend/detectors/megalodon/d5-bot-commit-identity.js +3 -3
  40. package/backend/detectors/megalodon/d6-date-anachronism.js +3 -3
  41. package/backend/detectors/megalodon/index.js +80 -80
  42. package/backend/detectors/megalodon/types.js +9 -9
  43. package/backend/detectors/mini-shai-hulud/d1-burst-publish.js +42 -42
  44. package/backend/detectors/mini-shai-hulud/d2-sibling-compromise.js +116 -116
  45. package/backend/detectors/mini-shai-hulud/d3-slsa-mismatch.js +72 -72
  46. package/backend/detectors/mini-shai-hulud/d4-maintainer-anomaly.js +45 -45
  47. package/backend/detectors/mini-shai-hulud/d5-ioc-check.js +95 -95
  48. package/backend/detectors/mini-shai-hulud/d6-token-exfil.js +38 -38
  49. package/backend/detectors/mini-shai-hulud/index.js +118 -118
  50. package/backend/detectors/mini-shai-hulud/iocs.json +79 -79
  51. package/backend/detectors/tier1-binary-embed.js +219 -0
  52. package/backend/detectors/tier1-infostealer.js +280 -0
  53. package/backend/detectors/tier1-lifecycle-hook.js +176 -0
  54. package/backend/detectors/tier1-metadata-spoof.js +180 -0
  55. package/backend/detectors/tier1-typosquat.js +219 -0
  56. package/backend/fetch.js +175 -175
  57. package/backend/index.js +4 -4
  58. package/backend/license.js +89 -89
  59. package/backend/lockfile.js +379 -379
  60. package/backend/pdf.js +245 -245
  61. package/backend/policy.js +193 -176
  62. package/backend/report.js +254 -254
  63. package/backend/sbom.js +66 -66
  64. package/backend/siem/cef.js +32 -32
  65. package/backend/siem/ecs.js +40 -40
  66. package/backend/siem/index.js +18 -18
  67. package/backend/siem/qradar.js +56 -56
  68. package/backend/siem/sentinel.js +27 -27
  69. package/backend/vsix-scan/detectors/activation-event-risk.js +116 -116
  70. package/backend/vsix-scan/detectors/burst-publish.js +52 -52
  71. package/backend/vsix-scan/detectors/exfil-pattern.js +88 -88
  72. package/backend/vsix-scan/detectors/known-ioc.js +105 -105
  73. package/backend/vsix-scan/detectors/orphan-commit-fetch.js +69 -69
  74. package/backend/vsix-scan/detectors/publisher-anomaly.js +70 -70
  75. package/backend/vsix-scan/index.js +183 -183
  76. package/backend/vsix-scan/marketplace-client.js +145 -145
  77. package/backend/vsix-scan/vsix-iocs.json +31 -31
  78. package/cli/cli.js +458 -458
  79. package/deploy/helm/npm-scan/Chart.yaml +21 -21
  80. package/deploy/helm/npm-scan/templates/_helpers.tpl +8 -8
  81. package/deploy/helm/npm-scan/templates/api.yaml +93 -93
  82. package/deploy/helm/npm-scan/templates/ingress.yaml +27 -27
  83. package/deploy/helm/npm-scan/templates/postgresql.yaml +66 -66
  84. package/deploy/helm/npm-scan/templates/secrets.yaml +18 -18
  85. package/deploy/helm/npm-scan/templates/worker.yaml +31 -31
  86. package/deploy/helm/npm-scan/values.byoc.yaml +74 -74
  87. package/deploy/helm/npm-scan/values.yaml +102 -102
  88. package/package.json +57 -57
  89. package/scripts/download-corpus.js +30 -30
  90. package/scripts/gen-mal-corpus.js +34 -34
  91. package/scripts/generate-campaign-fixtures.js +170 -0
  92. package/src/config/top-5000.json +87 -0
  93. package/test/fixtures/lockfiles/npm-lock.json +68 -68
  94. package/test/fixtures/lockfiles/pnpm-lock.yaml +117 -117
  95. package/test/fixtures/lockfiles/yarn.lock +103 -103
  96. package/test/fixtures/mock-data.js +69 -69
package/backend/detectors/hf-impersonation/index.js
@@ -1,396 +1,396 @@
1
- import { KNOWN_HF_ORGS } from './known-orgs.js';
2
- import { jaroWinkler } from './jaro-winkler.js';
3
- import { simhash, similarity as simhashSimilarity } from './simhash.js';
4
-
5
- const HF_URL_PATTERN = /(?:huggingface\.co|hf\.co)\/([^\/\s"'>]+)\/([^\/\s"'>]+)/g;
6
- const FROM_PRETRAINED_PATTERN = /from_pretrained\(\s*["']([^"']+\/[^"']+)["']/g;
7
- const HUB_DOWNLOAD_SINGLE = /hub\.download\(\s*["']([^"']+\/[^"']+)["']/g;
8
- const HUB_DOWNLOAD_DOUBLE = /hub\.download\(\s*["']([^"']+)["']\s*,\s*["']([^"']+)["']/g;
9
-
10
- const LIFECYCLE_SCRIPTS = new Set(['postinstall', 'prepare', 'install']);
11
- const API_BASE = 'https://huggingface.co';
12
-
13
- const SEVERITY_SCORE = { none: 0, low: 1, medium: 2, high: 3, critical: 4 };
14
- const SEVERITY_LABELS = ['none', 'low', 'medium', 'high', 'critical'];
15
-
16
- const HF_ARTIFACT_LIBS = new Set(['transformers', 'diffusers', 'sentence-transformers', 'gguf', 'safetensors']);
17
- const SUSPICIOUS_EXTENSIONS = /\.(exe|msi|bat|ps1|dll)$/i;
18
-
19
- const _cache = new Map();
20
- const CACHE_TTL = 3600 * 1000;
21
- let _lastFetchTime = 0;
22
-
23
- function severityIndex(sev) {
24
- return SEVERITY_SCORE[sev] || 0;
25
- }
26
-
27
- function maxSeverity(a, b) {
28
- return severityIndex(a) >= severityIndex(b) ? a : b;
29
- }
30
-
31
- function sleep(ms) {
32
- return new Promise(r => setTimeout(r, ms));
33
- }
34
-
35
- async function fetchWithCache(url) {
36
- const cached = _cache.get(url);
37
- if (cached && Date.now() - cached.fetchedAt < CACHE_TTL) {
38
- return cached.data;
39
- }
40
- const now = Date.now();
41
- const elapsed = now - _lastFetchTime;
42
- if (elapsed < 100) {
43
- await sleep(100 - elapsed);
44
- }
45
- _lastFetchTime = Date.now();
46
- let res;
47
- try {
48
- res = await fetch(url);
49
- if (res.status === 429) {
50
- const retryAfter = parseInt(res.headers.get('Retry-After') || '5', 10);
51
- await sleep(retryAfter * 1000);
52
- res = await fetch(url);
53
- }
54
- if (!res.ok) {
55
- console.debug(`HF API warning: ${url} returned ${res.status}`);
56
- return null;
57
- }
58
- const data = await res.json();
59
- _cache.set(url, { data, fetchedAt: Date.now() });
60
- return data;
61
- } catch (err) {
62
- console.debug(`HF API warning: ${err.message}`);
63
- return null;
64
- }
65
- }
66
-
67
- async function fetchReadme(url) {
68
- const cached = _cache.get(url);
69
- if (cached && Date.now() - cached.fetchedAt < CACHE_TTL) {
70
- return cached.data;
71
- }
72
- const now = Date.now();
73
- const elapsed = now - _lastFetchTime;
74
- if (elapsed < 100) {
75
- await sleep(100 - elapsed);
76
- }
77
- _lastFetchTime = Date.now();
78
- try {
79
- const res = await fetch(url);
80
- if (res.status === 429) {
81
- const retryAfter = parseInt(res.headers.get('Retry-After') || '5', 10);
82
- await sleep(retryAfter * 1000);
83
- const retryRes = await fetch(url);
84
- if (!retryRes.ok) return null;
85
- const text = await retryRes.text();
86
- _cache.set(url, { data: text, fetchedAt: Date.now() });
87
- return text;
88
- }
89
- if (!res.ok) return null;
90
- const text = await res.text();
91
- _cache.set(url, { data: text, fetchedAt: Date.now() });
92
- return text;
93
- } catch (err) {
94
- console.debug(`HF README warning: ${err.message}`);
95
- return null;
96
- }
97
- }
98
-
99
- function findClosestOrg(spoofedOrg) {
100
- const lowerOrg = String(spoofedOrg).toLowerCase();
101
- let best = { org: null, score: 0 };
102
- for (const known of KNOWN_HF_ORGS) {
103
- const score = jaroWinkler(lowerOrg, known.toLowerCase());
104
- if (score >= 0.82 && score > best.score) {
105
- best = { org: known, score };
106
- }
107
- }
108
- return best;
109
- }
110
-
111
- function extractHFTuples(pkgJson, allFiles) {
112
- const tuples = new Set();
113
- let postinstallFetchFlag = false;
114
-
115
- const scripts = pkgJson?.scripts || {};
116
- let m;
117
- for (const [hook, script] of Object.entries(scripts)) {
118
- if (typeof script !== 'string') continue;
119
-
120
- HF_URL_PATTERN.lastIndex = 0;
121
- while ((m = HF_URL_PATTERN.exec(script)) !== null) {
122
- tuples.add(`${m[1]}/${m[2]}`);
123
- if (LIFECYCLE_SCRIPTS.has(hook)) {
124
- postinstallFetchFlag = true;
125
- }
126
- }
127
-
128
- FROM_PRETRAINED_PATTERN.lastIndex = 0;
129
- while ((m = FROM_PRETRAINED_PATTERN.exec(script)) !== null) {
130
- tuples.add(m[1]);
131
- if (LIFECYCLE_SCRIPTS.has(hook)) {
132
- postinstallFetchFlag = true;
133
- }
134
- }
135
-
136
- HUB_DOWNLOAD_SINGLE.lastIndex = 0;
137
- while ((m = HUB_DOWNLOAD_SINGLE.exec(script)) !== null) {
138
- tuples.add(m[1]);
139
- if (LIFECYCLE_SCRIPTS.has(hook)) {
140
- postinstallFetchFlag = true;
141
- }
142
- }
143
-
144
- HUB_DOWNLOAD_DOUBLE.lastIndex = 0;
145
- while ((m = HUB_DOWNLOAD_DOUBLE.exec(script)) !== null) {
146
- tuples.add(`${m[1]}/${m[2]}`);
147
- if (LIFECYCLE_SCRIPTS.has(hook)) {
148
- postinstallFetchFlag = true;
149
- }
150
- }
151
- }
152
-
153
- if (allFiles) {
154
- for (const file of allFiles) {
155
- if (!file.path?.match(/\.(js|ts|jsx|tsx|mjs|cjs)$/i)) continue;
156
- const content = typeof file.content === 'string' ? file.content : '';
157
-
158
- HF_URL_PATTERN.lastIndex = 0;
159
- while ((m = HF_URL_PATTERN.exec(content)) !== null) {
160
- tuples.add(`${m[1]}/${m[2]}`);
161
- }
162
-
163
- FROM_PRETRAINED_PATTERN.lastIndex = 0;
164
- while ((m = FROM_PRETRAINED_PATTERN.exec(content)) !== null) {
165
- tuples.add(m[1]);
166
- }
167
-
168
- HUB_DOWNLOAD_SINGLE.lastIndex = 0;
169
- while ((m = HUB_DOWNLOAD_SINGLE.exec(content)) !== null) {
170
- tuples.add(m[1]);
171
- }
172
-
173
- HUB_DOWNLOAD_DOUBLE.lastIndex = 0;
174
- while ((m = HUB_DOWNLOAD_DOUBLE.exec(content)) !== null) {
175
- tuples.add(`${m[1]}/${m[2]}`);
176
- }
177
- }
178
- }
179
-
180
- return { tuples, postinstallFetchFlag };
181
- }
182
-
183
- function buildHFOrgSpoofFinding(referencedRepo, org, canonicalOrg, similarityScore, postinstallFetchFlag, tags, hfMeta) {
184
- const finding = {
185
- id: 'HF_ORG_SPOOF',
186
- severity: 'high',
187
- title: 'HuggingFace org impersonation',
188
- description: `Repository "${referencedRepo}" references org "${org}" which is similar to known HF org "${canonicalOrg.org}" (similarity: ${similarityScore.toFixed(3)})`,
189
- evidence: JSON.stringify({
190
- referencedRepo,
191
- canonicalOrg: canonicalOrg.org,
192
- similarityScore,
193
- tags: tags || [],
194
- }),
195
- referencedRepo,
196
- canonicalOrg: canonicalOrg.org,
197
- similarityScore,
198
- tags: tags || [],
199
- ipiClass: 'SUPPLY_CHAIN',
200
- };
201
- if (hfMeta) {
202
- finding.hfMeta = hfMeta;
203
- }
204
- return finding;
205
- }
206
-
207
- async function runStage2(spoofFindings, orgsToCheck, postinstallFetchFlag) {
208
- const newFindings = [];
209
-
210
- for (const [referencedRepo, { org, canonicalOrg, similarityScore, finding }] of orgsToCheck) {
211
- const tags = [];
212
- let hfMeta = null;
213
-
214
- const modelUrl = `${API_BASE}/api/models/${referencedRepo}`;
215
- const canonicalUrl = canonicalOrg.org !== org ? `${API_BASE}/api/models/${canonicalOrg.org}/${referencedRepo.split('/')[1]}` : null;
216
- const userUrl = `${API_BASE}/api/users/${org}`;
217
-
218
- const spoofedModel = await fetchWithCache(modelUrl);
219
- const canonicalModel = canonicalUrl ? await fetchWithCache(canonicalUrl) : null;
220
- const userData = await fetchWithCache(userUrl);
221
-
222
- // Org age check for NEW_ORG tag
223
- if (userData?.dateCreated) {
224
- const created = new Date(userData.dateCreated);
225
- const ageDays = (Date.now() - created.getTime()) / (1000 * 60 * 60 * 24);
226
- hfMeta = {
227
- orgAgeDays: Math.round(ageDays),
228
- repoDownloads: spoofedModel?.downloads ?? 0,
229
- };
230
- if (ageDays < 30) {
231
- tags.push('NEW_ORG');
232
- }
233
- }
234
-
235
- // README clone check
236
- if (canonicalOrg.org !== org) {
237
- const readmeSpoof = await fetchReadme(`${API_BASE}/${referencedRepo}/resolve/main/README.md`);
238
- const readmeCanonical = await fetchReadme(`${API_BASE}/${canonicalOrg.org}/${referencedRepo.split('/')[1]}/resolve/main/README.md`);
239
-
240
- if (readmeSpoof && readmeCanonical) {
241
- const fp1 = simhash(readmeSpoof);
242
- const fp2 = simhash(readmeCanonical);
243
- const simScore = simhashSimilarity(fp1, fp2);
244
-
245
- if (simScore >= 0.9) {
246
- const readmeFinding = {
247
- id: 'HF_README_CLONE',
248
- severity: 'high',
249
- title: 'HuggingFace README clone',
250
- description: `README of "${referencedRepo}" is highly similar (${(simScore * 100).toFixed(1)}%) to canonical org "${canonicalOrg.org}/${referencedRepo.split('/')[1]}"`,
251
- evidence: JSON.stringify({
252
- referencedRepo,
253
- canonicalOrg: canonicalOrg.org,
254
- similarityScore: simScore,
255
- tags: [],
256
- }),
257
- referencedRepo,
258
- canonicalOrg: canonicalOrg.org,
259
- similarityScore: simScore,
260
- tags: [],
261
- ipiClass: 'SUPPLY_CHAIN',
262
- };
263
- if (hfMeta) readmeFinding.hfMeta = hfMeta;
264
- newFindings.push(readmeFinding);
265
- }
266
- }
267
- }
268
-
269
- // Artifact mismatch check
270
- if (spoofedModel?.cardData?.library_name && spoofedModel?.siblings) {
271
- const libName = spoofedModel.cardData.library_name;
272
- if (HF_ARTIFACT_LIBS.has(libName)) {
273
- for (const sibling of spoofedModel.siblings) {
274
- const fn = sibling.rfilename || '';
275
- if (SUSPICIOUS_EXTENSIONS.test(fn)) {
276
- const artifactFinding = {
277
- id: 'HF_ARTIFACT_MISMATCH',
278
- severity: 'critical',
279
- title: 'HF artifact mismatch — suspicious binary in model repo',
280
- description: `Model "${referencedRepo}" declares library "${libName}" but contains suspicious file "${fn}"`,
281
- evidence: JSON.stringify({
282
- referencedRepo,
283
- artifactConflict: { declaredType: libName, suspiciousFilename: fn },
284
- tags: [],
285
- }),
286
- referencedRepo,
287
- artifactConflict: { declaredType: libName, suspiciousFilename: fn },
288
- tags: [],
289
- ipiClass: 'SUPPLY_CHAIN',
290
- };
291
- if (hfMeta) artifactFinding.hfMeta = hfMeta;
292
- newFindings.push(artifactFinding);
293
- break;
294
- }
295
- }
296
- }
297
- }
298
-
299
- // Apply NEW_ORG and POSTINSTALL_FETCH tags to all findings for this repo
300
- const repoSpoofFindings = spoofFindings.filter(f => f.referencedRepo === referencedRepo);
301
- for (const sf of repoSpoofFindings) {
302
- if (tags.length > 0) {
303
- if (!sf.tags) sf.tags = [];
304
- for (const t of tags) {
305
- if (!sf.tags.includes(t)) sf.tags.push(t);
306
- }
307
- }
308
- if (hfMeta) {
309
- sf.hfMeta = hfMeta;
310
- }
311
- }
312
- for (const nf of newFindings) {
313
- if (nf.referencedRepo === referencedRepo) {
314
- if (tags.length > 0) {
315
- if (!nf.tags) nf.tags = [];
316
- for (const t of tags) {
317
- if (!nf.tags.includes(t)) nf.tags.push(t);
318
- }
319
- }
320
- }
321
- }
322
- }
323
-
324
- // POSTINSTALL_FETCH escalation
325
- if (postinstallFetchFlag) {
326
- const allStage2Findings = [...spoofFindings, ...newFindings];
327
- const escalatedRepos = new Set();
328
- for (const f of allStage2Findings) {
329
- if (f.referencedRepo) escalatedRepos.add(f.referencedRepo);
330
- }
331
- for (const f of allStage2Findings) {
332
- if (escalatedRepos.has(f.referencedRepo)) {
333
- if (severityIndex(f.severity) < severityIndex('critical')) {
334
- f.severity = 'critical';
335
- }
336
- if (!f.tags) f.tags = [];
337
- if (!f.tags.includes('POSTINSTALL_ESCALATED')) {
338
- f.tags.push('POSTINSTALL_ESCALATED');
339
- }
340
- }
341
- }
342
- }
343
-
344
- return newFindings;
345
- }
346
-
347
- export async function scan(pkgJson, files = [], registryMeta = null, allFiles = null) {
348
- const { tuples, postinstallFetchFlag } = extractHFTuples(pkgJson, allFiles || files);
349
-
350
- if (tuples.size === 0) return [];
351
-
352
- // Stage 1: org spoof detection (local only)
353
- const spoofFindings = [];
354
- const orgsToCheck = []; // [referencedRepo, { org, canonicalOrg, similarityScore, finding }]
355
-
356
- for (const tuple of tuples) {
357
- const parts = tuple.split('/');
358
- if (parts.length < 2) continue;
359
- const org = parts[0];
360
-
361
- const canonicalOrg = findClosestOrg(org);
362
- if (!canonicalOrg.org) continue;
363
- if (org.toLowerCase() === canonicalOrg.org.toLowerCase()) continue;
364
-
365
- const finding = buildHFOrgSpoofFinding(tuple, org, canonicalOrg, canonicalOrg.score, postinstallFetchFlag, []);
366
- spoofFindings.push(finding);
367
- orgsToCheck.push([tuple, { org, canonicalOrg, similarityScore: canonicalOrg.score, finding }]);
368
- }
369
-
370
- if (spoofFindings.length === 0) return [];
371
-
372
- // Stage 2: network checks
373
- const stage2Findings = await runStage2(spoofFindings, orgsToCheck, postinstallFetchFlag);
374
-
375
- // Deduplicate POSTINSTALL_ESCALATED tag in evidence
376
- for (const f of [...spoofFindings, ...stage2Findings]) {
377
- if (f.tags && f.tags.length > 0) {
378
- try {
379
- const ev = JSON.parse(f.evidence);
380
- ev.tags = [...f.tags];
381
- f.evidence = JSON.stringify(ev);
382
- } catch {
383
- // evidence wasn't JSON, leave as-is
384
- }
385
- }
386
- }
387
-
388
- return [...spoofFindings, ...stage2Findings];
389
- }
390
-
391
- export function clearCache() {
392
- _cache.clear();
393
- _lastFetchTime = 0;
394
- }
395
-
396
- export { KNOWN_HF_ORGS, jaroWinkler, simhash, simhashSimilarity };
1
+ import { KNOWN_HF_ORGS } from './known-orgs.js';
2
+ import { jaroWinkler } from './jaro-winkler.js';
3
+ import { simhash, similarity as simhashSimilarity } from './simhash.js';
4
+
5
+ const HF_URL_PATTERN = /(?:huggingface\.co|hf\.co)\/([^\/\s"'>]+)\/([^\/\s"'>]+)/g;
6
+ const FROM_PRETRAINED_PATTERN = /from_pretrained\(\s*["']([^"']+\/[^"']+)["']/g;
7
+ const HUB_DOWNLOAD_SINGLE = /hub\.download\(\s*["']([^"']+\/[^"']+)["']/g;
8
+ const HUB_DOWNLOAD_DOUBLE = /hub\.download\(\s*["']([^"']+)["']\s*,\s*["']([^"']+)["']/g;
9
+
10
+ const LIFECYCLE_SCRIPTS = new Set(['postinstall', 'prepare', 'install']);
11
+ const API_BASE = 'https://huggingface.co';
12
+
13
+ const SEVERITY_SCORE = { none: 0, low: 1, medium: 2, high: 3, critical: 4 };
14
+ const SEVERITY_LABELS = ['none', 'low', 'medium', 'high', 'critical'];
15
+
16
+ const HF_ARTIFACT_LIBS = new Set(['transformers', 'diffusers', 'sentence-transformers', 'gguf', 'safetensors']);
17
+ const SUSPICIOUS_EXTENSIONS = /\.(exe|msi|bat|ps1|dll)$/i;
18
+
19
+ const _cache = new Map();
20
+ const CACHE_TTL = 3600 * 1000;
21
+ let _lastFetchTime = 0;
22
+
23
+ function severityIndex(sev) {
24
+ return SEVERITY_SCORE[sev] || 0;
25
+ }
26
+
27
+ function maxSeverity(a, b) {
28
+ return severityIndex(a) >= severityIndex(b) ? a : b;
29
+ }
30
+
31
+ function sleep(ms) {
32
+ return new Promise(r => setTimeout(r, ms));
33
+ }
34
+
35
+ async function fetchWithCache(url) {
36
+ const cached = _cache.get(url);
37
+ if (cached && Date.now() - cached.fetchedAt < CACHE_TTL) {
38
+ return cached.data;
39
+ }
40
+ const now = Date.now();
41
+ const elapsed = now - _lastFetchTime;
42
+ if (elapsed < 100) {
43
+ await sleep(100 - elapsed);
44
+ }
45
+ _lastFetchTime = Date.now();
46
+ let res;
47
+ try {
48
+ res = await fetch(url);
49
+ if (res.status === 429) {
50
+ const retryAfter = parseInt(res.headers.get('Retry-After') || '5', 10);
51
+ await sleep(retryAfter * 1000);
52
+ res = await fetch(url);
53
+ }
54
+ if (!res.ok) {
55
+ console.debug(`HF API warning: ${url} returned ${res.status}`);
56
+ return null;
57
+ }
58
+ const data = await res.json();
59
+ _cache.set(url, { data, fetchedAt: Date.now() });
60
+ return data;
61
+ } catch (err) {
62
+ console.debug(`HF API warning: ${err.message}`);
63
+ return null;
64
+ }
65
+ }
66
+
67
+ async function fetchReadme(url) {
68
+ const cached = _cache.get(url);
69
+ if (cached && Date.now() - cached.fetchedAt < CACHE_TTL) {
70
+ return cached.data;
71
+ }
72
+ const now = Date.now();
73
+ const elapsed = now - _lastFetchTime;
74
+ if (elapsed < 100) {
75
+ await sleep(100 - elapsed);
76
+ }
77
+ _lastFetchTime = Date.now();
78
+ try {
79
+ const res = await fetch(url);
80
+ if (res.status === 429) {
81
+ const retryAfter = parseInt(res.headers.get('Retry-After') || '5', 10);
82
+ await sleep(retryAfter * 1000);
83
+ const retryRes = await fetch(url);
84
+ if (!retryRes.ok) return null;
85
+ const text = await retryRes.text();
86
+ _cache.set(url, { data: text, fetchedAt: Date.now() });
87
+ return text;
88
+ }
89
+ if (!res.ok) return null;
90
+ const text = await res.text();
91
+ _cache.set(url, { data: text, fetchedAt: Date.now() });
92
+ return text;
93
+ } catch (err) {
94
+ console.debug(`HF README warning: ${err.message}`);
95
+ return null;
96
+ }
97
+ }
98
+
99
+ function findClosestOrg(spoofedOrg) {
100
+ const lowerOrg = String(spoofedOrg).toLowerCase();
101
+ let best = { org: null, score: 0 };
102
+ for (const known of KNOWN_HF_ORGS) {
103
+ const score = jaroWinkler(lowerOrg, known.toLowerCase());
104
+ if (score >= 0.82 && score > best.score) {
105
+ best = { org: known, score };
106
+ }
107
+ }
108
+ return best;
109
+ }
110
+
111
+ function extractHFTuples(pkgJson, allFiles) {
112
+ const tuples = new Set();
113
+ let postinstallFetchFlag = false;
114
+
115
+ const scripts = pkgJson?.scripts || {};
116
+ let m;
117
+ for (const [hook, script] of Object.entries(scripts)) {
118
+ if (typeof script !== 'string') continue;
119
+
120
+ HF_URL_PATTERN.lastIndex = 0;
121
+ while ((m = HF_URL_PATTERN.exec(script)) !== null) {
122
+ tuples.add(`${m[1]}/${m[2]}`);
123
+ if (LIFECYCLE_SCRIPTS.has(hook)) {
124
+ postinstallFetchFlag = true;
125
+ }
126
+ }
127
+
128
+ FROM_PRETRAINED_PATTERN.lastIndex = 0;
129
+ while ((m = FROM_PRETRAINED_PATTERN.exec(script)) !== null) {
130
+ tuples.add(m[1]);
131
+ if (LIFECYCLE_SCRIPTS.has(hook)) {
132
+ postinstallFetchFlag = true;
133
+ }
134
+ }
135
+
136
+ HUB_DOWNLOAD_SINGLE.lastIndex = 0;
137
+ while ((m = HUB_DOWNLOAD_SINGLE.exec(script)) !== null) {
138
+ tuples.add(m[1]);
139
+ if (LIFECYCLE_SCRIPTS.has(hook)) {
140
+ postinstallFetchFlag = true;
141
+ }
142
+ }
143
+
144
+ HUB_DOWNLOAD_DOUBLE.lastIndex = 0;
145
+ while ((m = HUB_DOWNLOAD_DOUBLE.exec(script)) !== null) {
146
+ tuples.add(`${m[1]}/${m[2]}`);
147
+ if (LIFECYCLE_SCRIPTS.has(hook)) {
148
+ postinstallFetchFlag = true;
149
+ }
150
+ }
151
+ }
152
+
153
+ if (allFiles) {
154
+ for (const file of allFiles) {
155
+ if (!file.path?.match(/\.(js|ts|jsx|tsx|mjs|cjs)$/i)) continue;
156
+ const content = typeof file.content === 'string' ? file.content : '';
157
+
158
+ HF_URL_PATTERN.lastIndex = 0;
159
+ while ((m = HF_URL_PATTERN.exec(content)) !== null) {
160
+ tuples.add(`${m[1]}/${m[2]}`);
161
+ }
162
+
163
+ FROM_PRETRAINED_PATTERN.lastIndex = 0;
164
+ while ((m = FROM_PRETRAINED_PATTERN.exec(content)) !== null) {
165
+ tuples.add(m[1]);
166
+ }
167
+
168
+ HUB_DOWNLOAD_SINGLE.lastIndex = 0;
169
+ while ((m = HUB_DOWNLOAD_SINGLE.exec(content)) !== null) {
170
+ tuples.add(m[1]);
171
+ }
172
+
173
+ HUB_DOWNLOAD_DOUBLE.lastIndex = 0;
174
+ while ((m = HUB_DOWNLOAD_DOUBLE.exec(content)) !== null) {
175
+ tuples.add(`${m[1]}/${m[2]}`);
176
+ }
177
+ }
178
+ }
179
+
180
+ return { tuples, postinstallFetchFlag };
181
+ }
182
+
183
+ function buildHFOrgSpoofFinding(referencedRepo, org, canonicalOrg, similarityScore, postinstallFetchFlag, tags, hfMeta) {
184
+ const finding = {
185
+ id: 'HF_ORG_SPOOF',
186
+ severity: 'high',
187
+ title: 'HuggingFace org impersonation',
188
+ description: `Repository "${referencedRepo}" references org "${org}" which is similar to known HF org "${canonicalOrg.org}" (similarity: ${similarityScore.toFixed(3)})`,
189
+ evidence: JSON.stringify({
190
+ referencedRepo,
191
+ canonicalOrg: canonicalOrg.org,
192
+ similarityScore,
193
+ tags: tags || [],
194
+ }),
195
+ referencedRepo,
196
+ canonicalOrg: canonicalOrg.org,
197
+ similarityScore,
198
+ tags: tags || [],
199
+ ipiClass: 'SUPPLY_CHAIN',
200
+ };
201
+ if (hfMeta) {
202
+ finding.hfMeta = hfMeta;
203
+ }
204
+ return finding;
205
+ }
206
+
207
+ async function runStage2(spoofFindings, orgsToCheck, postinstallFetchFlag) {
208
+ const newFindings = [];
209
+
210
+ for (const [referencedRepo, { org, canonicalOrg, similarityScore, finding }] of orgsToCheck) {
211
+ const tags = [];
212
+ let hfMeta = null;
213
+
214
+ const modelUrl = `${API_BASE}/api/models/${referencedRepo}`;
215
+ const canonicalUrl = canonicalOrg.org !== org ? `${API_BASE}/api/models/${canonicalOrg.org}/${referencedRepo.split('/')[1]}` : null;
216
+ const userUrl = `${API_BASE}/api/users/${org}`;
217
+
218
+ const spoofedModel = await fetchWithCache(modelUrl);
219
+ const canonicalModel = canonicalUrl ? await fetchWithCache(canonicalUrl) : null;
220
+ const userData = await fetchWithCache(userUrl);
221
+
222
+ // Org age check for NEW_ORG tag
223
+ if (userData?.dateCreated) {
224
+ const created = new Date(userData.dateCreated);
225
+ const ageDays = (Date.now() - created.getTime()) / (1000 * 60 * 60 * 24);
226
+ hfMeta = {
227
+ orgAgeDays: Math.round(ageDays),
228
+ repoDownloads: spoofedModel?.downloads ?? 0,
229
+ };
230
+ if (ageDays < 30) {
231
+ tags.push('NEW_ORG');
232
+ }
233
+ }
234
+
235
+ // README clone check
236
+ if (canonicalOrg.org !== org) {
237
+ const readmeSpoof = await fetchReadme(`${API_BASE}/${referencedRepo}/resolve/main/README.md`);
238
+ const readmeCanonical = await fetchReadme(`${API_BASE}/${canonicalOrg.org}/${referencedRepo.split('/')[1]}/resolve/main/README.md`);
239
+
240
+ if (readmeSpoof && readmeCanonical) {
241
+ const fp1 = simhash(readmeSpoof);
242
+ const fp2 = simhash(readmeCanonical);
243
+ const simScore = simhashSimilarity(fp1, fp2);
244
+
245
+ if (simScore >= 0.9) {
246
+ const readmeFinding = {
247
+ id: 'HF_README_CLONE',
248
+ severity: 'high',
249
+ title: 'HuggingFace README clone',
250
+ description: `README of "${referencedRepo}" is highly similar (${(simScore * 100).toFixed(1)}%) to canonical org "${canonicalOrg.org}/${referencedRepo.split('/')[1]}"`,
251
+ evidence: JSON.stringify({
252
+ referencedRepo,
253
+ canonicalOrg: canonicalOrg.org,
254
+ similarityScore: simScore,
255
+ tags: [],
256
+ }),
257
+ referencedRepo,
258
+ canonicalOrg: canonicalOrg.org,
259
+ similarityScore: simScore,
260
+ tags: [],
261
+ ipiClass: 'SUPPLY_CHAIN',
262
+ };
263
+ if (hfMeta) readmeFinding.hfMeta = hfMeta;
264
+ newFindings.push(readmeFinding);
265
+ }
266
+ }
267
+ }
268
+
269
+ // Artifact mismatch check
270
+ if (spoofedModel?.cardData?.library_name && spoofedModel?.siblings) {
271
+ const libName = spoofedModel.cardData.library_name;
272
+ if (HF_ARTIFACT_LIBS.has(libName)) {
273
+ for (const sibling of spoofedModel.siblings) {
274
+ const fn = sibling.rfilename || '';
275
+ if (SUSPICIOUS_EXTENSIONS.test(fn)) {
276
+ const artifactFinding = {
277
+ id: 'HF_ARTIFACT_MISMATCH',
278
+ severity: 'critical',
279
+ title: 'HF artifact mismatch — suspicious binary in model repo',
280
+ description: `Model "${referencedRepo}" declares library "${libName}" but contains suspicious file "${fn}"`,
281
+ evidence: JSON.stringify({
282
+ referencedRepo,
283
+ artifactConflict: { declaredType: libName, suspiciousFilename: fn },
284
+ tags: [],
285
+ }),
286
+ referencedRepo,
287
+ artifactConflict: { declaredType: libName, suspiciousFilename: fn },
288
+ tags: [],
289
+ ipiClass: 'SUPPLY_CHAIN',
290
+ };
291
+ if (hfMeta) artifactFinding.hfMeta = hfMeta;
292
+ newFindings.push(artifactFinding);
293
+ break;
294
+ }
295
+ }
296
+ }
297
+ }
298
+
299
+ // Apply NEW_ORG and POSTINSTALL_FETCH tags to all findings for this repo
300
+ const repoSpoofFindings = spoofFindings.filter(f => f.referencedRepo === referencedRepo);
301
+ for (const sf of repoSpoofFindings) {
302
+ if (tags.length > 0) {
303
+ if (!sf.tags) sf.tags = [];
304
+ for (const t of tags) {
305
+ if (!sf.tags.includes(t)) sf.tags.push(t);
306
+ }
307
+ }
308
+ if (hfMeta) {
309
+ sf.hfMeta = hfMeta;
310
+ }
311
+ }
312
+ for (const nf of newFindings) {
313
+ if (nf.referencedRepo === referencedRepo) {
314
+ if (tags.length > 0) {
315
+ if (!nf.tags) nf.tags = [];
316
+ for (const t of tags) {
317
+ if (!nf.tags.includes(t)) nf.tags.push(t);
318
+ }
319
+ }
320
+ }
321
+ }
322
+ }
323
+
324
+ // POSTINSTALL_FETCH escalation
325
+ if (postinstallFetchFlag) {
326
+ const allStage2Findings = [...spoofFindings, ...newFindings];
327
+ const escalatedRepos = new Set();
328
+ for (const f of allStage2Findings) {
329
+ if (f.referencedRepo) escalatedRepos.add(f.referencedRepo);
330
+ }
331
+ for (const f of allStage2Findings) {
332
+ if (escalatedRepos.has(f.referencedRepo)) {
333
+ if (severityIndex(f.severity) < severityIndex('critical')) {
334
+ f.severity = 'critical';
335
+ }
336
+ if (!f.tags) f.tags = [];
337
+ if (!f.tags.includes('POSTINSTALL_ESCALATED')) {
338
+ f.tags.push('POSTINSTALL_ESCALATED');
339
+ }
340
+ }
341
+ }
342
+ }
343
+
344
+ return newFindings;
345
+ }
346
+
347
+ export async function scan(pkgJson, files = [], registryMeta = null, allFiles = null) {
348
+ const { tuples, postinstallFetchFlag } = extractHFTuples(pkgJson, allFiles || files);
349
+
350
+ if (tuples.size === 0) return [];
351
+
352
+ // Stage 1: org spoof detection (local only)
353
+ const spoofFindings = [];
354
+ const orgsToCheck = []; // [referencedRepo, { org, canonicalOrg, similarityScore, finding }]
355
+
356
+ for (const tuple of tuples) {
357
+ const parts = tuple.split('/');
358
+ if (parts.length < 2) continue;
359
+ const org = parts[0];
360
+
361
+ const canonicalOrg = findClosestOrg(org);
362
+ if (!canonicalOrg.org) continue;
363
+ if (org.toLowerCase() === canonicalOrg.org.toLowerCase()) continue;
364
+
365
+ const finding = buildHFOrgSpoofFinding(tuple, org, canonicalOrg, canonicalOrg.score, postinstallFetchFlag, []);
366
+ spoofFindings.push(finding);
367
+ orgsToCheck.push([tuple, { org, canonicalOrg, similarityScore: canonicalOrg.score, finding }]);
368
+ }
369
+
370
+ if (spoofFindings.length === 0) return [];
371
+
372
+ // Stage 2: network checks
373
+ const stage2Findings = await runStage2(spoofFindings, orgsToCheck, postinstallFetchFlag);
374
+
375
+ // Deduplicate POSTINSTALL_ESCALATED tag in evidence
376
+ for (const f of [...spoofFindings, ...stage2Findings]) {
377
+ if (f.tags && f.tags.length > 0) {
378
+ try {
379
+ const ev = JSON.parse(f.evidence);
380
+ ev.tags = [...f.tags];
381
+ f.evidence = JSON.stringify(ev);
382
+ } catch {
383
+ // evidence wasn't JSON, leave as-is
384
+ }
385
+ }
386
+ }
387
+
388
+ return [...spoofFindings, ...stage2Findings];
389
+ }
390
+
391
+ export function clearCache() {
392
+ _cache.clear();
393
+ _lastFetchTime = 0;
394
+ }
395
+
396
+ export { KNOWN_HF_ORGS, jaroWinkler, simhash, simhashSimilarity };