@lateos/npm-scan 0.15.1 → 0.15.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,396 @@
1
+ import { KNOWN_HF_ORGS } from './known-orgs.js';
2
+ import { jaroWinkler } from './jaro-winkler.js';
3
+ import { simhash, similarity as simhashSimilarity } from './simhash.js';
4
+
5
// ---------------------------------------------------------------------------
// Reference-extraction patterns. All four carry the /g flag, so `lastIndex`
// is stateful; callers reset it before each scan.
// ---------------------------------------------------------------------------

// "huggingface.co/<org>/<repo>" or "hf.co/<org>/<repo>"; groups: org, repo.
const HF_URL_PATTERN = /(?:huggingface\.co|hf\.co)\/([^\/\s"'>]+)\/([^\/\s"'>]+)/g;
// from_pretrained("<org>/<repo>"); group 1 is the quoted "org/repo" string.
const FROM_PRETRAINED_PATTERN = /from_pretrained\(\s*["']([^"']+\/[^"']+)["']/g;
// hub.download("<org>/<repo>"); group 1 is the quoted "org/repo" string.
const HUB_DOWNLOAD_SINGLE = /hub\.download\(\s*["']([^"']+\/[^"']+)["']/g;
// hub.download("<first>", "<second>"); groups are the two quoted arguments.
const HUB_DOWNLOAD_DOUBLE = /hub\.download\(\s*["']([^"']+)["']\s*,\s*["']([^"']+)["']/g;

// package.json script hooks treated as install-time lifecycle scripts.
const LIFECYCLE_SCRIPTS = new Set([
  'postinstall',
  'prepare',
  'install',
]);
// Base URL used for every Hub request made by this module.
const API_BASE = 'https://huggingface.co';

// Severity label -> numeric rank, and the labels listed by ascending rank.
const SEVERITY_SCORE = {
  none: 0,
  low: 1,
  medium: 2,
  high: 3,
  critical: 4,
};
const SEVERITY_LABELS = ['none', 'low', 'medium', 'high', 'critical'];

// `library_name` values for which a Windows-style binary in the repo is flagged.
const HF_ARTIFACT_LIBS = new Set([
  'transformers',
  'diffusers',
  'sentence-transformers',
  'gguf',
  'safetensors',
]);
// File extensions considered suspicious inside a model repository.
const SUSPICIOUS_EXTENSIONS = /\.(exe|msi|bat|ps1|dll)$/i;

// In-memory response cache (url -> { data, fetchedAt }) and its TTL in ms.
const _cache = new Map();
const CACHE_TTL = 60 * 60 * 1000;
// Timestamp (ms) of the most recent outbound request, used for throttling.
let _lastFetchTime = 0;
22
+
23
/**
 * Map a severity label to its numeric rank in SEVERITY_SCORE.
 *
 * @param {string} sev - Severity label (e.g. 'low', 'critical').
 * @returns {number} The rank, or 0 when the label is not a known severity.
 */
function severityIndex(sev) {
  // Own-property check: a bare `SEVERITY_SCORE[sev]` lookup also resolves
  // inherited members (e.g. "constructor" -> a function), which would leak a
  // non-numeric value into the comparisons built on top of this helper.
  return Object.prototype.hasOwnProperty.call(SEVERITY_SCORE, sev)
    ? SEVERITY_SCORE[sev]
    : 0;
}
26
+
27
/**
 * Pick the more severe of two severity labels.
 *
 * @param {string} a - First severity label.
 * @param {string} b - Second severity label.
 * @returns {string} `a` when its rank is at least that of `b`, otherwise `b`.
 */
function maxSeverity(a, b) {
  if (severityIndex(a) >= severityIndex(b)) {
    return a;
  }
  return b;
}
30
+
31
/**
 * Resolve after a delay.
 *
 * @param {number} ms - Delay in milliseconds, passed straight to setTimeout.
 * @returns {Promise<void>} Promise that resolves once the timer fires.
 */
function sleep(ms) {
  return new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
}
34
+
35
/**
 * GET a JSON document, served from the in-memory cache when a fresh entry
 * exists. Requests are spaced at least 100 ms apart, and a single retry is
 * made after an HTTP 429.
 *
 * @param {string} url - Absolute URL to fetch.
 * @returns {Promise<any|null>} Parsed JSON body, or null on a non-OK status
 *   or any fetch/parse error (failures are logged via console.debug and are
 *   not cached).
 */
async function fetchWithCache(url) {
  const DEFAULT_RETRY_AFTER_SECONDS = 5;
  // Upper bound so a hostile or misconfigured Retry-After cannot stall the scan.
  const MAX_RETRY_AFTER_SECONDS = 60;

  const cached = _cache.get(url);
  if (cached && Date.now() - cached.fetchedAt < CACHE_TTL) {
    return cached.data;
  }

  // Throttle: keep at least 100 ms between outbound requests.
  const elapsed = Date.now() - _lastFetchTime;
  if (elapsed < 100) {
    await sleep(100 - elapsed);
  }
  _lastFetchTime = Date.now();

  try {
    let res = await fetch(url);
    if (res.status === 429) {
      // Retry-After may be delay-seconds OR an HTTP-date. The date form parses
      // to NaN, which previously turned into an immediate retry; fall back to
      // the default instead, and clamp numeric values.
      const headerSeconds = Number.parseInt(res.headers.get('Retry-After') ?? '', 10);
      const retrySeconds = Number.isFinite(headerSeconds) && headerSeconds >= 0
        ? Math.min(headerSeconds, MAX_RETRY_AFTER_SECONDS)
        : DEFAULT_RETRY_AFTER_SECONDS;
      await sleep(retrySeconds * 1000);
      res = await fetch(url);
    }
    if (!res.ok) {
      console.debug(`HF API warning: ${url} returned ${res.status}`);
      return null;
    }
    const data = await res.json();
    _cache.set(url, { data, fetchedAt: Date.now() });
    return data;
  } catch (err) {
    // Best-effort lookup: network and JSON errors degrade to "no data".
    console.debug(`HF API warning: ${err.message}`);
    return null;
  }
}
66
+
67
/**
 * GET a text document (used for README files), served from the in-memory
 * cache when a fresh entry exists. Requests are spaced at least 100 ms apart,
 * and a single retry is made after an HTTP 429.
 *
 * @param {string} url - Absolute URL to fetch.
 * @returns {Promise<string|null>} Response body text, or null on a non-OK
 *   status or any fetch error (failures are not cached).
 */
async function fetchReadme(url) {
  const DEFAULT_RETRY_AFTER_SECONDS = 5;
  // Upper bound so a hostile or misconfigured Retry-After cannot stall the scan.
  const MAX_RETRY_AFTER_SECONDS = 60;

  const cached = _cache.get(url);
  if (cached && Date.now() - cached.fetchedAt < CACHE_TTL) {
    return cached.data;
  }

  // Throttle: keep at least 100 ms between outbound requests.
  const elapsed = Date.now() - _lastFetchTime;
  if (elapsed < 100) {
    await sleep(100 - elapsed);
  }
  _lastFetchTime = Date.now();

  try {
    let res = await fetch(url);
    if (res.status === 429) {
      // Retry-After may be delay-seconds OR an HTTP-date. The date form parses
      // to NaN, which previously turned into an immediate retry; fall back to
      // the default instead, and clamp numeric values.
      const headerSeconds = Number.parseInt(res.headers.get('Retry-After') ?? '', 10);
      const retrySeconds = Number.isFinite(headerSeconds) && headerSeconds >= 0
        ? Math.min(headerSeconds, MAX_RETRY_AFTER_SECONDS)
        : DEFAULT_RETRY_AFTER_SECONDS;
      await sleep(retrySeconds * 1000);
      res = await fetch(url);
    }
    if (!res.ok) return null;

    const text = await res.text();
    _cache.set(url, { data: text, fetchedAt: Date.now() });
    return text;
  } catch (err) {
    // Best-effort lookup: network errors degrade to "no README".
    console.debug(`HF README warning: ${err.message}`);
    return null;
  }
}
98
+
99
/**
 * Find the known HF org whose name is most similar to `spoofedOrg`.
 * Names are compared case-insensitively with Jaro-Winkler; only scores of at
 * least 0.82 qualify, and the first org reaching the top score wins.
 *
 * @param {*} spoofedOrg - Org name to test (coerced with String()).
 * @returns {{org: (string|null), score: number}} Best match, or
 *   `{ org: null, score: 0 }` when nothing reaches the threshold.
 */
function findClosestOrg(spoofedOrg) {
  const needle = String(spoofedOrg).toLowerCase();
  return KNOWN_HF_ORGS.reduce(
    (best, known) => {
      const score = jaroWinkler(needle, known.toLowerCase());
      const isBetter = score >= 0.82 && score > best.score;
      return isBetter ? { org: known, score } : best;
    },
    { org: null, score: 0 },
  );
}
110
+
111
/**
 * Collect every HuggingFace repo reference found in the package's npm scripts
 * and in its JavaScript/TypeScript source files.
 *
 * @param {object} pkgJson - Parsed package.json; only `scripts` is read.
 * @param {Iterable<{path?: string, content?: *}>} [allFiles] - Package files;
 *   only paths ending in js/ts/jsx/tsx/mjs/cjs with string content are scanned.
 * @returns {{tuples: Set<string>, postinstallFetchFlag: boolean}} The
 *   references found, and whether any of them appeared in a lifecycle script
 *   (one of LIFECYCLE_SCRIPTS).
 */
function extractHFTuples(pkgJson, allFiles) {
  const tuples = new Set();
  let postinstallFetchFlag = false;

  // Extraction rules, applied to every text in this exact order.
  const rules = [
    [HF_URL_PATTERN, (hit) => `${hit[1]}/${hit[2]}`],
    [FROM_PRETRAINED_PATTERN, (hit) => hit[1]],
    [HUB_DOWNLOAD_SINGLE, (hit) => hit[1]],
    [HUB_DOWNLOAD_DOUBLE, (hit) => `${hit[1]}/${hit[2]}`],
  ];

  // Record every reference in `text`; report whether anything matched.
  const harvest = (text) => {
    let matchedAny = false;
    for (const [pattern, toTuple] of rules) {
      // The patterns are shared /g regexes, so rewind before each scan.
      pattern.lastIndex = 0;
      let hit = pattern.exec(text);
      while (hit !== null) {
        tuples.add(toTuple(hit));
        matchedAny = true;
        hit = pattern.exec(text);
      }
    }
    return matchedAny;
  };

  for (const [hook, script] of Object.entries(pkgJson?.scripts || {})) {
    if (typeof script !== 'string') continue;
    const matchedAny = harvest(script);
    if (matchedAny && LIFECYCLE_SCRIPTS.has(hook)) {
      postinstallFetchFlag = true;
    }
  }

  if (allFiles) {
    for (const file of allFiles) {
      if (!file.path?.match(/\.(js|ts|jsx|tsx|mjs|cjs)$/i)) continue;
      harvest(typeof file.content === 'string' ? file.content : '');
    }
  }

  return { tuples, postinstallFetchFlag };
}
182
+
183
/**
 * Build an HF_ORG_SPOOF finding for a repo whose org resembles a known org.
 *
 * @param {string} referencedRepo - The "org/repo" reference found in the package.
 * @param {string} org - Org part of the reference.
 * @param {{org: string}} canonicalOrg - Closest known org (its `org` is reported).
 * @param {number} similarityScore - Similarity between `org` and the known org.
 * @param {boolean} postinstallFetchFlag - Accepted for signature parity; not read here.
 * @param {string[]} [tags] - Tags to attach; defaults to an empty list.
 * @param {object} [hfMeta] - Optional Hub metadata, attached only when truthy.
 * @returns {object} The finding, with `evidence` as a JSON string.
 */
function buildHFOrgSpoofFinding(referencedRepo, org, canonicalOrg, similarityScore, postinstallFetchFlag, tags, hfMeta) {
  const knownOrg = canonicalOrg.org;
  const tagList = tags || [];
  const description = `Repository "${referencedRepo}" references org "${org}" which is similar to known HF org "${knownOrg}" (similarity: ${similarityScore.toFixed(3)})`;
  const evidence = JSON.stringify({
    referencedRepo,
    canonicalOrg: knownOrg,
    similarityScore,
    tags: tagList,
  });

  return {
    id: 'HF_ORG_SPOOF',
    severity: 'high',
    title: 'HuggingFace org impersonation',
    description,
    evidence,
    referencedRepo,
    canonicalOrg: knownOrg,
    similarityScore,
    tags: tagList,
    ipiClass: 'SUPPLY_CHAIN',
    ...(hfMeta ? { hfMeta } : {}),
  };
}
206
+
207
/**
 * Stage 2: enrich the stage-1 spoof findings with network checks against the
 * Hub and derive additional findings.
 *
 * For each suspected repo this looks up the repo, its canonical counterpart
 * and the org record, then:
 *   - tags the repo NEW_ORG when the org record is younger than 30 days,
 *   - emits HF_README_CLONE when both READMEs have simhash similarity >= 0.9,
 *   - emits HF_ARTIFACT_MISMATCH when a repo declaring a library listed in
 *     HF_ARTIFACT_LIBS ships a file matching SUSPICIOUS_EXTENSIONS.
 * When `postinstallFetchFlag` is set, every finding that names a repo is
 * escalated to 'critical' and tagged POSTINSTALL_ESCALATED.
 *
 * Mutates the objects in `spoofFindings` (tags, hfMeta, severity).
 *
 * @param {object[]} spoofFindings - Stage-1 findings; updated in place.
 * @param {Array<[string, {org: string, canonicalOrg: {org: string}}]>} orgsToCheck
 *   Pairs of referenced repo and the org data resolved in stage 1.
 * @param {boolean} postinstallFetchFlag - Whether a lifecycle script referenced the Hub.
 * @returns {Promise<object[]>} Findings created by this stage only.
 */
async function runStage2(spoofFindings, orgsToCheck, postinstallFetchFlag) {
  const newFindings = [];

  // Merge `extra` into a finding's tag list without duplicating entries.
  const mergeTags = (finding, extra) => {
    if (extra.length === 0) return;
    if (!finding.tags) finding.tags = [];
    for (const tag of extra) {
      if (!finding.tags.includes(tag)) finding.tags.push(tag);
    }
  };

  for (const [referencedRepo, { org, canonicalOrg }] of orgsToCheck) {
    const tags = [];
    let hfMeta = null;
    const knownOrg = canonicalOrg.org;
    const orgDiffers = knownOrg !== org;
    const repoName = referencedRepo.split('/')[1];

    // The org/repo text comes from scanned package content, so encode each
    // path segment: characters such as "?", "#" or "%" must not change the
    // shape of the request URL.
    let repoPath;
    let canonicalPath;
    let orgSegment;
    try {
      repoPath = referencedRepo.split('/').map((segment) => encodeURIComponent(segment)).join('/');
      canonicalPath = `${encodeURIComponent(knownOrg)}/${encodeURIComponent(repoName)}`;
      orgSegment = encodeURIComponent(org);
    } catch (err) {
      // encodeURIComponent throws on malformed UTF-16; the stage-1 finding
      // still stands, only the network enrichment is skipped.
      console.debug(`HF API warning: cannot encode "${referencedRepo}": ${err.message}`);
      continue;
    }

    // Lookups stay sequential because the fetch helpers throttle on shared state.
    const spoofedModel = await fetchWithCache(`${API_BASE}/api/models/${repoPath}`);
    if (orgDiffers) {
      // Result is unused here; the call is kept so the request sequence and
      // cache contents match the previous behaviour.
      await fetchWithCache(`${API_BASE}/api/models/${canonicalPath}`);
    }
    const userData = await fetchWithCache(`${API_BASE}/api/users/${orgSegment}`);

    // Org age check for the NEW_ORG tag.
    if (userData?.dateCreated) {
      const ageDays = (Date.now() - new Date(userData.dateCreated).getTime()) / (1000 * 60 * 60 * 24);
      // An unparseable date yields NaN; skip the metadata rather than report it.
      if (Number.isFinite(ageDays)) {
        hfMeta = {
          orgAgeDays: Math.round(ageDays),
          repoDownloads: spoofedModel?.downloads ?? 0,
        };
        if (ageDays < 30) {
          tags.push('NEW_ORG');
        }
      }
    }

    // README clone check.
    if (orgDiffers) {
      const readmeSpoof = await fetchReadme(`${API_BASE}/${repoPath}/resolve/main/README.md`);
      const readmeCanonical = await fetchReadme(`${API_BASE}/${canonicalPath}/resolve/main/README.md`);

      if (readmeSpoof && readmeCanonical) {
        const simScore = simhashSimilarity(simhash(readmeSpoof), simhash(readmeCanonical));
        if (simScore >= 0.9) {
          const readmeFinding = {
            id: 'HF_README_CLONE',
            severity: 'high',
            title: 'HuggingFace README clone',
            description: `README of "${referencedRepo}" is highly similar (${(simScore * 100).toFixed(1)}%) to canonical org "${knownOrg}/${repoName}"`,
            evidence: JSON.stringify({
              referencedRepo,
              canonicalOrg: knownOrg,
              similarityScore: simScore,
              tags: [],
            }),
            referencedRepo,
            canonicalOrg: knownOrg,
            similarityScore: simScore,
            tags: [],
            ipiClass: 'SUPPLY_CHAIN',
          };
          if (hfMeta) readmeFinding.hfMeta = hfMeta;
          newFindings.push(readmeFinding);
        }
      }
    }

    // Artifact mismatch check: report the first suspicious file only.
    const libName = spoofedModel?.cardData?.library_name;
    if (libName && Array.isArray(spoofedModel.siblings) && HF_ARTIFACT_LIBS.has(libName)) {
      const suspiciousFile = spoofedModel.siblings
        .map((sibling) => sibling?.rfilename || '')
        .find((filename) => SUSPICIOUS_EXTENSIONS.test(filename));

      if (suspiciousFile) {
        const artifactConflict = { declaredType: libName, suspiciousFilename: suspiciousFile };
        const artifactFinding = {
          id: 'HF_ARTIFACT_MISMATCH',
          severity: 'critical',
          title: 'HF artifact mismatch — suspicious binary in model repo',
          description: `Model "${referencedRepo}" declares library "${libName}" but contains suspicious file "${suspiciousFile}"`,
          evidence: JSON.stringify({
            referencedRepo,
            artifactConflict,
            tags: [],
          }),
          referencedRepo,
          artifactConflict: { ...artifactConflict },
          tags: [],
          ipiClass: 'SUPPLY_CHAIN',
        };
        if (hfMeta) artifactFinding.hfMeta = hfMeta;
        newFindings.push(artifactFinding);
      }
    }

    // Propagate this repo's tags (and metadata) to every finding about it.
    for (const spoofFinding of spoofFindings) {
      if (spoofFinding.referencedRepo !== referencedRepo) continue;
      mergeTags(spoofFinding, tags);
      if (hfMeta) spoofFinding.hfMeta = hfMeta;
    }
    for (const derived of newFindings) {
      if (derived.referencedRepo === referencedRepo) mergeTags(derived, tags);
    }
  }

  // POSTINSTALL_FETCH escalation: a Hub reference inside a lifecycle script
  // raises every repo-bound finding to critical.
  if (postinstallFetchFlag) {
    for (const finding of [...spoofFindings, ...newFindings]) {
      if (!finding.referencedRepo) continue;
      if (severityIndex(finding.severity) < severityIndex('critical')) {
        finding.severity = 'critical';
      }
      if (!finding.tags) finding.tags = [];
      if (!finding.tags.includes('POSTINSTALL_ESCALATED')) {
        finding.tags.push('POSTINSTALL_ESCALATED');
      }
    }
  }

  return newFindings;
}
346
+
347
/**
 * Scan a package for references to HuggingFace repos whose org imitates a
 * known org.
 *
 * Stage 1 is local: every extracted "org/repo" reference whose org is close
 * to, but not equal to, a known org yields an HF_ORG_SPOOF finding. Stage 2
 * (network) runs only when stage 1 found something.
 *
 * @param {object} pkgJson - Parsed package.json.
 * @param {Array} [files=[]] - Package files; used when `allFiles` is not given.
 * @param {*} [registryMeta=null] - Not read by this scanner.
 * @param {Array|null} [allFiles=null] - Full file list; preferred over `files`.
 * @returns {Promise<object[]>} Stage-1 findings followed by stage-2 findings.
 */
export async function scan(pkgJson, files = [], registryMeta = null, allFiles = null) {
  const { tuples, postinstallFetchFlag } = extractHFTuples(pkgJson, allFiles || files);
  if (tuples.size === 0) return [];

  // Stage 1: org spoof detection (local only)
  const spoofFindings = [];
  const orgsToCheck = []; // [referencedRepo, { org, canonicalOrg, similarityScore, finding }]

  for (const tuple of tuples) {
    const [org, ...remainder] = tuple.split('/');
    if (remainder.length === 0) continue;

    const canonicalOrg = findClosestOrg(org);
    // Skip when nothing is close enough, or when the org IS the known org.
    const isLookalike = canonicalOrg.org && org.toLowerCase() !== canonicalOrg.org.toLowerCase();
    if (!isLookalike) continue;

    const { score } = canonicalOrg;
    const finding = buildHFOrgSpoofFinding(tuple, org, canonicalOrg, score, postinstallFetchFlag, []);
    spoofFindings.push(finding);
    orgsToCheck.push([tuple, { org, canonicalOrg, similarityScore: score, finding }]);
  }

  if (spoofFindings.length === 0) return [];

  // Stage 2: network checks
  const stage2Findings = await runStage2(spoofFindings, orgsToCheck, postinstallFetchFlag);
  const combined = [...spoofFindings, ...stage2Findings];

  // Stage 2 may have added tags after `evidence` was serialised; rewrite the
  // evidence JSON so its `tags` mirror the finding's final tag list.
  for (const finding of combined) {
    if (!finding.tags || finding.tags.length === 0) continue;
    try {
      const parsedEvidence = JSON.parse(finding.evidence);
      parsedEvidence.tags = [...finding.tags];
      finding.evidence = JSON.stringify(parsedEvidence);
    } catch {
      // evidence wasn't JSON, leave as-is
    }
  }

  return combined;
}
390
+
391
/**
 * Reset this module's fetch state: rewinds the throttle timestamp and drops
 * every cached response.
 */
export function clearCache() {
  _lastFetchTime = 0;
  _cache.clear();
}
395
+
396
+ export { KNOWN_HF_ORGS, jaroWinkler, simhash, simhashSimilarity };
@@ -0,0 +1,44 @@
1
/**
 * Jaro-Winkler similarity between two strings.
 *
 * @param {string} s1 - First string.
 * @param {string} s2 - Second string.
 * @returns {number} 1 for identical strings, 0 when either is empty or no
 *   characters match, otherwise the Jaro score boosted by 0.1 per matching
 *   leading character (at most 4).
 */
export function jaroWinkler(s1, s2) {
  if (s1 === s2) return 1;

  const lenA = s1.length;
  const lenB = s2.length;
  if (lenA === 0 || lenB === 0) return 0;

  // Characters count as matching only within this distance of each other.
  const reach = Math.floor(Math.max(lenA, lenB) / 2) - 1;
  const usedA = new Array(lenA).fill(false);
  const usedB = new Array(lenB).fill(false);
  let common = 0;

  for (let i = 0; i < lenA; i++) {
    const stop = Math.min(lenB, i + reach + 1);
    let j = Math.max(0, i - reach);
    // Advance to the first unused, equal character inside the window.
    while (j < stop && (usedB[j] || s1[i] !== s2[j])) {
      j++;
    }
    if (j < stop) {
      usedA[i] = true;
      usedB[j] = true;
      common++;
    }
  }

  if (common === 0) return 0;

  // Count matched characters that appear in a different order in s2.
  let outOfOrder = 0;
  let cursor = 0;
  for (let i = 0; i < lenA; i++) {
    if (usedA[i]) {
      while (!usedB[cursor]) cursor++;
      if (s1[i] !== s2[cursor]) outOfOrder++;
      cursor++;
    }
  }

  const jaro = (common / lenA + common / lenB + (common - outOfOrder / 2) / common) / 3;

  // Winkler boost: length of the common prefix, capped at 4.
  const prefixCap = Math.min(4, lenA, lenB);
  let prefix = 0;
  while (prefix < prefixCap && s1[prefix] === s2[prefix]) {
    prefix++;
  }

  return jaro + prefix * 0.1 * (1 - jaro);
}
@@ -0,0 +1,5 @@
1
/**
 * HuggingFace org names that references are compared against when looking
 * for look-alike orgs. Casing is kept as listed here.
 */
export const KNOWN_HF_ORGS = [
  'openai',
  'meta-llama',
  'mistralai',
  'google',
  'microsoft',
  'stabilityai',
  'EleutherAI',
  'huggingface',
  'tiiuae',
  'cohere',
  'anthropic',
  'deepseek-ai',
  'Qwen',
  'NousResearch',
  'teknium',
];
@@ -0,0 +1,46 @@
1
// FNV-1a 64-bit parameters.
const FNV64_OFFSET_BASIS = 0xcbf29ce484222325n;
const FNV64_PRIME = 0x100000001b3n;

/**
 * 64-bit FNV-1a hash of a token, folded over its UTF-16 code units.
 *
 * The previous hash was 32 bits wide, so a 64-bit fingerprint built from it
 * carried only 32 bits of information.
 *
 * @param {string} str - Token to hash.
 * @returns {bigint} Unsigned 64-bit hash.
 */
function hashToken(str) {
  let hash = FNV64_OFFSET_BASIS;
  for (let i = 0; i < str.length; i++) {
    hash ^= BigInt(str.charCodeAt(i));
    hash = BigInt.asUintN(64, hash * FNV64_PRIME);
  }
  return hash;
}

/**
 * Compute a 64-bit SimHash fingerprint of a text.
 *
 * The text is lower-cased and split on whitespace; every token votes +1 or -1
 * on each of the 64 bit positions according to its hash, and a fingerprint
 * bit is set where the votes sum to a positive value.
 *
 * @param {string} text - Text to fingerprint; null/undefined count as empty.
 * @returns {bigint} Fingerprint in the range [0, 2^64); 0n when there are no tokens.
 */
export function simhash(text) {
  const votes = new Array(64).fill(0);
  const tokens = String(text ?? '').toLowerCase().split(/\s+/).filter(Boolean);

  for (const token of tokens) {
    const hash = hashToken(token);
    // Split into two 32-bit halves: Number shift operators use the shift
    // count modulo 32, so bits 32..63 cannot be read from a single Number.
    const low = Number(BigInt.asUintN(32, hash));
    const high = Number(hash >> 32n);
    for (let bit = 0; bit < 32; bit++) {
      votes[bit] += (low >>> bit) & 1 ? 1 : -1;
      votes[bit + 32] += (high >>> bit) & 1 ? 1 : -1;
    }
  }

  let fingerprint = 0n;
  for (let bit = 0; bit < 64; bit++) {
    if (votes[bit] > 0) {
      fingerprint |= 1n << BigInt(bit);
    }
  }
  return fingerprint;
}
33
+
34
/**
 * Count the bit positions in which two BigInt values differ.
 *
 * @param {bigint} a - First value.
 * @param {bigint} b - Second value.
 * @returns {number} Number of set bits in `a ^ b`; 0 when that XOR is not positive.
 */
export function hammingDistance(a, b) {
  let differing = 0;
  // Each step clears the lowest set bit, so the loop runs once per set bit.
  for (let diff = a ^ b; diff > 0n; diff &= diff - 1n) {
    differing++;
  }
  return differing;
}
43
+
44
/**
 * Similarity of two 64-bit fingerprints as the share of equal bits.
 *
 * @param {bigint} a - First fingerprint.
 * @param {bigint} b - Second fingerprint.
 * @returns {number} 1 minus the Hamming distance divided by 64.
 */
export function similarity(a, b) {
  const differingBits = hammingDistance(a, b);
  return 1 - differingBits / 64;
}
@@ -9,8 +9,10 @@ import * as atk008 from './atk-008-tarball-tamper.js';
9
9
  import * as atk009 from './atk-009-dormant-trigger.js';
10
10
  import * as atk010 from './atk-010-sandbox-evasion.js';
11
11
  import * as atk011 from './atk-011-transitive-prop.js';
12
+ import { scanAll as megalodonScan } from './megalodon/index.js';
13
+ import { scan as hfScan } from './hf-impersonation/index.js';
12
14
 
13
- export async function runAll(pkgJson, files = []) {
15
+ export async function runAll(pkgJson, files = [], registryMeta = null, allFiles = null) {
14
16
  const findings = [];
15
17
  findings.push(...await atk001.scan(pkgJson, files));
16
18
  findings.push(...await atk002.scan(pkgJson, files));
@@ -23,5 +25,7 @@ export async function runAll(pkgJson, files = []) {
23
25
  findings.push(...await atk009.scan(pkgJson, files));
24
26
  findings.push(...await atk010.scan(pkgJson, files));
25
27
  findings.push(...await atk011.scan(pkgJson, files));
28
+ findings.push(...await megalodonScan(pkgJson, allFiles || files, registryMeta));
29
+ findings.push(...await hfScan(pkgJson, files, registryMeta, allFiles || files));
26
30
  return findings.sort((a, b) => b.severity.localeCompare(a.severity));
27
31
  }
@@ -0,0 +1,147 @@
1
+ import { MegalodonSignal } from './types.js';
2
+ import yaml from 'js-yaml';
3
+
4
// curl call to an http(s) URL whose host is not one of the allow-listed GitHub
// hosts. The allow-list is anchored on a host boundary ("/", ":", "?", "#",
// whitespace, quote or end of input): a prefix-only check would also exempt
// look-alike hosts such as "github.com.evil.example".
const C2_EXFIL_RE = /curl\s+.*?https?:\/\/(?!(?:github\.com|githubusercontent\.com|raw\.githubusercontent\.com)(?=[\/:?#\s'"]|$))[^\s'"]+/i;
// Reference to a secret: "${{ secrets.NAME" or "${ secrets.NAME".
const SECRETS_REF_RE = /\$\{\{?\s*secrets\.\w+/;
// base64 decode (-d, -D or --decode) piped or redirected into sh/bash.
const B64_DECODE_CHAIN_RE = /base64\s+(?:--decode|-d|-D)\s*[|>]\s*(ba)?sh/;
7
+
8
/**
 * Tell whether a file entry is a GitHub Actions workflow, i.e. its path
 * contains ".github/workflows/" and ends in .yml or .yaml (case-insensitive).
 * Backslash separators are treated like forward slashes.
 *
 * @param {{path?: string}} f - File entry.
 * @returns {boolean} False for entries without a string `path`.
 */
function isWorkflowFile(f) {
  // Used as a filter predicate over package files, so a malformed entry must
  // be rejected rather than throw and abort the whole scan.
  if (typeof f?.path !== 'string') return false;
  const normalized = f.path.replace(/\\/g, '/');
  return /\.github\/workflows\/.+\.(yml|yaml)$/i.test(normalized);
}
12
+
13
/**
 * Count the lines of a text that are neither blank nor "#" comments.
 *
 * @param {string} text - Text split on "\n".
 * @returns {number} Number of non-empty lines whose trimmed form does not start with "#".
 */
function countExecutableLines(text) {
  let total = 0;
  for (const rawLine of text.split('\n')) {
    const line = rawLine.trim();
    if (line !== '' && !line.startsWith('#')) {
      total += 1;
    }
  }
  return total;
}
16
+
17
/**
 * Collect every `run` string and every `env` mapping found anywhere in a
 * parsed workflow document.
 *
 * Each container object is walked once. Parsed YAML can contain anchors and
 * aliases, which produce shared or self-referencing objects; without this
 * guard a cyclic document recurses until the stack overflows and a document
 * with heavily shared aliases is re-walked exponentially.
 *
 * @param {*} parsed - Parsed workflow (any shape; non-objects yield []).
 * @returns {Array<string|{_env: object}>} `run` strings and `{ _env }` wrappers
 *   in traversal order.
 */
function extractRunBlocks(parsed) {
  const runs = [];
  if (!parsed || typeof parsed !== 'object') return runs;

  const visited = new Set();
  const walk = (node) => {
    if (!node || typeof node !== 'object') return;
    if (visited.has(node)) return;
    visited.add(node);

    if (Array.isArray(node)) {
      for (const item of node) walk(item);
      return;
    }
    for (const [key, value] of Object.entries(node)) {
      if (key === 'run' && typeof value === 'string') {
        runs.push(value);
      }
      if (key === 'env' && typeof value === 'object' && value !== null) {
        runs.push({ _env: value });
      }
      walk(value);
    }
  };

  walk(parsed);
  return runs;
}
37
+
38
/**
 * Regex-based extraction of `run` and `env` blocks from raw workflow text.
 *
 * @param {string} text - Raw workflow file content.
 * @returns {Array<string|{_env: string}>} Block-scalar `run` bodies first,
 *   then quoted single-line `run` values, then `{ _env }` wrappers holding the
 *   raw text that follows each `env:` line.
 */
function extractRunBlocksRaw(text) {
  // `run: |` / `run: >` followed by lines indented by two or more whitespace
  // characters; the "run:" header line is stripped from every hit.
  const blockScalars = (text.match(/run:\s*[|>]\s*\n(\s{2,}.*(?:\n\s{2,}.*)*)/g) || [])
    .map((whole) => whole.replace(/^run:\s*[|>]\s*\n/, ''));

  // `run: '...'` or `run: "..."` closing at the end of a line.
  const quotedInline = Array.from(
    text.matchAll(/run:\s*['"](.+?)['"]\s*$/gm),
    (hit) => hit[1],
  );

  // `env:` followed by indented `KEY: value` lines, kept as raw text.
  const envBlocks = Array.from(
    text.matchAll(/env:\s*\n((?:\s{2,}\w+:\s*.+\n?)*)/g),
    (hit) => ({ _env: hit[1] }),
  );

  return [...blockScalars, ...quotedInline, ...envBlocks];
}
52
+
53
/**
 * Test one workflow step for a signal, looking at its `run` script together
 * with the string values of its `env` mapping.
 *
 * @param {{run?: *, env?: object}} step - A workflow step.
 * @param {'exfil'|'decode'} signal - 'exfil' requires both an outbound curl
 *   call and a secrets reference; 'decode' requires a base64-decode-to-shell chain.
 * @returns {boolean} False for unknown signals, for steps without a string
 *   `run`, and for null or non-object steps.
 */
function runInStepHasBoth(step, signal) {
  // YAML such as "steps: [~]" or an empty "- " item yields null entries.
  if (!step || typeof step !== 'object') return false;

  const { run, env } = step;
  // Without a run script there is nothing to match against.
  if (typeof run !== 'string') return false;

  const envText = env
    ? Object.values(env).filter((value) => typeof value === 'string').join(' ')
    : '';
  const combined = `${run} ${envText}`;

  switch (signal) {
    case 'exfil':
      return C2_EXFIL_RE.test(combined) && SECRETS_REF_RE.test(combined);
    case 'decode':
      return B64_DECODE_CHAIN_RE.test(combined);
    default:
      return false;
  }
}
66
+
67
/**
 * Scan GitHub Actions workflow files for two signals:
 *   - WORKFLOW_C2_EXFIL: an outbound curl call together with a secrets reference,
 *   - WORKFLOW_DECODE_CHAIN: base64 output decoded into a shell.
 * Each signal is reported at most once per file. Files larger than 512 KiB or
 * without string content are skipped. When YAML parsing fails (or yields an
 * empty document) the raw text is scanned with regexes instead.
 *
 * @param {Array<{path: string, content: string}>} allFiles - Package files.
 * @returns {Promise<Array<{signal: *, file: string, excerpt: string, detail: string}>>}
 *   Evidence entries in discovery order.
 */
export async function scan(allFiles) {
  const evidence = [];
  const workflowFiles = Array.isArray(allFiles) ? allFiles.filter(isWorkflowFile) : [];

  for (const f of workflowFiles) {
    if (typeof f.content !== 'string' || f.content.length > 512 * 1024) continue;

    let parsed = null;
    try {
      parsed = yaml.load(f.content);
    } catch {
      // Malformed YAML: fall through to the regex-based extraction below.
    }

    const rawRunBlocks = parsed ? extractRunBlocks(parsed) : extractRunBlocksRaw(f.content);
    const runStrings = rawRunBlocks.filter((block) => typeof block === 'string');

    // Evidence recorded for THIS file, so later annotation cannot touch
    // entries that belong to another workflow.
    const fileEvidence = [];
    let exfilTriggered = false;
    let decodeTriggered = false;

    const record = (signal, script, detail) => {
      const entry = { signal, file: f.path, excerpt: script.slice(0, 120), detail };
      fileEvidence.push(entry);
      evidence.push(entry);
    };

    // Pass 1: each run script on its own.
    for (const runStr of runStrings) {
      if (!exfilTriggered && C2_EXFIL_RE.test(runStr) && SECRETS_REF_RE.test(runStr)) {
        exfilTriggered = true;
        record(
          MegalodonSignal.WORKFLOW_C2_EXFIL,
          runStr,
          'C2 outbound call co-occurs with credentials reference in run block',
        );
      }
      if (!decodeTriggered && B64_DECODE_CHAIN_RE.test(runStr)) {
        decodeTriggered = true;
        record(
          MegalodonSignal.WORKFLOW_DECODE_CHAIN,
          runStr,
          'Base64 decode pipe to shell — obfuscated payload execution',
        );
      }
    }

    // Pass 2: per step, so a secret passed through `env` counts alongside `run`.
    if (parsed && typeof parsed === 'object' && !Array.isArray(parsed)) {
      // Workflow YAML is untrusted: jobs may be null and `steps` may be any shape.
      const jobs = parsed.jobs && typeof parsed.jobs === 'object' ? Object.values(parsed.jobs) : [];
      const steps = jobs.flatMap((job) => (Array.isArray(job?.steps) ? job.steps : []));

      for (const step of steps) {
        if (!step || typeof step !== 'object') continue;
        const runVal = typeof step.run === 'string' ? step.run : '';

        if (!exfilTriggered && runInStepHasBoth(step, 'exfil')) {
          exfilTriggered = true;
          record(
            MegalodonSignal.WORKFLOW_C2_EXFIL,
            runVal,
            'C2 outbound call co-occurs with secrets reference in same step',
          );
        }
        if (!decodeTriggered && runInStepHasBoth(step, 'decode')) {
          decodeTriggered = true;
          record(
            MegalodonSignal.WORKFLOW_DECODE_CHAIN,
            runVal,
            'Base64 decode pipe to shell — obfuscated payload execution',
          );
        }
      }
    }

    // Footprint note: appended to this file's first evidence entry when the
    // file has between 100 and 120 non-blank, non-comment lines.
    const lineCount = countExecutableLines(f.content);
    if (fileEvidence.length > 0 && lineCount >= 100 && lineCount <= 120) {
      fileEvidence[0].detail += ` | Matches ${lineCount}-line Megalodon payload footprint`;
    }
  }

  return evidence;
}