muaddib-scanner 2.11.38 → 2.11.40
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/{self-scan-v2.11.38.json → self-scan-v2.11.40.json} +34 -7
- package/src/ml/jsonl-writer.js +64 -6
- package/src/monitor/classify.js +8 -2
- package/src/monitor/ingestion.js +29 -9
- package/src/monitor/queue.js +29 -4
- package/src/monitor/state.js +22 -0
- package/src/monitor/webhook.js +29 -4
- package/src/response/playbooks.js +10 -0
- package/src/rules/index.js +15 -0
- package/src/scanner/ai-config.js +32 -3
- package/src/scanner/obfuscation.js +1 -48
- package/src/shared/download.js +97 -6
- package/src/shared/unicode-invisibles.js +164 -0
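package/package.json
CHANGED
[hunk not captured in this extract]
package/{self-scan-v2.11.38.json → self-scan-v2.11.40.json}
CHANGED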
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"target": "node_modules",
|
|
3
|
-
"timestamp": "2026-05-
|
|
3
|
+
"timestamp": "2026-05-25T09:38:49.363Z",
|
|
4
4
|
"threats": [
|
|
5
5
|
{
|
|
6
6
|
"type": "string_mutation_obfuscation",
|
|
@@ -870,6 +870,27 @@
|
|
|
870
870
|
"playbook": "CRITIQUE: Execution de commande shell dangereuse detectee. Isoler la machine. Verifier si la commande a ete executee.",
|
|
871
871
|
"points": 3
|
|
872
872
|
},
|
|
873
|
+
{
|
|
874
|
+
"type": "unicode_invisible_injection",
|
|
875
|
+
"severity": "CRITICAL",
|
|
876
|
+
"message": "10 invisible Unicode characters detected (zero-width, variation selectors, tag chars). Possible hidden payload encoded via invisible codepoints.",
|
|
877
|
+
"file": "iconv-lite/encodings/sbcs-data-generated.js",
|
|
878
|
+
"count": 1,
|
|
879
|
+
"reductions": [],
|
|
880
|
+
"originalSeverity": "CRITICAL",
|
|
881
|
+
"confidenceTier": "medium",
|
|
882
|
+
"rule_id": "MUADDIB-OBF-003",
|
|
883
|
+
"rule_name": "Unicode Invisible Character Injection",
|
|
884
|
+
"confidence": "high",
|
|
885
|
+
"domain": "malware",
|
|
886
|
+
"references": [
|
|
887
|
+
"https://www.aikido.dev/blog/glassworm-returns-unicode-attack-github-npm-vscode",
|
|
888
|
+
"https://attack.mitre.org/techniques/T1027/"
|
|
889
|
+
],
|
|
890
|
+
"mitre": "T1027",
|
|
891
|
+
"playbook": "CRITIQUE: Caracteres Unicode invisibles detectes (zero-width, variation selectors). Technique GlassWorm: du code malveillant est encode via des variation selectors invisibles dans les editeurs. Analyser le fichier avec un editeur hexa. Supprimer le package immediatement. Verifier les autres fichiers du projet pour des injections similaires.",
|
|
892
|
+
"points": 25
|
|
893
|
+
},
|
|
873
894
|
{
|
|
874
895
|
"type": "high_entropy_string",
|
|
875
896
|
"severity": "LOW",
|
|
@@ -1107,17 +1128,17 @@
|
|
|
1107
1128
|
],
|
|
1108
1129
|
"python": null,
|
|
1109
1130
|
"summary": {
|
|
1110
|
-
"total":
|
|
1111
|
-
"critical":
|
|
1131
|
+
"total": 52,
|
|
1132
|
+
"critical": 3,
|
|
1112
1133
|
"high": 6,
|
|
1113
1134
|
"medium": 28,
|
|
1114
1135
|
"low": 15,
|
|
1115
1136
|
"riskScore": 35,
|
|
1116
1137
|
"riskLevel": "MEDIUM",
|
|
1117
1138
|
"globalRiskScore": 100,
|
|
1118
|
-
"maxFileScore":
|
|
1139
|
+
"maxFileScore": 26,
|
|
1119
1140
|
"packageScore": 1,
|
|
1120
|
-
"mostSuspiciousFile": "
|
|
1141
|
+
"mostSuspiciousFile": "iconv-lite/encodings/sbcs-data-generated.js",
|
|
1121
1142
|
"fileScores": {
|
|
1122
1143
|
"esquery/parser.js": 5,
|
|
1123
1144
|
"ajv/lib/ajv.js": 25,
|
|
@@ -1133,7 +1154,7 @@
|
|
|
1133
1154
|
"eslint/lib/config/config-loader.js": 11,
|
|
1134
1155
|
"eslint/lib/eslint/eslint-helpers.js": 25,
|
|
1135
1156
|
"eslint/lib/eslint/eslint.js": 13,
|
|
1136
|
-
"iconv-lite/encodings/sbcs-data-generated.js":
|
|
1157
|
+
"iconv-lite/encodings/sbcs-data-generated.js": 26,
|
|
1137
1158
|
"iconv-lite/encodings/sbcs-data.js": 1,
|
|
1138
1159
|
"ajv/lib/compile/formats.js": 1
|
|
1139
1160
|
},
|
|
@@ -1169,6 +1190,12 @@
|
|
|
1169
1190
|
"points": 25,
|
|
1170
1191
|
"reason": "Dynamic import() with computed URL argument — remote code loading from dynamically constructed URL."
|
|
1171
1192
|
},
|
|
1193
|
+
{
|
|
1194
|
+
"rule": "MUADDIB-OBF-003",
|
|
1195
|
+
"type": "unicode_invisible_injection",
|
|
1196
|
+
"points": 25,
|
|
1197
|
+
"reason": "10 invisible Unicode characters detected (zero-width, variation selectors, tag chars). Possible hidden payload encoded via invisible codepoints."
|
|
1198
|
+
},
|
|
1172
1199
|
{
|
|
1173
1200
|
"rule": "MUADDIB-AST-006",
|
|
1174
1201
|
"type": "dynamic_require",
|
|
@@ -1461,7 +1488,7 @@
|
|
|
1461
1488
|
"tierCounts": {
|
|
1462
1489
|
"verified": 0,
|
|
1463
1490
|
"high": 0,
|
|
1464
|
-
"medium":
|
|
1491
|
+
"medium": 10,
|
|
1465
1492
|
"low": 42
|
|
1466
1493
|
},
|
|
1467
1494
|
"perceivedFlagged": 0
|
package/src/ml/jsonl-writer.js
CHANGED
|
@@ -18,12 +18,19 @@ const DEFAULT_TRAINING_FILE = path.join(__dirname, '..', '..', 'data', 'ml-train
|
|
|
18
18
|
let TRAINING_FILE = DEFAULT_TRAINING_FILE;
|
|
19
19
|
const MAX_JSONL_SIZE = 100 * 1024 * 1024; // 100MB rotation threshold
|
|
20
20
|
|
|
21
|
+
// In-memory line counter. null = needs recompute (cold boot, file rewrite, or
|
|
22
|
+
// path swap). Maintained incrementally by appendRecord and invalidated by
|
|
23
|
+
// relabelRecords and setTrainingFile. Prior to this cache, getStats read the
|
|
24
|
+
// entire JSONL into RAM on every daily report (72MB allocation × ~30K records).
|
|
25
|
+
let _cachedLineCount = null;
|
|
26
|
+
|
|
21
27
|
/**
|
|
22
28
|
* Override the training file path (for testing).
|
|
23
29
|
* @param {string} filePath - new file path
|
|
24
30
|
*/
|
|
25
31
|
function setTrainingFile(filePath) {
|
|
26
32
|
TRAINING_FILE = filePath;
|
|
33
|
+
_cachedLineCount = null; // different file → recompute on next getStats
|
|
27
34
|
}
|
|
28
35
|
|
|
29
36
|
/**
|
|
@@ -31,6 +38,7 @@ function setTrainingFile(filePath) {
|
|
|
31
38
|
*/
|
|
32
39
|
function resetTrainingFile() {
|
|
33
40
|
TRAINING_FILE = DEFAULT_TRAINING_FILE;
|
|
41
|
+
_cachedLineCount = null;
|
|
34
42
|
}
|
|
35
43
|
|
|
36
44
|
/**
|
|
@@ -49,6 +57,7 @@ function appendRecord(record) {
|
|
|
49
57
|
|
|
50
58
|
const line = JSON.stringify(record) + '\n';
|
|
51
59
|
fs.appendFileSync(TRAINING_FILE, line, 'utf8');
|
|
60
|
+
if (_cachedLineCount !== null) _cachedLineCount++;
|
|
52
61
|
} catch (err) {
|
|
53
62
|
// Non-fatal: JSONL export failure should never crash the monitor
|
|
54
63
|
// Log permission errors so they are visible in journalctl (was silent before v2.10.27)
|
|
@@ -73,6 +82,7 @@ function maybeRotate() {
|
|
|
73
82
|
const timestamp = new Date().toISOString().replace(/[:.]/g, '-');
|
|
74
83
|
const rotatedName = TRAINING_FILE.replace('.jsonl', `-${timestamp}.jsonl`);
|
|
75
84
|
fs.renameSync(TRAINING_FILE, rotatedName);
|
|
85
|
+
_cachedLineCount = 0; // fresh file starts empty
|
|
76
86
|
console.log(`[ML] Rotated training file → ${path.basename(rotatedName)} (${(stat.size / 1024 / 1024).toFixed(1)}MB)`);
|
|
77
87
|
} catch (err) {
|
|
78
88
|
console.error(`[ML] Rotation failed: ${err.message}`);
|
|
@@ -107,25 +117,71 @@ function readRecords() {
|
|
|
107
117
|
}
|
|
108
118
|
|
|
109
119
|
/**
|
|
110
|
-
*
|
|
120
|
+
* Stream-count newlines in a file using 64KB chunks. Counts non-empty
|
|
121
|
+
* logical records: each `\n`-terminated line that contains at least one
|
|
122
|
+
* non-whitespace byte. Matches the semantics of the old split-based count
|
|
123
|
+
* while avoiding the full-file readFileSync.
|
|
124
|
+
*
|
|
125
|
+
* @param {string} filePath
|
|
126
|
+
* @returns {number}
|
|
127
|
+
*/
|
|
128
|
+
function countLinesStreaming(filePath) {
|
|
129
|
+
const BUFFER_SIZE = 64 * 1024;
|
|
130
|
+
let fd;
|
|
131
|
+
try {
|
|
132
|
+
fd = fs.openSync(filePath, 'r');
|
|
133
|
+
} catch {
|
|
134
|
+
return 0;
|
|
135
|
+
}
|
|
136
|
+
try {
|
|
137
|
+
const buf = Buffer.alloc(BUFFER_SIZE);
|
|
138
|
+
let count = 0;
|
|
139
|
+
let sawContent = false;
|
|
140
|
+
let bytesRead;
|
|
141
|
+
while ((bytesRead = fs.readSync(fd, buf, 0, BUFFER_SIZE, null)) > 0) {
|
|
142
|
+
for (let i = 0; i < bytesRead; i++) {
|
|
143
|
+
const b = buf[i];
|
|
144
|
+
if (b === 0x0A) { // '\n'
|
|
145
|
+
if (sawContent) count++;
|
|
146
|
+
sawContent = false;
|
|
147
|
+
} else if (b !== 0x20 && b !== 0x09 && b !== 0x0D) {
|
|
148
|
+
// any non-whitespace byte (space, tab, CR are still whitespace)
|
|
149
|
+
sawContent = true;
|
|
150
|
+
}
|
|
151
|
+
}
|
|
152
|
+
}
|
|
153
|
+
if (sawContent) count++; // trailing record without final newline
|
|
154
|
+
return count;
|
|
155
|
+
} finally {
|
|
156
|
+
try { fs.closeSync(fd); } catch {}
|
|
157
|
+
}
|
|
158
|
+
}
|
|
159
|
+
|
|
160
|
+
/**
|
|
161
|
+
* Get stats about the current JSONL file. Uses an in-memory line counter
|
|
162
|
+
* that is maintained incrementally by appendRecord and invalidated by
|
|
163
|
+
* rewrite operations — so getStats is O(1) on the hot path of the daily
|
|
164
|
+
* report (previously O(file size) via readFileSync on a 72MB+ file).
|
|
165
|
+
*
|
|
111
166
|
* @returns {{ recordCount: number, fileSizeBytes: number, fileSizeMB: string }}
|
|
112
167
|
*/
|
|
113
168
|
function getStats() {
|
|
114
169
|
try {
|
|
115
170
|
if (!fs.existsSync(TRAINING_FILE)) {
|
|
171
|
+
_cachedLineCount = 0;
|
|
116
172
|
return { recordCount: 0, fileSizeBytes: 0, fileSizeMB: '0.0' };
|
|
117
173
|
}
|
|
118
174
|
const stat = fs.statSync(TRAINING_FILE);
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
175
|
+
if (_cachedLineCount === null) {
|
|
176
|
+
_cachedLineCount = countLinesStreaming(TRAINING_FILE);
|
|
177
|
+
}
|
|
122
178
|
return {
|
|
123
|
-
recordCount:
|
|
179
|
+
recordCount: _cachedLineCount,
|
|
124
180
|
fileSizeBytes: stat.size,
|
|
125
181
|
fileSizeMB: (stat.size / 1024 / 1024).toFixed(1)
|
|
126
182
|
};
|
|
127
183
|
} catch {
|
|
128
|
-
return { recordCount: 0, fileSizeBytes: 0, fileSizeMB: '0.0' };
|
|
184
|
+
return { recordCount: _cachedLineCount || 0, fileSizeBytes: 0, fileSizeMB: '0.0' };
|
|
129
185
|
}
|
|
130
186
|
}
|
|
131
187
|
|
|
@@ -183,6 +239,8 @@ function relabelRecords(packageName, newLabel, sandboxFindingCount, manualReview
|
|
|
183
239
|
|
|
184
240
|
if (updated > 0) {
|
|
185
241
|
fs.writeFileSync(TRAINING_FILE, newLines.join('\n'), 'utf8');
|
|
242
|
+
// File was rewritten — line count cache must be recomputed on next read.
|
|
243
|
+
_cachedLineCount = null;
|
|
186
244
|
console.log(`[ML] Relabeled ${updated} records for ${packageName} → ${newLabel}`);
|
|
187
245
|
}
|
|
188
246
|
return updated;
|
package/src/monitor/classify.js
CHANGED
|
@@ -225,11 +225,15 @@ function isSuspectClassification(result) {
|
|
|
225
225
|
/**
|
|
226
226
|
* Classify an error into a category for the daily report breakdown.
|
|
227
227
|
* @param {Error} err
|
|
228
|
-
* @returns {'too_large'|'tar_failed'|'http_error'|'static_timeout'|'timeout'|'other'}
|
|
228
|
+
* @returns {'too_large'|'tar_failed'|'archive_failed'|'unsupported_format'|'http_error'|'static_timeout'|'timeout'|'other'}
|
|
229
229
|
*/
|
|
230
230
|
function classifyError(err) {
|
|
231
231
|
const msg = (err && err.message) || '';
|
|
232
|
-
if (/too large|tarball too large
|
|
232
|
+
if (/too large|tarball too large|exceeds \d+/i.test(msg)) return 'too_large';
|
|
233
|
+
// Wheel/zip extraction failures must NOT be lumped with tar failures —
|
|
234
|
+
// they were the dominant noise before adm-zip dispatch.
|
|
235
|
+
if (/unsupported archive format/i.test(msg)) return 'unsupported_format';
|
|
236
|
+
if (/zip[\s_-]|wheel|whl\b/i.test(msg)) return 'archive_failed';
|
|
233
237
|
if (/tar\b|extract/i.test(msg)) return 'tar_failed';
|
|
234
238
|
if (/HTTP [45]\d\d|HTTP \d{3}/i.test(msg)) return 'http_error';
|
|
235
239
|
if (/static scan timeout/i.test(msg)) return 'static_timeout';
|
|
@@ -257,6 +261,8 @@ function formatErrorBreakdown(total, byType) {
|
|
|
257
261
|
const parts = [];
|
|
258
262
|
if (byType.http_error > 0) parts.push(`HTTP: ${byType.http_error}`);
|
|
259
263
|
if (byType.tar_failed > 0) parts.push(`tar: ${byType.tar_failed}`);
|
|
264
|
+
if (byType.archive_failed > 0) parts.push(`zip: ${byType.archive_failed}`);
|
|
265
|
+
if (byType.unsupported_format > 0) parts.push(`unsupported: ${byType.unsupported_format}`);
|
|
260
266
|
if (byType.too_large > 0) parts.push(`too large: ${byType.too_large}`);
|
|
261
267
|
if (byType.timeout > 0) parts.push(`timeout: ${byType.timeout}`);
|
|
262
268
|
if (byType.static_timeout > 0) parts.push(`static: ${byType.static_timeout}`);
|
package/src/monitor/ingestion.js
CHANGED
|
@@ -46,7 +46,8 @@ let consecutivePollErrors = 0;
|
|
|
46
46
|
// `ingestion._deps.httpsPost = fakePost` and have it take effect inside
|
|
47
47
|
// pollPyPIChangelog. Kept tiny on purpose — only network I/O lives here.
|
|
48
48
|
const _deps = {
|
|
49
|
-
httpsPost: null // populated below once httpsPost is defined
|
|
49
|
+
httpsPost: null, // populated below once httpsPost is defined
|
|
50
|
+
httpsGet: null // populated below; used by npm pollers so tests can stub
|
|
50
51
|
};
|
|
51
52
|
|
|
52
53
|
function getConsecutivePollErrors() {
|
|
@@ -131,6 +132,7 @@ function httpsPost(url, body, headers = {}, timeoutMs = 30_000) {
|
|
|
131
132
|
}
|
|
132
133
|
|
|
133
134
|
_deps.httpsPost = httpsPost;
|
|
135
|
+
_deps.httpsGet = httpsGet;
|
|
134
136
|
|
|
135
137
|
async function getWeeklyDownloads(packageName) {
|
|
136
138
|
const cached = downloadsCache.get(packageName);
|
|
@@ -162,7 +164,7 @@ async function getPyPITarballUrl(packageName, packageVersion = '') {
|
|
|
162
164
|
const url = packageVersion
|
|
163
165
|
? `https://pypi.org/pypi/${encodeURIComponent(packageName)}/${encodeURIComponent(packageVersion)}/json`
|
|
164
166
|
: `https://pypi.org/pypi/${encodeURIComponent(packageName)}/json`;
|
|
165
|
-
const body = await httpsGet(url);
|
|
167
|
+
const body = await _deps.httpsGet(url);
|
|
166
168
|
let data;
|
|
167
169
|
try {
|
|
168
170
|
data = JSON.parse(body);
|
|
@@ -177,8 +179,11 @@ async function getPyPITarballUrl(packageName, packageVersion = '') {
|
|
|
177
179
|
// Fallback: any .tar.gz
|
|
178
180
|
const tarGz = urls.find(u => u.url && u.url.endsWith('.tar.gz'));
|
|
179
181
|
if (tarGz) return { url: tarGz.url, version };
|
|
180
|
-
// Fallback:
|
|
181
|
-
|
|
182
|
+
// Fallback: wheel (.whl) — extracted via adm-zip in queue.js, not tar.
|
|
183
|
+
// Legacy .egg / .tar.bz2 / .exe installers intentionally NOT returned —
|
|
184
|
+
// they were the cause of ~2773 tar_failed/day before this fix.
|
|
185
|
+
const wheel = urls.find(u => u.url && (u.url.endsWith('.whl') || u.url.endsWith('.zip')));
|
|
186
|
+
if (wheel) return { url: wheel.url, version };
|
|
182
187
|
return { url: null, version };
|
|
183
188
|
}
|
|
184
189
|
|
|
@@ -405,7 +410,7 @@ async function pollNpmChanges(state, scanQueue, stats) {
|
|
|
405
410
|
|
|
406
411
|
// First run: initialize to current seq ("now") via root endpoint
|
|
407
412
|
if (lastSeq == null) {
|
|
408
|
-
const infoBody = await httpsGet('https://replicate.npmjs.com/registry/', 10000);
|
|
413
|
+
const infoBody = await _deps.httpsGet('https://replicate.npmjs.com/registry/', 10000);
|
|
409
414
|
const info = JSON.parse(infoBody);
|
|
410
415
|
const currentSeq = info.update_seq;
|
|
411
416
|
if (currentSeq == null) {
|
|
@@ -423,13 +428,13 @@ async function pollNpmChanges(state, scanQueue, stats) {
|
|
|
423
428
|
const url = `${CHANGES_STREAM_URL}?since=${lastSeq}&limit=${CHANGES_LIMIT}`;
|
|
424
429
|
let body, data;
|
|
425
430
|
try {
|
|
426
|
-
body = await httpsGet(url, 60000);
|
|
431
|
+
body = await _deps.httpsGet(url, 60000);
|
|
427
432
|
data = JSON.parse(body);
|
|
428
433
|
} catch (fetchErr) {
|
|
429
434
|
// Invalid seq (stale from pre-migration CouchDB) or transient error — re-init to current seq
|
|
430
435
|
console.warn(`[MONITOR] Changes stream fetch failed (${fetchErr.message}) — attempting seq re-init`);
|
|
431
436
|
try {
|
|
432
|
-
const reinitBody = await httpsGet('https://replicate.npmjs.com/registry/', 10000);
|
|
437
|
+
const reinitBody = await _deps.httpsGet('https://replicate.npmjs.com/registry/', 10000);
|
|
433
438
|
const reinitData = JSON.parse(reinitBody);
|
|
434
439
|
if (reinitData.update_seq != null) {
|
|
435
440
|
state.npmLastSeq = reinitData.update_seq;
|
|
@@ -450,7 +455,7 @@ async function pollNpmChanges(state, scanQueue, stats) {
|
|
|
450
455
|
|
|
451
456
|
// Catch-up protection: if too far behind, skip to current
|
|
452
457
|
if (data.results.length === CHANGES_LIMIT) {
|
|
453
|
-
const currentSeqBody = await httpsGet('https://replicate.npmjs.com/registry/', 10000);
|
|
458
|
+
const currentSeqBody = await _deps.httpsGet('https://replicate.npmjs.com/registry/', 10000);
|
|
454
459
|
const currentSeqData = JSON.parse(currentSeqBody);
|
|
455
460
|
const currentSeq = currentSeqData.update_seq;
|
|
456
461
|
if (typeof currentSeq === 'number' && typeof data.last_seq === 'number' &&
|
|
@@ -459,12 +464,22 @@ async function pollNpmChanges(state, scanQueue, stats) {
|
|
|
459
464
|
console.warn(`[MONITOR] Changes stream too far behind (${gap} changes) — skipping to current`);
|
|
460
465
|
stats.npmCatchupSkips = (stats.npmCatchupSkips || 0) + 1;
|
|
461
466
|
stats.npmCatchupSkippedSeqs = (stats.npmCatchupSkippedSeqs || 0) + gap;
|
|
467
|
+
// Catch-up gap = events we know happened but chose to skip. They must
|
|
468
|
+
// appear in the coverage denominator so the daily report exposes the
|
|
469
|
+
// gap as low coverage (and the catch-up line explains why).
|
|
470
|
+
stats.npmPublishEventsSeen = (stats.npmPublishEventsSeen || 0) + gap;
|
|
462
471
|
state.npmLastSeq = currentSeq;
|
|
463
472
|
saveNpmSeq(currentSeq);
|
|
464
473
|
return 0;
|
|
465
474
|
}
|
|
466
475
|
}
|
|
467
476
|
|
|
477
|
+
// IMPORTANT: count raw events BEFORE filtering — otherwise the coverage
|
|
478
|
+
// denominator is biased (matches "events we queued", not "events npm
|
|
479
|
+
// emitted"). The filters below drop _design/self/@types/deleted, but
|
|
480
|
+
// those were still real changes-stream events.
|
|
481
|
+
stats.npmPublishEventsSeen = (stats.npmPublishEventsSeen || 0) + data.results.length;
|
|
482
|
+
|
|
468
483
|
let queued = 0;
|
|
469
484
|
for (const change of data.results) {
|
|
470
485
|
// Skip deleted packages
|
|
@@ -584,7 +599,7 @@ async function pollNpmRss(state, scanQueue, stats) {
|
|
|
584
599
|
await acquireRegistrySlot();
|
|
585
600
|
let body;
|
|
586
601
|
try {
|
|
587
|
-
body = await httpsGet(url);
|
|
602
|
+
body = await _deps.httpsGet(url);
|
|
588
603
|
} finally {
|
|
589
604
|
releaseRegistrySlot();
|
|
590
605
|
}
|
|
@@ -603,6 +618,11 @@ async function pollNpmRss(state, scanQueue, stats) {
|
|
|
603
618
|
}
|
|
604
619
|
}
|
|
605
620
|
|
|
621
|
+
// Mirror pollNpmChanges: count raw events BEFORE per-package filters
|
|
622
|
+
// so the coverage denominator stays accurate when the changes stream
|
|
623
|
+
// falls back to RSS.
|
|
624
|
+
stats.npmPublishEventsSeen = (stats.npmPublishEventsSeen || 0) + newPackages.length;
|
|
625
|
+
|
|
606
626
|
for (const name of newPackages) {
|
|
607
627
|
if (name === SELF_PACKAGE_NAME) {
|
|
608
628
|
console.log(`[MONITOR] SKIPPED (self): ${name}`);
|
package/src/monitor/queue.js
CHANGED
|
@@ -13,7 +13,7 @@ const { Worker } = require('worker_threads');
|
|
|
13
13
|
const { run } = require('../index.js');
|
|
14
14
|
const { runSandbox, isDockerAvailable, tryAcquireSandboxSlot, SANDBOX_CONCURRENCY_MAX } = require('../sandbox/index.js');
|
|
15
15
|
const { sendWebhook } = require('../webhook.js');
|
|
16
|
-
const { downloadToFile, extractTarGz, sanitizePackageName } = require('../shared/download.js');
|
|
16
|
+
const { downloadToFile, extractTarGz, extractArchive, sanitizePackageName } = require('../shared/download.js');
|
|
17
17
|
const { MAX_TARBALL_SIZE } = require('../shared/constants.js');
|
|
18
18
|
const { acquireRegistrySlot, releaseRegistrySlot } = require('../shared/http-limiter.js');
|
|
19
19
|
const { loadCachedIOCs } = require('../ioc/updater.js');
|
|
@@ -294,10 +294,22 @@ async function scanPackage(name, version, ecosystem, tarballUrl, registryMeta, s
|
|
|
294
294
|
if (metaSize > MAX_TARBALL_SIZE) {
|
|
295
295
|
console.log(`[MONITOR] SIZE_REJECT: ${name}@${version} — metadata size ${(metaSize / 1024 / 1024).toFixed(1)}MB exceeds ${(MAX_TARBALL_SIZE / 1024 / 1024).toFixed(0)}MB limit (skipped without download)`);
|
|
296
296
|
stats.scanned++;
|
|
297
|
+
stats.totalTimeMs += Date.now() - startTime;
|
|
297
298
|
return;
|
|
298
299
|
}
|
|
299
300
|
|
|
300
|
-
|
|
301
|
+
// Pick the local filename extension from the URL so adm-zip / tar both
|
|
302
|
+
// read the magic correctly. PyPI wheels arrive as .whl, npm tarballs as
|
|
303
|
+
// .tgz, sdists as .tar.gz. Anything else falls through to .tar.gz
|
|
304
|
+
// (ingestion now returns null for unsupported types, so this branch is
|
|
305
|
+
// a defensive default rather than a real fallback).
|
|
306
|
+
const urlLower = (tarballUrl || '').toLowerCase();
|
|
307
|
+
const isWheel = urlLower.endsWith('.whl') || urlLower.endsWith('.zip');
|
|
308
|
+
const archiveExt = isWheel ? '.whl' : '.tar.gz';
|
|
309
|
+
const tgzPath = path.join(tmpDir, `package${archiveExt}`);
|
|
310
|
+
if (isWheel && ecosystem === 'pypi') {
|
|
311
|
+
stats.pypiWheelsScanned = (stats.pypiWheelsScanned || 0) + 1;
|
|
312
|
+
}
|
|
301
313
|
|
|
302
314
|
// Layer 3: Check tarball cache before downloading
|
|
303
315
|
const cacheKey = tarballCacheKey(name, version);
|
|
@@ -338,6 +350,7 @@ async function scanPackage(name, version, ecosystem, tarballUrl, registryMeta, s
|
|
|
338
350
|
if (fileSize > MAX_TARBALL_SIZE) {
|
|
339
351
|
console.log(`[MONITOR] SKIP: ${name}@${version} — tarball too large (${(fileSize / 1024 / 1024).toFixed(1)}MB)`);
|
|
340
352
|
stats.scanned++;
|
|
353
|
+
stats.totalTimeMs += Date.now() - startTime;
|
|
341
354
|
return;
|
|
342
355
|
}
|
|
343
356
|
|
|
@@ -365,7 +378,7 @@ async function scanPackage(name, version, ecosystem, tarballUrl, registryMeta, s
|
|
|
365
378
|
let bypassQuickScan = false;
|
|
366
379
|
try {
|
|
367
380
|
alreadyExtracted = true;
|
|
368
|
-
extractedDir =
|
|
381
|
+
extractedDir = extractArchive(tgzPath, tmpDir);
|
|
369
382
|
|
|
370
383
|
const [pkgThreats, shellThreats] = await Promise.all([
|
|
371
384
|
scanPackageJson(extractedDir),
|
|
@@ -382,6 +395,7 @@ async function scanPackage(name, version, ecosystem, tarballUrl, registryMeta, s
|
|
|
382
395
|
} else {
|
|
383
396
|
console.log(`[MONITOR] SIZE_SKIP: ${name}@${version} — large package (${(unpackedSize / 1024 / 1024).toFixed(1)}MB, quick scan clean)`);
|
|
384
397
|
stats.scanned++;
|
|
398
|
+
stats.totalTimeMs += Date.now() - startTime;
|
|
385
399
|
stats.clean++;
|
|
386
400
|
updateScanStats('clean');
|
|
387
401
|
return;
|
|
@@ -402,6 +416,7 @@ async function scanPackage(name, version, ecosystem, tarballUrl, registryMeta, s
|
|
|
402
416
|
} else {
|
|
403
417
|
console.log(`[MONITOR] SIZE_SKIP: ${name}@${version} — large package (${(unpackedSize / 1024 / 1024).toFixed(1)}MB, extract failed)`);
|
|
404
418
|
stats.scanned++;
|
|
419
|
+
stats.totalTimeMs += Date.now() - startTime;
|
|
405
420
|
stats.clean++;
|
|
406
421
|
updateScanStats('clean');
|
|
407
422
|
return;
|
|
@@ -411,7 +426,7 @@ async function scanPackage(name, version, ecosystem, tarballUrl, registryMeta, s
|
|
|
411
426
|
}
|
|
412
427
|
|
|
413
428
|
if (!extractedDir) {
|
|
414
|
-
extractedDir =
|
|
429
|
+
extractedDir = extractArchive(tgzPath, tmpDir);
|
|
415
430
|
}
|
|
416
431
|
|
|
417
432
|
// ML Phase 2a: Count JS files and detect test presence for enriched features
|
|
@@ -1169,6 +1184,11 @@ async function resolveTarballAndScan(item, stats, dailyAlerts, recentlyScanned,
|
|
|
1169
1184
|
try {
|
|
1170
1185
|
const pypiInfo = await getPyPITarballUrl(item.name, item.version || '');
|
|
1171
1186
|
if (!pypiInfo.url) {
|
|
1187
|
+
// No sdist / .tar.gz / wheel — likely a legacy egg or msi-only
|
|
1188
|
+
// release. Clean skip: do NOT touch stats.scanned or stats.errors
|
|
1189
|
+
// (those would distort the Commit 1 coverage ratios). The dedicated
|
|
1190
|
+
// pypiSkippedNoArchive counter surfaces volume in the daily report.
|
|
1191
|
+
stats.pypiSkippedNoArchive = (stats.pypiSkippedNoArchive || 0) + 1;
|
|
1172
1192
|
console.log(`[MONITOR] SKIP: ${item.name} — no tarball URL found on PyPI`);
|
|
1173
1193
|
return;
|
|
1174
1194
|
}
|
|
@@ -1205,6 +1225,11 @@ async function resolveTarballAndScan(item, stats, dailyAlerts, recentlyScanned,
|
|
|
1205
1225
|
return;
|
|
1206
1226
|
}
|
|
1207
1227
|
recentlyScanned.add(dedupeKey);
|
|
1228
|
+
// Coverage numerator: one count per unique (ecosystem, name, version) that
|
|
1229
|
+
// reaches a scan attempt. Excludes ATO burst extras that lose the dedup
|
|
1230
|
+
// race, retries, size-cap rejections — those inflate stats.scanned but
|
|
1231
|
+
// would distort the "% of publishes we covered" reading.
|
|
1232
|
+
stats.uniqueScanAttempts = (stats.uniqueScanAttempts || 0) + 1;
|
|
1208
1233
|
|
|
1209
1234
|
// Abort check: if timeout fired during URL resolution or dedup, bail out
|
|
1210
1235
|
if (signal && signal.aborted) return;
|
package/src/monitor/state.js
CHANGED
|
@@ -991,6 +991,8 @@ function loadDailyStats(stats, dailyAlerts) {
|
|
|
991
991
|
if (data.errorsByType) {
|
|
992
992
|
stats.errorsByType.too_large = data.errorsByType.too_large || 0;
|
|
993
993
|
stats.errorsByType.tar_failed = data.errorsByType.tar_failed || 0;
|
|
994
|
+
stats.errorsByType.archive_failed = data.errorsByType.archive_failed || 0;
|
|
995
|
+
stats.errorsByType.unsupported_format = data.errorsByType.unsupported_format || 0;
|
|
994
996
|
stats.errorsByType.http_error = data.errorsByType.http_error || 0;
|
|
995
997
|
stats.errorsByType.timeout = data.errorsByType.timeout || 0;
|
|
996
998
|
stats.errorsByType.static_timeout = data.errorsByType.static_timeout || 0;
|
|
@@ -1001,6 +1003,16 @@ function loadDailyStats(stats, dailyAlerts) {
|
|
|
1001
1003
|
stats.llmAnalyzed = data.llmAnalyzed || 0;
|
|
1002
1004
|
stats.llmSuppressed = data.llmSuppressed || 0;
|
|
1003
1005
|
stats.changesStreamPackages = data.changesStreamPackages || 0;
|
|
1006
|
+
stats.uniqueScanAttempts = data.uniqueScanAttempts || 0;
|
|
1007
|
+
stats.npmPublishEventsSeen = data.npmPublishEventsSeen || 0;
|
|
1008
|
+
stats.pypiChangelogPackages = data.pypiChangelogPackages || 0;
|
|
1009
|
+
stats.pypiChangelogEvents = data.pypiChangelogEvents || 0;
|
|
1010
|
+
stats.npmCatchupSkippedSeqs = data.npmCatchupSkippedSeqs || 0;
|
|
1011
|
+
stats.npmCatchupSkips = data.npmCatchupSkips || 0;
|
|
1012
|
+
stats.pypiCatchupSkippedEvents = data.pypiCatchupSkippedEvents || 0;
|
|
1013
|
+
stats.pypiCatchupSkips = data.pypiCatchupSkips || 0;
|
|
1014
|
+
stats.pypiWheelsScanned = data.pypiWheelsScanned || 0;
|
|
1015
|
+
stats.pypiSkippedNoArchive = data.pypiSkippedNoArchive || 0;
|
|
1004
1016
|
if (Array.isArray(data.dailyAlerts)) {
|
|
1005
1017
|
const restored = data.dailyAlerts.slice(-MAX_DAILY_ALERTS);
|
|
1006
1018
|
dailyAlerts.length = 0;
|
|
@@ -1029,6 +1041,16 @@ function saveDailyStats(stats, dailyAlerts) {
|
|
|
1029
1041
|
llmAnalyzed: stats.llmAnalyzed || 0,
|
|
1030
1042
|
llmSuppressed: stats.llmSuppressed || 0,
|
|
1031
1043
|
changesStreamPackages: stats.changesStreamPackages || 0,
|
|
1044
|
+
uniqueScanAttempts: stats.uniqueScanAttempts || 0,
|
|
1045
|
+
npmPublishEventsSeen: stats.npmPublishEventsSeen || 0,
|
|
1046
|
+
pypiChangelogPackages: stats.pypiChangelogPackages || 0,
|
|
1047
|
+
pypiChangelogEvents: stats.pypiChangelogEvents || 0,
|
|
1048
|
+
npmCatchupSkippedSeqs: stats.npmCatchupSkippedSeqs || 0,
|
|
1049
|
+
npmCatchupSkips: stats.npmCatchupSkips || 0,
|
|
1050
|
+
pypiCatchupSkippedEvents: stats.pypiCatchupSkippedEvents || 0,
|
|
1051
|
+
pypiCatchupSkips: stats.pypiCatchupSkips || 0,
|
|
1052
|
+
pypiWheelsScanned: stats.pypiWheelsScanned || 0,
|
|
1053
|
+
pypiSkippedNoArchive: stats.pypiSkippedNoArchive || 0,
|
|
1032
1054
|
dailyAlerts: dailyAlerts.slice()
|
|
1033
1055
|
};
|
|
1034
1056
|
atomicWriteFileSync(DAILY_STATS_FILE, JSON.stringify(data, null, 2));
|
package/src/monitor/webhook.js
CHANGED
|
@@ -855,11 +855,24 @@ function buildDailyReportEmbed(stats, dailyAlerts) {
|
|
|
855
855
|
const avg = stats.scanned > 0 ? (stats.totalTimeMs / stats.scanned / 1000).toFixed(1) : '0.0';
|
|
856
856
|
|
|
857
857
|
// --- Coverage estimation ---
|
|
858
|
-
//
|
|
859
|
-
|
|
858
|
+
// Numerator: unique (ecosystem, name, version) tuples that reached a scan
|
|
859
|
+
// attempt (post-dedup). Denominator: raw publish events seen on either
|
|
860
|
+
// changes stream BEFORE per-package filtering, plus npm catch-up gaps and
|
|
861
|
+
// PyPI publish events that survived per-(name,version) dedup. This stays
|
|
862
|
+
// bounded near 100% — old "scanned/changesStreamPackages" was racing PyPI
|
|
863
|
+
// scans and ATO burst extras against an npm-only denominator.
|
|
864
|
+
const attempted = stats.uniqueScanAttempts || 0;
|
|
865
|
+
const npmPub = stats.npmPublishEventsSeen || 0;
|
|
866
|
+
const pypiPub = stats.pypiChangelogPackages || 0;
|
|
867
|
+
const published = npmPub + pypiPub;
|
|
868
|
+
const coverageRatio = published > 0 ? (attempted / published * 100).toFixed(0) : '0';
|
|
869
|
+
const catchupSkipped = (stats.npmCatchupSkippedSeqs || 0) + (stats.pypiCatchupSkippedEvents || 0);
|
|
870
|
+
const opsSuffix = catchupSkipped > 0
|
|
871
|
+
? `\nOps: ${stats.scanned} | Catch-up skip: ${catchupSkipped}`
|
|
872
|
+
: `\nOps: ${stats.scanned}`;
|
|
860
873
|
const coverageText = published > 0
|
|
861
|
-
? `${
|
|
862
|
-
: `${
|
|
874
|
+
? `${attempted}/${published} (${coverageRatio}%)${opsSuffix}`
|
|
875
|
+
: `${attempted} attempted${opsSuffix}`;
|
|
863
876
|
|
|
864
877
|
// --- Timeouts ---
|
|
865
878
|
const staticTimeouts = (stats.errorsByType && stats.errorsByType.static_timeout) || 0;
|
|
@@ -1019,6 +1032,8 @@ async function sendDailyReport(stats, dailyAlerts, recentlyScanned, downloadsCac
|
|
|
1019
1032
|
stats.errors = 0;
|
|
1020
1033
|
stats.errorsByType.too_large = 0;
|
|
1021
1034
|
stats.errorsByType.tar_failed = 0;
|
|
1035
|
+
stats.errorsByType.archive_failed = 0;
|
|
1036
|
+
stats.errorsByType.unsupported_format = 0;
|
|
1022
1037
|
stats.errorsByType.http_error = 0;
|
|
1023
1038
|
stats.errorsByType.timeout = 0;
|
|
1024
1039
|
stats.errorsByType.static_timeout = 0;
|
|
@@ -1033,6 +1048,16 @@ async function sendDailyReport(stats, dailyAlerts, recentlyScanned, downloadsCac
|
|
|
1033
1048
|
// Reset LLM detective internal stats
|
|
1034
1049
|
try { require('../ml/llm-detective.js').resetStats(); } catch {}
|
|
1035
1050
|
stats.changesStreamPackages = 0;
|
|
1051
|
+
stats.uniqueScanAttempts = 0;
|
|
1052
|
+
stats.npmPublishEventsSeen = 0;
|
|
1053
|
+
stats.pypiChangelogPackages = 0;
|
|
1054
|
+
stats.pypiChangelogEvents = 0;
|
|
1055
|
+
stats.npmCatchupSkippedSeqs = 0;
|
|
1056
|
+
stats.npmCatchupSkips = 0;
|
|
1057
|
+
stats.pypiCatchupSkippedEvents = 0;
|
|
1058
|
+
stats.pypiCatchupSkips = 0;
|
|
1059
|
+
stats.pypiWheelsScanned = 0;
|
|
1060
|
+
stats.pypiSkippedNoArchive = 0;
|
|
1036
1061
|
stats.rssFallbackCount = 0;
|
|
1037
1062
|
dailyAlerts.length = 0;
|
|
1038
1063
|
recentlyScanned.clear();
|
package/src/response/playbooks.js
CHANGED
|
@@ -399,6 +399,16 @@ const PLAYBOOKS = {
|
|
|
399
399
|
'Technique Shai-Hulud (TeamPCP). Supprimer les fichiers .claude/settings.json ' +
|
|
400
400
|
'et .vscode/tasks.json avant ouverture.',
|
|
401
401
|
|
|
402
|
+
aiconf_unicode_obfuscation:
|
|
403
|
+
'CRITIQUE: Fichier de config d\'agent IA contient des caracteres Unicode invisibles ' +
|
|
404
|
+
'(zero-width, directional override, variation selectors). Technique TrapDoor (mai 2026): ' +
|
|
405
|
+
'l\'attaquant insere des U+200B au milieu de mots-cles pour echapper a la revue humaine ' +
|
|
406
|
+
'et aux regex statiques, tandis que l\'agent IA (Claude, Cursor) lit le contenu normalise ' +
|
|
407
|
+
'et execute le payload cache. NE PAS ouvrir ce projet avec un agent IA. Ouvrir le fichier ' +
|
|
408
|
+
'dans un editeur qui affiche les caracteres invisibles (VS Code: "editor.renderControlCharacters") ' +
|
|
409
|
+
'pour inspecter le contenu reel. Supprimer le fichier ou nettoyer les caracteres invisibles ' +
|
|
410
|
+
'avant toute utilisation. Si deja ouvert avec un agent IA, regenerer tous les secrets touches.',
|
|
411
|
+
|
|
402
412
|
ai_agent_abuse:
|
|
403
413
|
'CRITIQUE: Un agent IA (Claude, Gemini, Q) est invoque avec des flags de bypass de securite ' +
|
|
404
414
|
'(--dangerously-skip-permissions, --yolo, --trust-all-tools). Technique s1ngularity/Nx. ' +
|
package/src/rules/index.js
CHANGED
|
@@ -914,6 +914,21 @@ const RULES = {
|
|
|
914
914
|
],
|
|
915
915
|
mitre: 'T1546'
|
|
916
916
|
},
|
|
917
|
+
aiconf_unicode_obfuscation: {
|
|
918
|
+
id: 'MUADDIB-AICONF-004',
|
|
919
|
+
name: 'Zero-Width Unicode Obfuscation in AI Config',
|
|
920
|
+
severity: 'CRITICAL',
|
|
921
|
+
confidence: 'high',
|
|
922
|
+
domain: 'malware',
|
|
923
|
+
description: 'Fichier de configuration d\'agent IA (.cursorrules, CLAUDE.md, copilot-instructions.md) contient des caracteres Unicode invisibles (zero-width, directional override, variation selectors) qui cachent des instructions a la revue humaine ou cassent des mots-cles pour echapper a la detection regex. Technique TrapDoor (mai 2026): curl|sh interspersee de U+200B passe au travers du regex /curl/ tandis que l\'agent IA execute le payload normalise.',
|
|
924
|
+
references: [
|
|
925
|
+
'https://socket.dev/blog/trapdoor-crypto-stealer-npm-pypi-crates',
|
|
926
|
+
'https://www.aikido.dev/blog/glassworm-returns-unicode-attack-github-npm-vscode',
|
|
927
|
+
'https://trojansource.codes/',
|
|
928
|
+
'https://attack.mitre.org/techniques/T1027/'
|
|
929
|
+
],
|
|
930
|
+
mitre: 'T1027.013'
|
|
931
|
+
},
|
|
917
932
|
|
|
918
933
|
require_cache_poison: {
|
|
919
934
|
id: 'MUADDIB-AST-019',
|
package/src/scanner/ai-config.js
CHANGED
|
@@ -18,6 +18,14 @@
|
|
|
18
18
|
|
|
19
19
|
const fs = require('fs');
|
|
20
20
|
const path = require('path');
|
|
21
|
+
const { countInvisibleUnicode, stripInvisibleUnicode } = require('../shared/unicode-invisibles.js');
|
|
22
|
+
|
|
23
|
+
// Threshold above which an AI config file is flagged as ZW-Unicode-obfuscated.
|
|
24
|
+
// Lower than obfuscation.js (10) because .cursorrules / CLAUDE.md should never
|
|
25
|
+
// legitimately contain invisible codepoints — even international content uses
|
|
26
|
+
// only visible chars (CJK, accents, emoji with U+FE0F variation selector are
|
|
27
|
+
// NOT counted by countInvisibleUnicode).
|
|
28
|
+
const AI_CONFIG_ZW_THRESHOLD = 5;
|
|
21
29
|
|
|
22
30
|
// AI agent config files to scan for prompt injection (relative to project root)
|
|
23
31
|
const AI_CONFIG_FILES = [
|
|
@@ -111,7 +119,12 @@ function scanAIConfig(targetPath) {
|
|
|
111
119
|
}
|
|
112
120
|
|
|
113
121
|
const relPath = configFile;
|
|
114
|
-
|
|
122
|
+
// Normalize invisible Unicode BEFORE running regex patterns.
|
|
123
|
+
// Without this, an attacker can split keywords with U+200B (`curl`) to
|
|
124
|
+
// evade /curl\s+/ — the exact TrapDoor (mai 2026) .cursorrules vector.
|
|
125
|
+
const invisibleCount = countInvisibleUnicode(content);
|
|
126
|
+
const normalized = invisibleCount > 0 ? stripInvisibleUnicode(content) : content;
|
|
127
|
+
const fileThreats = analyzeAIConfigFile(normalized, relPath, invisibleCount);
|
|
115
128
|
threats.push(...fileThreats);
|
|
116
129
|
}
|
|
117
130
|
|
|
@@ -218,14 +231,30 @@ function analyzeIDEHookFile(content, relPath) {
|
|
|
218
231
|
}
|
|
219
232
|
|
|
220
233
|
/**
|
|
221
|
-
* Analyze a single AI config file for prompt injection patterns
|
|
234
|
+
* Analyze a single AI config file for prompt injection patterns.
|
|
235
|
+
*
|
|
236
|
+
* @param {string} content - File content, already normalized (invisible Unicode stripped).
|
|
237
|
+
* @param {string} relPath - Relative path of the config file.
|
|
238
|
+
* @param {number} invisibleCount - Number of invisible Unicode codepoints in the original (pre-strip) content.
|
|
222
239
|
*/
|
|
223
|
-
function analyzeAIConfigFile(content, relPath) {
|
|
240
|
+
function analyzeAIConfigFile(content, relPath, invisibleCount) {
|
|
224
241
|
const threats = [];
|
|
225
242
|
let hasShellCommand = false;
|
|
226
243
|
let hasExfiltration = false;
|
|
227
244
|
let hasCredentialAccess = false;
|
|
228
245
|
|
|
246
|
+
// Zero-width / directional Unicode obfuscation (TrapDoor, mai 2026).
|
|
247
|
+
// An attacker can hide instructions or split keywords with U+200B etc. so
|
|
248
|
+
// human reviewers see "harmless" text while the AI agent reads the payload.
|
|
249
|
+
if (invisibleCount >= AI_CONFIG_ZW_THRESHOLD) {
|
|
250
|
+
threats.push({
|
|
251
|
+
type: 'aiconf_unicode_obfuscation',
|
|
252
|
+
severity: 'CRITICAL',
|
|
253
|
+
message: `AI config contains ${invisibleCount} invisible Unicode characters (zero-width / directional / variation selectors) in ${relPath} — content was normalized before pattern matching. Possible hidden instructions or keyword-splitting evasion (TrapDoor pattern).`,
|
|
254
|
+
file: relPath
|
|
255
|
+
});
|
|
256
|
+
}
|
|
257
|
+
|
|
229
258
|
// Check shell command patterns
|
|
230
259
|
for (const pattern of SHELL_COMMAND_PATTERNS) {
|
|
231
260
|
if (pattern.regex.test(content)) {
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
const fs = require('fs');
|
|
2
2
|
const path = require('path');
|
|
3
3
|
const { findFiles, forEachSafeFile, debugLog } = require('../utils.js');
|
|
4
|
+
const { countInvisibleUnicode } = require('../shared/unicode-invisibles.js');
|
|
4
5
|
|
|
5
6
|
// node_modules NOT excluded: detect obfuscated code in dependencies.
|
|
6
7
|
// dist/build/out/output excluded: bundled output is always flagged as isPackageOutput (LOW)
|
|
@@ -198,52 +199,4 @@ function hasLargeStringArray(content) {
|
|
|
198
199
|
return false;
|
|
199
200
|
}
|
|
200
201
|
|
|
201
|
-
/**
|
|
202
|
-
* Count invisible Unicode codepoints in content (GlassWorm detection).
|
|
203
|
-
* Covers BMP zero-width chars, variation selectors, and supplementary plane
|
|
204
|
-
* tag characters / variation selectors supplement via codePointAt iteration.
|
|
205
|
-
*
|
|
206
|
-
* Codepoints detected:
|
|
207
|
-
* - U+200B, U+200C, U+200D (zero-width space/joiner/non-joiner)
|
|
208
|
-
* - U+FEFF (BOM — only if position > 0; pos 0 is legitimate BOM)
|
|
209
|
-
* - U+2060 (word joiner), U+180E (Mongolian vowel separator)
|
|
210
|
-
* - U+FE00-U+FE0E (variation selectors — excludes U+FE0F emoji presentation selector)
|
|
211
|
-
* - U+E0100-U+E01EF (variation selectors supplement)
|
|
212
|
-
* - U+E0001-U+E007F (tag characters)
|
|
213
|
-
*/
|
|
214
|
-
function countInvisibleUnicode(content) {
|
|
215
|
-
let count = 0;
|
|
216
|
-
for (let i = 0; i < content.length; i++) {
|
|
217
|
-
const cp = content.codePointAt(i);
|
|
218
|
-
// BMP invisible chars
|
|
219
|
-
if (cp === 0x200B || cp === 0x200C || cp === 0x200D ||
|
|
220
|
-
cp === 0x2060 || cp === 0x180E) {
|
|
221
|
-
count++;
|
|
222
|
-
}
|
|
223
|
-
// BOM only suspicious after position 0
|
|
224
|
-
else if (cp === 0xFEFF && i > 0) {
|
|
225
|
-
count++;
|
|
226
|
-
}
|
|
227
|
-
// BMP variation selectors (U+FE00-U+FE0E) — excludes U+FE0F (emoji presentation selector)
|
|
228
|
-
else if (cp >= 0xFE00 && cp <= 0xFE0E) {
|
|
229
|
-
count++;
|
|
230
|
-
}
|
|
231
|
-
// Supplementary plane: variation selectors supplement (U+E0100-U+E01EF)
|
|
232
|
-
else if (cp >= 0xE0100 && cp <= 0xE01EF) {
|
|
233
|
-
count++;
|
|
234
|
-
i++; // skip surrogate pair low half
|
|
235
|
-
}
|
|
236
|
-
// Supplementary plane: tag characters (U+E0001-U+E007F)
|
|
237
|
-
else if (cp >= 0xE0001 && cp <= 0xE007F) {
|
|
238
|
-
count++;
|
|
239
|
-
i++; // skip surrogate pair low half
|
|
240
|
-
}
|
|
241
|
-
// Skip surrogate pair low half for other supplementary chars
|
|
242
|
-
else if (cp > 0xFFFF) {
|
|
243
|
-
i++;
|
|
244
|
-
}
|
|
245
|
-
}
|
|
246
|
-
return count;
|
|
247
|
-
}
|
|
248
|
-
|
|
249
202
|
module.exports = { detectObfuscation };
|
package/src/shared/download.js
CHANGED
|
@@ -2,6 +2,7 @@ const https = require('https');
|
|
|
2
2
|
const fs = require('fs');
|
|
3
3
|
const path = require('path');
|
|
4
4
|
const { execFileSync } = require('child_process');
|
|
5
|
+
const AdmZip = require('adm-zip');
|
|
5
6
|
const { MAX_TARBALL_SIZE, DOWNLOAD_TIMEOUT } = require('./constants.js');
|
|
6
7
|
|
|
7
8
|
// Allowed redirect domains for tarball downloads (SSRF protection)
|
|
@@ -221,13 +222,30 @@ function downloadToFile(url, destPath, timeoutMs = DOWNLOAD_TIMEOUT) {
|
|
|
221
222
|
}
|
|
222
223
|
|
|
223
224
|
/**
|
|
224
|
-
*
|
|
225
|
-
*
|
|
226
|
-
*
|
|
227
|
-
*
|
|
228
|
-
*
|
|
225
|
+
* Detect archive format from a path/URL extension.
|
|
226
|
+
* URL-derived names are reliable enough here: PyPI's `urls[].packagetype`
|
|
227
|
+
* + filename are authoritative, npm tarballs are always `.tgz`. Returns
|
|
228
|
+
* 'targz', 'zip', or 'unknown'. Callers either pass an `options.format`
|
|
229
|
+
* override or trust this detection.
|
|
230
|
+
*
|
|
231
|
+
* @param {string} archivePath - Path or URL ending in the archive filename
|
|
232
|
+
* @returns {'targz'|'zip'|'unknown'}
|
|
229
233
|
*/
|
|
230
|
-
function
|
|
234
|
+
function detectArchiveFormat(archivePath) {
  // Anything that is not a string cannot carry an extension.
  if (typeof archivePath !== 'string') return 'unknown';

  // Classify an already-lowercased name purely by its suffix.
  const classify = (name) => {
    if (name.endsWith('.tar.gz') || name.endsWith('.tgz')) return 'targz';
    if (name.endsWith('.whl') || name.endsWith('.zip')) return 'zip';
    return 'unknown';
  };

  const lower = archivePath.toLowerCase();
  const direct = classify(lower);
  if (direct !== 'unknown') return direct;

  // Download URLs may carry a query string or a fragment after the filename
  // (e.g. `pkg-1.0-py3-none-any.whl#sha256=...`). Retry on the part before the
  // first `?` / `#`, but only when the raw string did not match, so every
  // input that was recognised before keeps exactly the same result.
  const cut = lower.search(/[?#]/);
  return cut === -1 ? 'unknown' : classify(lower.slice(0, cut));
}
|
|
241
|
+
|
|
242
|
+
/**
|
|
243
|
+
* Extract a tar.gz tarball with the system `tar` binary. Used for npm
|
|
244
|
+
* tarballs and PyPI sdists. Internal implementation — call extractArchive
|
|
245
|
+
* for new code; extractTarGz remains as a thin wrapper for the existing
|
|
246
|
+
* scanner/temporal-ast-diff.js callsite.
|
|
247
|
+
*/
|
|
248
|
+
function _extractTarGzImpl(tgzPath, destDir) {
|
|
231
249
|
// Use cwd + relative paths so C: never appears in tar arguments
|
|
232
250
|
// (GNU tar treats C: as remote host, bsdtar doesn't support --force-local)
|
|
233
251
|
const tgzDir = path.dirname(path.resolve(tgzPath));
|
|
@@ -258,6 +276,77 @@ function extractTarGz(tgzPath, destDir) {
|
|
|
258
276
|
return destDir;
|
|
259
277
|
}
|
|
260
278
|
|
|
279
|
+
/**
 * Unpack a ZIP archive (PyPI wheel or generic zip) into `destDir` with adm-zip.
 *
 * Every entry is vetted before anything is written to disk:
 *   - size cap: the running sum of the sizes declared in the entry headers
 *     must not exceed MAX_TARBALL_SIZE;
 *   - zip-slip: the entry name, resolved against destDir, must be destDir
 *     itself or a path below it.
 * The first violation throws, and extraction is never started.
 *
 * After extraction, when destDir holds exactly one item and that item is a
 * real (non-symlink) directory, that directory is returned instead of destDir.
 *
 * @param {string} zipPath - Archive to read.
 * @param {string} destDir - Directory to extract into.
 * @returns {string} The single top-level directory if there is one, else destDir.
 * @throws {Error} When the declared size exceeds the cap or an entry escapes destDir.
 */
function _extractZipImpl(zipPath, destDir) {
  const archive = new AdmZip(zipPath);
  const members = archive.getEntries();
  const root = path.resolve(destDir);
  const rootPrefix = root + path.sep;

  let declaredBytes = 0;
  members.forEach((member) => {
    // Size check comes first so an oversized archive is reported as such
    // even when the same entry also has an unsafe name.
    const header = member.header;
    declaredBytes += (header && header.size) || 0;
    if (declaredBytes > MAX_TARBALL_SIZE) {
      throw new Error(
        `Zip extract refused: total uncompressed size ${declaredBytes} exceeds ${MAX_TARBALL_SIZE}`
      );
    }

    const resolved = path.resolve(destDir, member.entryName);
    const staysInside = resolved === root || resolved.startsWith(rootPrefix);
    if (!staysInside) {
      throw new Error(`Unsafe zip entry escapes destDir: ${member.entryName}`);
    }
  });

  archive.extractAllTo(destDir, /* overwrite */ true);

  // If the archive unpacked into one real top-level directory, hand that
  // directory back; otherwise (or on any filesystem error) use destDir.
  try {
    const listing = fs.readdirSync(destDir);
    if (listing.length === 1) {
      const candidate = path.join(destDir, listing[0]);
      const info = fs.lstatSync(candidate);
      const isRealDirectory = info.isDirectory() && !info.isSymbolicLink();
      if (isRealDirectory) return candidate;
    }
  } catch {
    /* best effort only — fall through to destDir */
  }
  return destDir;
}
|
|
322
|
+
|
|
323
|
+
/**
 * Extract an archive to a directory, dispatching on the archive format.
 * Supports `.tar.gz` / `.tgz` (tar) and `.whl` / `.zip` (adm-zip).
 *
 * The format is taken from `options.format` when provided, otherwise it is
 * detected from the extension of `archivePath`.
 *
 * @param {string} archivePath - Path to the archive on disk
 * @param {string} destDir - Destination directory (must exist)
 * @param {Object} [options]
 * @param {'targz'|'zip'} [options.format] - override auto-detection
 * @returns {string} Path to extracted package root
 * @throws {Error} when the format is unknown or extraction fails
 */
function extractArchive(archivePath, destDir, options = {}) {
  // The `= {}` default only covers `undefined`; tolerate an explicit `null`.
  const requested = options && options.format;
  const format = requested || detectArchiveFormat(archivePath);
  if (format === 'targz') return _extractTarGzImpl(archivePath, destDir);
  if (format === 'zip') return _extractZipImpl(archivePath, destDir);

  // path.basename() throws a TypeError on non-strings, which would mask the
  // "unsupported format" error below; only call it on real strings.
  const label = typeof archivePath === 'string'
    ? path.basename(archivePath)
    : String(archivePath);
  throw new Error(`Unsupported archive format for ${label}`);
}
|
|
340
|
+
|
|
341
|
+
/**
 * Backwards-compatible wrapper for the original tar.gz-only extractor.
 * Kept because src/scanner/temporal-ast-diff.js and existing tests still
 * import it by name. New code should call extractArchive instead.
 *
 * Both arguments are forwarded unchanged to _extractTarGzImpl.
 *
 * @param {string} tgzPath - Path to the tar.gz archive.
 * @param {string} destDir - Destination directory.
 * @returns {*} Whatever _extractTarGzImpl returns (presumably the extracted
 *   package root — confirm against the implementation).
 */
function extractTarGz(tgzPath, destDir) {
  return _extractTarGzImpl(tgzPath, destDir);
}
|
|
349
|
+
|
|
261
350
|
/**
|
|
262
351
|
* Sanitize a package name for use in temporary directory names.
|
|
263
352
|
* Removes path traversal sequences, slashes, and @ symbols.
|
|
@@ -277,6 +366,8 @@ function sanitizePackageName(packageName) {
|
|
|
277
366
|
module.exports = {
|
|
278
367
|
downloadToFile,
|
|
279
368
|
extractTarGz,
|
|
369
|
+
extractArchive,
|
|
370
|
+
detectArchiveFormat,
|
|
280
371
|
sanitizePackageName,
|
|
281
372
|
isAllowedDownloadRedirect,
|
|
282
373
|
normalizeHostname,
|
|
@@ -0,0 +1,164 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
/**
 * Unicode invisible character helpers — shared by obfuscation.js and ai-config.js.
 *
 * Extracted in v2.11.25 (TrapDoor campaign, May 2026): the local function in
 * obfuscation.js covered `.js/.cjs/.mjs/.ts/.tsx/.py` but not AI config files
 * (.cursorrules, CLAUDE.md). Sharing it lets ai-config.js normalize content
 * before running its regexes and block the "cu<U+200B>rl|sh" vector, where
 * zero-width characters are interspersed inside a keyword.
 *
 * Detected codepoints (a superset of the original obfuscation.js scope, which
 * did not include LRM/RLM or the directional controls):
 *
 *   Zero-width:
 *     U+200B ZWSP, U+200C ZWNJ, U+200D ZWJ
 *     U+2060 word joiner
 *     U+180E Mongolian vowel separator
 *
 *   Directional (bidi spoofing — Trojan Source, CVE-2021-42574):
 *     U+200E LRM, U+200F RLM
 *     U+202A LRE, U+202B RLE, U+202C PDF, U+202D LRO, U+202E RLO
 *     U+2066 LRI, U+2067 RLI, U+2068 FSI, U+2069 PDI
 *
 *   Invisible math operators (can break a parser without being seen):
 *     U+2061 function application, U+2062 invisible times,
 *     U+2063 invisible separator, U+2064 invisible plus
 *
 *   BOM (mid-text only; position 0 is a legitimate UTF-8 BOM):
 *     U+FEFF
 *
 *   Variation selectors:
 *     U+FE00-FE0E (excludes U+FE0F, the emoji presentation selector — legitimate)
 *     U+E0100-E01EF supplementary plane variation selectors
 *
 *   Tag characters (used by GlassWorm to encode payloads):
 *     U+E0001, U+E0020-E007F
 *
 * CJK, accented letters and standard emoji (with U+FE0F) are deliberately
 * EXCLUDED — no false positives are expected on legitimate international content.
 *
 * References:
 *   - https://www.aikido.dev/blog/glassworm-returns-unicode-attack-github-npm-vscode
 *   - https://trojansource.codes/ (Trojan Source, CVE-2021-42574)
 *   - https://socket.dev/blog/trapdoor-crypto-stealer-npm-pypi-crates (May 2026)
 */

/**
 * Classify the codepoint that starts at UTF-16 index `i` of `content`.
 *
 * `invisible` is true when the codepoint belongs to one of the invisible
 * classes listed in the module header. `supplementary` is true when the
 * codepoint lies above U+FFFF and therefore occupies two UTF-16 code units;
 * the caller must then advance `i` by one extra step to skip the low
 * surrogate half.
 *
 * @param {string} content
 * @param {number} i - UTF-16 code unit index into `content`
 * @returns {{ invisible: boolean, supplementary: boolean }}
 */
function inspectCodepoint(content, i) {
  const cp = content.codePointAt(i);

  // BMP zero-width
  if (cp === 0x200B || cp === 0x200C || cp === 0x200D) {
    return { invisible: true, supplementary: false };
  }

  // BMP directional (Trojan Source): marks, embeddings/overrides, and the
  // isolate controls U+2066-U+2069 (LRI, RLI, FSI, PDI), which reorder text
  // just like the older embedding controls.
  if (cp === 0x200E || cp === 0x200F ||
      (cp >= 0x202A && cp <= 0x202E) ||
      (cp >= 0x2066 && cp <= 0x2069)) {
    return { invisible: true, supplementary: false };
  }

  // BMP word joiner & friends
  if (cp === 0x2060 || cp === 0x180E) {
    return { invisible: true, supplementary: false };
  }

  // BMP invisible math operators (U+2061-2064)
  if (cp >= 0x2061 && cp <= 0x2064) {
    return { invisible: true, supplementary: false };
  }

  // BOM only suspicious after position 0
  if (cp === 0xFEFF && i > 0) {
    return { invisible: true, supplementary: false };
  }

  // BMP variation selectors (U+FE00-U+FE0E) — excludes U+FE0F emoji presentation
  if (cp >= 0xFE00 && cp <= 0xFE0E) {
    return { invisible: true, supplementary: false };
  }

  // Supplementary plane: variation selectors supplement (U+E0100-U+E01EF)
  if (cp >= 0xE0100 && cp <= 0xE01EF) {
    return { invisible: true, supplementary: true };
  }

  // Supplementary plane: tag characters (U+E0001 + U+E0020-U+E007F)
  if (cp === 0xE0001 || (cp >= 0xE0020 && cp <= 0xE007F)) {
    return { invisible: true, supplementary: true };
  }

  // Other supplementary chars (non-invisible) — need to skip low surrogate
  if (cp > 0xFFFF) {
    return { invisible: false, supplementary: true };
  }

  return { invisible: false, supplementary: false };
}
|
|
107
|
+
|
|
108
|
+
/**
 * Tally how many invisible Unicode codepoints `content` contains.
 *
 * Walks the string one codepoint at a time, letting inspectCodepoint decide
 * both whether the codepoint counts and how many UTF-16 units it spans.
 *
 * @param {string} content
 * @returns {number}
 */
function countInvisibleUnicode(content) {
  let total = 0;
  let pos = 0;
  while (pos < content.length) {
    const verdict = inspectCodepoint(content, pos);
    if (verdict.invisible) total += 1;
    // Supplementary codepoints take two code units: step past both halves.
    pos += verdict.supplementary ? 2 : 1;
  }
  return total;
}
|
|
123
|
+
|
|
124
|
+
/**
 * Produce a copy of `content` in which every invisible codepoint is dropped.
 *
 * Meant to be applied before pattern matching, so that a keyword split with
 * zero-width characters (`cu<U+200B>rl`) cannot slip past a regex such as
 * /curl\s+/i.
 *
 * @param {string} content
 * @returns {string}
 */
function stripInvisibleUnicode(content) {
  // Pure-ASCII input cannot hold anything to strip: return it untouched
  // without running the per-codepoint scan.
  let scan = 0;
  while (scan < content.length && content.charCodeAt(scan) <= 0x7F) {
    scan++;
  }
  if (scan === content.length) return content;

  const kept = [];
  let pos = 0;
  while (pos < content.length) {
    const { invisible, supplementary } = inspectCodepoint(content, pos);
    // A supplementary codepoint spans both surrogate halves.
    const width = supplementary ? 2 : 1;
    if (!invisible) kept.push(content.slice(pos, pos + width));
    pos += width;
  }
  return kept.join('');
}
|
|
160
|
+
|
|
161
|
+
module.exports = {
|
|
162
|
+
countInvisibleUnicode,
|
|
163
|
+
stripInvisibleUnicode
|
|
164
|
+
};
|