muaddib-scanner 2.9.7 → 2.9.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "muaddib-scanner",
3
- "version": "2.9.7",
3
+ "version": "2.9.8",
4
4
  "description": "Supply-chain threat detection & response for npm & PyPI/Python",
5
5
  "main": "src/index.js",
6
6
  "bin": {
@@ -0,0 +1,339 @@
1
+ #!/usr/bin/env node
2
+ /**
3
+ * MUAD'DIB — npm Random Package Sampler
4
+ *
5
+ * Samples 200 packages from the npm registry by stratified random sampling.
6
+ * Used to measure FPR on a representative npm sample (not curated).
7
+ *
8
+ * Strata (by dependency count):
9
+ * small (<10 deps): 80 packages (40%)
10
+ * medium (10-50 deps): 60 packages (30%)
11
+ * large (50-100 deps): 40 packages (20%)
12
+ * vlarge (100+ deps): 20 packages (10%)
13
+ *
14
+ * Exclusions: @types/*, deprecated, already in packages-npm.txt
15
+ *
16
+ * Usage:
17
+ * node scripts/sample-npm-random.js [--seed N] [--output path]
18
+ */
19
+
20
+ const https = require('https');
21
+ const fs = require('fs');
22
+ const path = require('path');
23
+
24
const ROOT = path.join(__dirname, '..');
// Curated benign corpus; sampled packages must not overlap with this file.
const CURATED_FILE = path.join(ROOT, 'datasets', 'benign', 'packages-npm.txt');
// Default destination for the generated random sample.
const DEFAULT_OUTPUT = path.join(ROOT, 'datasets', 'benign', 'packages-npm-random.txt');

// Sampling strata keyed by total dependency count (deps + devDeps).
// Ranges are contiguous and quotas sum to 200 — the target sample size.
const STRATA = {
  small: { min: 0, max: 9, quota: 80 },
  medium: { min: 10, max: 50, quota: 60 },
  large: { min: 51, max: 100, quota: 40 },
  vlarge: { min: 101, max: Infinity, quota: 20 }
};

// Search keywords — diverse enough to sample across npm
const SEARCH_KEYWORDS = [
  'util', 'helper', 'config', 'server', 'client', 'api', 'data',
  'file', 'string', 'array', 'json', 'http', 'url', 'path', 'stream',
  'log', 'debug', 'test', 'mock', 'format', 'parse', 'transform',
  'crypto', 'hash', 'encode', 'decode', 'compress', 'cache', 'queue',
  'event', 'promise', 'async', 'callback', 'middleware', 'router',
  'database', 'mongo', 'redis', 'sql', 'orm', 'schema', 'validate',
  'cli', 'terminal', 'color', 'progress', 'spinner', 'prompt',
  'image', 'pdf', 'csv', 'xml', 'yaml', 'markdown', 'html',
  'email', 'auth', 'token', 'session', 'cookie', 'proxy',
  'date', 'time', 'math', 'random', 'uuid', 'id', 'slug',
  'webpack', 'babel', 'eslint', 'prettier', 'rollup', 'vite',
  'react', 'vue', 'angular', 'svelte', 'solid', 'preact',
  'express', 'koa', 'fastify', 'socket', 'graphql', 'rest',
  'aws', 'azure', 'gcp', 'docker', 'kubernetes', 'ci',
  'i18n', 'locale', 'charset', 'buffer', 'binary', 'hex',
  'retry', 'timeout', 'rate', 'limit', 'throttle', 'debounce',
  'merge', 'deep', 'clone', 'diff', 'patch', 'compare',
  'glob', 'pattern', 'regex', 'match', 'search', 'filter',
  'tree', 'graph', 'list', 'map', 'set', 'stack',
  'plugin', 'loader', 'adapter', 'wrapper', 'bridge', 'connector'
];
58
+
59
/**
 * Seeded PRNG (mulberry32) so sampling runs are reproducible.
 * @param {number} seed - 32-bit integer seed.
 * @returns {() => number} generator producing floats in [0, 1).
 */
function mulberry32(seed) {
  return function () {
    seed |= 0;
    seed = (seed + 0x6D2B79F5) | 0;
    let t = Math.imul(seed ^ (seed >>> 15), seed | 1);
    t = (t + Math.imul(t ^ (t >>> 7), t | 61)) ^ t;
    return ((t ^ (t >>> 14)) >>> 0) / 4294967296;
  };
}
68
+
69
/**
 * In-place Fisher–Yates shuffle driven by the supplied RNG.
 * @param {Array} arr - array to shuffle (mutated).
 * @param {() => number} rng - returns a float in [0, 1).
 * @returns {Array} the same array, shuffled.
 */
function shuffleArray(arr, rng) {
  let i = arr.length;
  while (--i > 0) {
    const j = Math.floor(rng() * (i + 1));
    const tmp = arr[i];
    arr[i] = arr[j];
    arr[j] = tmp;
  }
  return arr;
}
76
+
77
+ function httpsGet(url) {
78
+ return new Promise((resolve, reject) => {
79
+ const req = https.get(url, { timeout: 15000 }, (res) => {
80
+ if (res.statusCode >= 300 && res.statusCode < 400 && res.headers.location) {
81
+ httpsGet(res.headers.location).then(resolve).catch(reject);
82
+ return;
83
+ }
84
+ if (res.statusCode !== 200) {
85
+ res.resume();
86
+ reject(new Error(`HTTP ${res.statusCode} for ${url}`));
87
+ return;
88
+ }
89
+ let data = '';
90
+ res.on('data', chunk => data += chunk);
91
+ res.on('end', () => {
92
+ try { resolve(JSON.parse(data)); }
93
+ catch (e) { reject(new Error(`JSON parse error: ${e.message}`)); }
94
+ });
95
+ });
96
+ req.on('error', reject);
97
+ req.on('timeout', () => { req.destroy(); reject(new Error('timeout')); });
98
+ });
99
+ }
100
+
101
/**
 * Search the npm registry for packages matching a keyword.
 * Failures are logged as warnings and yield an empty array (best-effort).
 * @param {string} keyword - search text.
 * @param {number} [from=0] - result offset within the search results.
 * @param {number} [size=250] - page size requested from the registry.
 * @returns {Promise<Array<{name: string, version: string, description: string, deprecated: boolean}>>}
 */
async function searchNpm(keyword, from = 0, size = 250) {
  const url = `https://registry.npmjs.org/-/v1/search?text=${encodeURIComponent(keyword)}&size=${size}&from=${from}`;
  try {
    const data = await httpsGet(url);
    const objects = data.objects || [];
    return objects.map((entry) => {
      const pkg = entry.package;
      return {
        name: pkg.name,
        version: pkg.version,
        description: pkg.description || '',
        // NOTE(review): search results may not carry a `deprecated` flag —
        // verify against the registry search API; falls back to false.
        deprecated: pkg.deprecated || false
      };
    });
  } catch (err) {
    console.error(` [WARN] npm search "${keyword}" failed: ${err.message}`);
    return [];
  }
}
120
+
121
/**
 * Fetch the latest manifest for a package and count its dependencies.
 * @param {string} pkgName - npm package name (URL-encoded for the request).
 * @returns {Promise<{deps: number, devDeps: number, totalDeps: number}|null>}
 *   dependency counts, or null when the registry lookup fails (best-effort).
 */
async function getDepCount(pkgName) {
  const url = `https://registry.npmjs.org/${encodeURIComponent(pkgName)}/latest`;
  try {
    const manifest = await httpsGet(url);
    const deps = Object.keys(manifest.dependencies || {}).length;
    const devDeps = Object.keys(manifest.devDependencies || {}).length;
    return { deps, devDeps, totalDeps: deps + devDeps };
  } catch {
    return null;
  }
}
136
+
137
/**
 * Map a total dependency count onto its stratum name.
 * @param {number} depCount - total dependency count (deps + devDeps).
 * @returns {string} a key of STRATA; 'small' as a safety net (unreachable
 *   while the configured strata stay contiguous from 0 to Infinity).
 */
function classifyStratum(depCount) {
  const hit = Object.entries(STRATA).find(
    ([, { min, max }]) => depCount >= min && depCount <= max
  );
  return hit ? hit[0] : 'small';
}
143
+
144
/**
 * Load the curated benign package list that sampling must exclude.
 * Lines are trimmed; blanks and '#' comment lines are skipped.
 * @returns {Set<string>} curated package names; an empty Set when the file
 *   is missing or unreadable (best-effort, but now with a warning).
 */
function loadCuratedPackages() {
  try {
    return new Set(
      fs.readFileSync(CURATED_FILE, 'utf8')
        .split(/\r?\n/)
        .map(l => l.trim())
        .filter(l => l && !l.startsWith('#'))
    );
  } catch (err) {
    // Previously swallowed silently; an empty exclusion list is still a
    // valid run, but the operator should know overlap filtering is off.
    console.warn(` [WARN] could not read curated list: ${err.message}; continuing with no exclusions`);
    return new Set();
  }
}
156
+
157
/**
 * Entry point: stratified random sampling of npm packages.
 *
 * Phase 1 — search npm with shuffled keywords to build a candidate pool.
 * Phase 2 — fetch each candidate's manifest and bucket by dependency count
 *           (over-collecting 2x quota per stratum for backfill).
 * Phase 3 — select per-stratum quotas, backfill any deficit from
 *           small/medium overflow, and write the package list.
 *
 * CLI flags: --seed N (default 42), --output path (default DEFAULT_OUTPUT).
 *
 * Fixes in this revision: the target total is derived from STRATA quotas
 * (totalQuota) instead of a hard-coded 200 in messages; the unused
 * `classified` counter is removed; a non-numeric --seed is rejected with a
 * warning instead of silently seeding the PRNG with NaN.
 */
async function main() {
  const args = process.argv.slice(2);
  let seed = 42;
  let outputPath = DEFAULT_OUTPUT;

  for (let i = 0; i < args.length; i++) {
    if (args[i] === '--seed' && args[i + 1]) {
      // Validate: parseInt can return NaN, which would silently degrade
      // the PRNG seed and break reproducibility guarantees.
      const parsed = Number.parseInt(args[i + 1], 10);
      if (Number.isNaN(parsed)) {
        console.warn(` [WARN] invalid --seed "${args[i + 1]}"; using default ${seed}`);
      } else {
        seed = parsed;
      }
      i++;
    }
    if (args[i] === '--output' && args[i + 1]) { outputPath = args[i + 1]; i++; }
  }

  const rng = mulberry32(seed);
  const curated = loadCuratedPackages();
  console.log(` Loaded ${curated.size} curated packages to exclude`);
  console.log(` Seed: ${seed}`);

  // Target sample size derived from stratum quotas (was computed but unused,
  // with 200 hard-coded in the progress/warning messages below).
  const totalQuota = Object.values(STRATA).reduce((s, v) => s + v.quota, 0);

  // Phase 1: Collect candidate packages from npm search
  console.log(`\n [1/3] Collecting candidates from npm search...`);
  const candidates = new Map(); // name -> { name, version, description }
  const shuffledKeywords = shuffleArray([...SEARCH_KEYWORDS], rng);

  for (let i = 0; i < shuffledKeywords.length; i++) {
    const keyword = shuffledKeywords[i];
    if (process.stdout.isTTY) {
      process.stdout.write(`\r Searching "${keyword}" (${i + 1}/${shuffledKeywords.length})... `);
    }

    // Search with random offset for diversity
    const offset = Math.floor(rng() * 200);
    const results = await searchNpm(keyword, offset, 250);

    for (const pkg of results) {
      // Exclusion filters: duplicates, curated corpus, @types stubs,
      // deprecated packages, underscore-prefixed names.
      if (candidates.has(pkg.name)) continue;
      if (curated.has(pkg.name)) continue;
      if (pkg.name.startsWith('@types/')) continue;
      if (pkg.deprecated) continue;
      if (pkg.name.startsWith('_')) continue;

      candidates.set(pkg.name, pkg);
    }

    // Stop early if we have enough candidates
    if (candidates.size >= 2000) break;

    // Rate limiting: ~100ms between requests
    await new Promise(r => setTimeout(r, 100));
  }

  if (process.stdout.isTTY) {
    process.stdout.write('\r' + ''.padEnd(80) + '\r');
  }
  console.log(` Collected ${candidates.size} unique candidates`);

  // Phase 2: Classify by dependency count
  // Over-collect: allow 2x quota per stratum to enable backfill
  console.log(`\n [2/3] Classifying by dependency count...`);
  const buckets = { small: [], medium: [], large: [], vlarge: [] };
  const candidateList = shuffleArray([...candidates.keys()], rng);

  let processed = 0;
  // Over-collect limit: 2x quota per stratum to provide backfill pool
  const OVER_COLLECT = 2;

  for (const pkgName of candidateList) {
    // Stop once every bucket has its full backfill pool
    const allOverCollected = Object.entries(STRATA).every(
      ([name, { quota }]) => buckets[name].length >= quota * OVER_COLLECT
    );
    if (allOverCollected) break;

    processed++;
    if (process.stdout.isTTY && processed % 10 === 0) {
      const bucketStatus = Object.entries(buckets).map(([k, v]) => `${k}:${v.length}/${STRATA[k].quota}`).join(' ');
      process.stdout.write(`\r Classifying [${processed}/${candidateList.length}] ${bucketStatus} `);
    }

    const info = await getDepCount(pkgName);
    if (!info) continue; // registry lookup failed — skip this candidate

    const stratum = classifyStratum(info.totalDeps);
    if (buckets[stratum].length < STRATA[stratum].quota * OVER_COLLECT) {
      buckets[stratum].push({ name: pkgName, deps: info.totalDeps, stratum });
    }

    // Rate limiting
    await new Promise(r => setTimeout(r, 50));
  }

  if (process.stdout.isTTY) {
    process.stdout.write('\r' + ''.padEnd(80) + '\r');
  }

  // Phase 3: Output with backfill
  // If large/vlarge strata can't meet quota, redistribute remaining slots
  // to small/medium proportionally (reflects real npm distribution).
  console.log(`\n [3/3] Writing results...`);
  const selected = [];
  let deficit = 0;
  for (const [name, { quota }] of Object.entries(STRATA)) {
    const actual = Math.min(buckets[name].length, quota);
    console.log(` ${name}: ${actual}/${quota} packages`);
    selected.push(...buckets[name].slice(0, actual));
    deficit += quota - actual;
  }

  // Backfill deficit from small/medium overflow (priority order)
  if (deficit > 0) {
    console.log(` Backfilling ${deficit} slots from small/medium overflow...`);
    const backfillSources = ['small', 'medium']; // priority order
    for (const src of backfillSources) {
      if (deficit <= 0) break;
      const overflow = buckets[src].slice(STRATA[src].quota);
      const take = Math.min(overflow.length, deficit);
      if (take > 0) {
        selected.push(...overflow.slice(0, take));
        deficit -= take;
        console.log(` +${take} from ${src} overflow`);
      }
    }
  }

  const totalSelected = selected.length;
  console.log(`\n Total: ${totalSelected}/${totalQuota} packages`);

  if (totalSelected < totalQuota) {
    console.warn(`\n [WARN] Only ${totalSelected} packages found. Re-run with different --seed or add more search keywords.`);
  }

  // Write output file
  // Use a Set to track already-written packages (avoid duplication from backfill)
  const writtenNames = new Set();
  const header = [
    '# MUAD\'DIB Benign Random Dataset — npm stratified random sample',
    `# Generated: ${new Date().toISOString()}`,
    `# Seed: ${seed}`,
    `# Total: ${totalSelected} packages`,
    '# Strata: small (<10 deps): 80, medium (10-50): 60, large (51-100): 40, vlarge (100+): 20',
    '# Backfill: unfilled large/vlarge slots redistributed to small/medium',
    '# Used by `muaddib evaluate` to measure FPR on representative npm sample',
    ''
  ];

  const lines = [];
  for (const [name, { quota }] of Object.entries(STRATA)) {
    const actual = Math.min(buckets[name].length, quota);
    lines.push(`# === ${name} (${actual}/${quota}) ===`);
    for (const pkg of buckets[name].slice(0, actual)) {
      lines.push(pkg.name);
      writtenNames.add(pkg.name);
    }
    lines.push('');
  }

  // Backfill section (additional packages from overflow)
  const backfillPkgs = selected.filter(p => !writtenNames.has(p.name));
  if (backfillPkgs.length > 0) {
    lines.push(`# === backfill (${backfillPkgs.length}) ===`);
    for (const pkg of backfillPkgs) {
      lines.push(pkg.name);
    }
    lines.push('');
  }

  fs.mkdirSync(path.dirname(outputPath), { recursive: true });
  fs.writeFileSync(outputPath, header.join('\n') + lines.join('\n'));
  console.log(` Written to: ${path.relative(ROOT, outputPath)}`);

  // Verify no overlap with curated
  const overlap = selected.filter(p => curated.has(p.name));
  if (overlap.length > 0) {
    console.error(`\n [ERROR] ${overlap.length} packages overlap with curated corpus: ${overlap.map(p => p.name).join(', ')}`);
  } else {
    console.log(' No overlap with curated corpus');
  }
}
335
+
336
// Top-level runner: surface any unhandled failure and exit non-zero.
main().catch((err) => {
  console.error(`[ERROR] ${err.message}`);
  process.exit(1);
});