muaddib-scanner 2.9.7 → 2.9.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/scripts/sample-npm-random.js +339 -0
package/scripts/sample-npm-random.js
ADDED
|
@@ -0,0 +1,339 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
/**
|
|
3
|
+
* MUAD'DIB — npm Random Package Sampler
|
|
4
|
+
*
|
|
5
|
+
* Samples 200 packages from the npm registry by stratified random sampling.
|
|
6
|
+
* Used to measure FPR on a representative npm sample (not curated).
|
|
7
|
+
*
|
|
8
|
+
* Strata (by dependency count):
|
|
9
|
+
* small (<10 deps): 80 packages (40%)
|
|
10
|
+
* medium (10-50 deps): 60 packages (30%)
|
|
11
|
+
 * large (51-100 deps): 40 packages (20%)
|
|
12
|
+
* vlarge (100+ deps): 20 packages (10%)
|
|
13
|
+
*
|
|
14
|
+
* Exclusions: @types/*, deprecated, already in packages-npm.txt
|
|
15
|
+
*
|
|
16
|
+
* Usage:
|
|
17
|
+
* node scripts/sample-npm-random.js [--seed N] [--output path]
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
const https = require('https');
|
|
21
|
+
const fs = require('fs');
|
|
22
|
+
const path = require('path');
|
|
23
|
+
|
|
24
|
+
// Repository root (this script lives in scripts/, so one level up).
const ROOT = path.join(__dirname, '..');
// Curated benign package list — sampled packages must never overlap with it.
const CURATED_FILE = path.join(ROOT, 'datasets', 'benign', 'packages-npm.txt');
// Default destination for the generated random-sample list.
const DEFAULT_OUTPUT = path.join(ROOT, 'datasets', 'benign', 'packages-npm-random.txt');
|
|
27
|
+
|
|
28
|
+
// Sampling strata keyed by total dependency count (deps + devDeps).
// `min`/`max` are inclusive bounds; `quota` is the target sample size
// per stratum (quotas sum to 200).
const STRATA = {
  small: { min: 0, max: 9, quota: 80 },
  medium: { min: 10, max: 50, quota: 60 },
  large: { min: 51, max: 100, quota: 40 },
  vlarge: { min: 101, max: Infinity, quota: 20 }
};
|
|
34
|
+
|
|
35
|
+
// Search keywords — diverse enough to sample across npm.
// Phase 1 shuffles these (seeded) and queries the registry search API for
// each, so broad topical coverage here directly drives sample diversity.
const SEARCH_KEYWORDS = [
  'util', 'helper', 'config', 'server', 'client', 'api', 'data',
  'file', 'string', 'array', 'json', 'http', 'url', 'path', 'stream',
  'log', 'debug', 'test', 'mock', 'format', 'parse', 'transform',
  'crypto', 'hash', 'encode', 'decode', 'compress', 'cache', 'queue',
  'event', 'promise', 'async', 'callback', 'middleware', 'router',
  'database', 'mongo', 'redis', 'sql', 'orm', 'schema', 'validate',
  'cli', 'terminal', 'color', 'progress', 'spinner', 'prompt',
  'image', 'pdf', 'csv', 'xml', 'yaml', 'markdown', 'html',
  'email', 'auth', 'token', 'session', 'cookie', 'proxy',
  'date', 'time', 'math', 'random', 'uuid', 'id', 'slug',
  'webpack', 'babel', 'eslint', 'prettier', 'rollup', 'vite',
  'react', 'vue', 'angular', 'svelte', 'solid', 'preact',
  'express', 'koa', 'fastify', 'socket', 'graphql', 'rest',
  'aws', 'azure', 'gcp', 'docker', 'kubernetes', 'ci',
  'i18n', 'locale', 'charset', 'buffer', 'binary', 'hex',
  'retry', 'timeout', 'rate', 'limit', 'throttle', 'debounce',
  'merge', 'deep', 'clone', 'diff', 'patch', 'compare',
  'glob', 'pattern', 'regex', 'match', 'search', 'filter',
  'tree', 'graph', 'list', 'map', 'set', 'stack',
  'plugin', 'loader', 'adapter', 'wrapper', 'bridge', 'connector'
];
|
|
58
|
+
|
|
59
|
+
// Mulberry32: tiny seeded PRNG for reproducible sampling.
// Returns a generator of floats in [0, 1); the same seed always yields
// the same sequence.
function mulberry32(seed) {
  let state = seed | 0;
  return () => {
    state = (state + 0x6D2B79F5) | 0;
    let mixed = Math.imul(state ^ (state >>> 15), state | 1);
    mixed = (mixed + Math.imul(mixed ^ (mixed >>> 7), mixed | 61)) ^ mixed;
    return ((mixed ^ (mixed >>> 14)) >>> 0) / 0x100000000;
  };
}
|
|
68
|
+
|
|
69
|
+
// In-place Fisher–Yates shuffle driven by the supplied RNG.
// Mutates `arr` and returns the same array reference.
function shuffleArray(arr, rng) {
  let i = arr.length;
  while (--i > 0) {
    const j = Math.floor(rng() * (i + 1));
    const tmp = arr[i];
    arr[i] = arr[j];
    arr[j] = tmp;
  }
  return arr;
}
|
|
76
|
+
|
|
77
|
+
/**
 * GET `url` over HTTPS and parse the response body as JSON.
 *
 * Follows 3xx redirects, resolving relative `Location` headers against the
 * current URL (the previous implementation passed `Location` straight back
 * to `https.get`, which throws on relative URLs), and caps the redirect
 * chain to avoid infinite loops. Requests time out after 15s.
 *
 * @param {string} url - Absolute https:// URL to fetch.
 * @param {number} [maxRedirects=5] - Remaining redirect hops allowed.
 * @returns {Promise<object>} Parsed JSON body.
 * @throws {Error} On non-2xx status, timeout, network failure,
 *   redirect-loop exhaustion, or invalid JSON.
 */
function httpsGet(url, maxRedirects = 5) {
  return new Promise((resolve, reject) => {
    const req = https.get(url, { timeout: 15000 }, (res) => {
      if (res.statusCode >= 300 && res.statusCode < 400 && res.headers.location) {
        res.resume(); // drain the redirect body so the socket is released
        if (maxRedirects <= 0) {
          reject(new Error(`Too many redirects for ${url}`));
          return;
        }
        // Location may be relative — resolve it against the current URL.
        const next = new URL(res.headers.location, url).toString();
        httpsGet(next, maxRedirects - 1).then(resolve).catch(reject);
        return;
      }
      if (res.statusCode !== 200) {
        res.resume();
        reject(new Error(`HTTP ${res.statusCode} for ${url}`));
        return;
      }
      let data = '';
      res.on('data', chunk => data += chunk);
      res.on('end', () => {
        try { resolve(JSON.parse(data)); }
        catch (e) { reject(new Error(`JSON parse error: ${e.message}`)); }
      });
    });
    req.on('error', reject);
    req.on('timeout', () => { req.destroy(); reject(new Error('timeout')); });
  });
}
|
|
100
|
+
|
|
101
|
+
/**
 * Search the npm registry for packages matching a keyword.
 * Best-effort: any failure is logged as a warning and yields [].
 *
 * @param {string} keyword - Search text.
 * @param {number} [from=0] - Result offset for pagination.
 * @param {number} [size=250] - Page size.
 * @returns {Promise<Array<{name: string, version: string, description: string, deprecated: boolean}>>}
 */
async function searchNpm(keyword, from = 0, size = 250) {
  const url = `https://registry.npmjs.org/-/v1/search?text=${encodeURIComponent(keyword)}&size=${size}&from=${from}`;
  try {
    const data = await httpsGet(url);
    const objects = data.objects || [];
    return objects.map((entry) => {
      const pkg = entry.package;
      return {
        name: pkg.name,
        version: pkg.version,
        description: pkg.description || '',
        deprecated: pkg.deprecated || false
      };
    });
  } catch (err) {
    console.error(` [WARN] npm search "${keyword}" failed: ${err.message}`);
    return [];
  }
}
|
|
120
|
+
|
|
121
|
+
/**
 * Fetch dependency counts for a package's latest published version.
 *
 * @param {string} pkgName - npm package name (scoped names allowed).
 * @returns {Promise<{deps: number, devDeps: number, totalDeps: number}|null>}
 *   null on any registry/network failure (caller skips the package).
 */
async function getDepCount(pkgName) {
  const url = `https://registry.npmjs.org/${encodeURIComponent(pkgName)}/latest`;
  try {
    const manifest = await httpsGet(url);
    const deps = Object.keys(manifest.dependencies || {}).length;
    const devDeps = Object.keys(manifest.devDependencies || {}).length;
    return { deps, devDeps, totalDeps: deps + devDeps };
  } catch {
    return null;
  }
}
|
|
136
|
+
|
|
137
|
+
// Map a dependency count to its STRATA bucket name. Falls back to 'small',
// though the strata ranges cover [0, Infinity] so the fallback is
// effectively unreachable for non-negative counts.
function classifyStratum(depCount) {
  const match = Object.entries(STRATA).find(
    ([, bounds]) => depCount >= bounds.min && depCount <= bounds.max
  );
  return match ? match[0] : 'small';
}
|
|
143
|
+
|
|
144
|
+
// Read the curated package list into a Set, skipping blank lines and
// '#' comments. A missing or unreadable file yields an empty Set —
// the curated-exclusion check is best-effort by design.
function loadCuratedPackages() {
  try {
    const raw = fs.readFileSync(CURATED_FILE, 'utf8');
    const names = raw
      .split(/\r?\n/)
      .map((line) => line.trim())
      .filter((line) => line.length > 0 && line[0] !== '#');
    return new Set(names);
  } catch {
    return new Set();
  }
}
|
|
156
|
+
|
|
157
|
+
/**
 * Entry point: build a stratified random sample of npm packages.
 *
 * Phase 1 collects candidates via seeded-shuffled keyword searches,
 * phase 2 classifies them by dependency count into STRATA buckets
 * (over-collecting 2x quota per stratum as a backfill pool), phase 3
 * writes the sample — backfilling unfilled large/vlarge slots from
 * small/medium overflow — and verifies no overlap with the curated corpus.
 *
 * CLI flags: --seed N (default 42, for reproducibility),
 *            --output path (default DEFAULT_OUTPUT).
 */
async function main() {
  const args = process.argv.slice(2);
  let seed = 42;
  let outputPath = DEFAULT_OUTPUT;

  for (let i = 0; i < args.length; i++) {
    if (args[i] === '--seed' && args[i + 1]) {
      // Validate the seed: previously a non-numeric value produced NaN,
      // which the PRNG silently collapsed to seed 0 (NaN | 0 === 0)
      // while the log still reported "Seed: NaN".
      const parsedSeed = Number.parseInt(args[i + 1], 10);
      if (Number.isNaN(parsedSeed)) {
        console.error(`[ERROR] --seed expects an integer, got "${args[i + 1]}"`);
        process.exit(1);
      }
      seed = parsedSeed;
      i++;
    }
    if (args[i] === '--output' && args[i + 1]) { outputPath = args[i + 1]; i++; }
  }

  const rng = mulberry32(seed);
  const curated = loadCuratedPackages();
  console.log(` Loaded ${curated.size} curated packages to exclude`);
  console.log(` Seed: ${seed}`);

  // Phase 1: Collect candidate packages from npm search
  console.log(`\n [1/3] Collecting candidates from npm search...`);
  const candidates = new Map(); // name -> { name, version, description }
  const shuffledKeywords = shuffleArray([...SEARCH_KEYWORDS], rng);

  for (let i = 0; i < shuffledKeywords.length; i++) {
    const keyword = shuffledKeywords[i];
    if (process.stdout.isTTY) {
      process.stdout.write(`\r Searching "${keyword}" (${i + 1}/${shuffledKeywords.length})... `);
    }

    // Search with random offset for diversity
    const offset = Math.floor(rng() * 200);
    const results = await searchNpm(keyword, offset, 250);

    for (const pkg of results) {
      // Exclusion filters: duplicates, curated corpus, @types stubs,
      // deprecated packages, and junk '_'-prefixed names.
      if (candidates.has(pkg.name)) continue;
      if (curated.has(pkg.name)) continue;
      if (pkg.name.startsWith('@types/')) continue;
      if (pkg.deprecated) continue;
      if (pkg.name.startsWith('_')) continue;

      candidates.set(pkg.name, pkg);
    }

    // Stop early if we have enough candidates
    if (candidates.size >= 2000) break;

    // Rate limiting: ~100ms between requests
    await new Promise(r => setTimeout(r, 100));
  }

  if (process.stdout.isTTY) {
    process.stdout.write('\r' + ''.padEnd(80) + '\r');
  }
  console.log(` Collected ${candidates.size} unique candidates`);

  // Phase 2: Classify by dependency count.
  // Over-collect up to OVER_COLLECT x quota per stratum so phase 3 has a
  // backfill pool when large/vlarge can't meet quota.
  console.log(`\n [2/3] Classifying by dependency count...`);
  const buckets = { small: [], medium: [], large: [], vlarge: [] };
  const candidateList = shuffleArray([...candidates.keys()], rng);

  let processed = 0;
  const OVER_COLLECT = 2;

  for (const pkgName of candidateList) {
    // Stop once every bucket holds its full over-collected pool
    const allOverCollected = Object.entries(STRATA).every(
      ([name, { quota }]) => buckets[name].length >= quota * OVER_COLLECT
    );
    if (allOverCollected) break;

    processed++;
    if (process.stdout.isTTY && processed % 10 === 0) {
      const bucketStatus = Object.entries(buckets).map(([k, v]) => `${k}:${v.length}/${STRATA[k].quota}`).join(' ');
      process.stdout.write(`\r Classifying [${processed}/${candidateList.length}] ${bucketStatus} `);
    }

    const info = await getDepCount(pkgName);
    if (!info) continue;

    const stratum = classifyStratum(info.totalDeps);
    if (buckets[stratum].length < STRATA[stratum].quota * OVER_COLLECT) {
      buckets[stratum].push({ name: pkgName, deps: info.totalDeps, stratum });
    }

    // Rate limiting
    await new Promise(r => setTimeout(r, 50));
  }

  if (process.stdout.isTTY) {
    process.stdout.write('\r' + ''.padEnd(80) + '\r');
  }

  // Phase 3: Output with backfill.
  // If large/vlarge strata can't meet quota, redistribute remaining slots
  // to small/medium (reflects real npm distribution).
  console.log(`\n [3/3] Writing results...`);
  const selected = [];
  let deficit = 0;
  for (const [name, { quota }] of Object.entries(STRATA)) {
    const actual = Math.min(buckets[name].length, quota);
    console.log(` ${name}: ${actual}/${quota} packages`);
    selected.push(...buckets[name].slice(0, actual));
    deficit += quota - actual;
  }

  // Backfill deficit from small/medium overflow (priority order)
  if (deficit > 0) {
    console.log(` Backfilling ${deficit} slots from small/medium overflow...`);
    const backfillSources = ['small', 'medium']; // priority order
    for (const src of backfillSources) {
      if (deficit <= 0) break;
      const overflow = buckets[src].slice(STRATA[src].quota);
      const take = Math.min(overflow.length, deficit);
      if (take > 0) {
        selected.push(...overflow.slice(0, take));
        deficit -= take;
        console.log(` +${take} from ${src} overflow`);
      }
    }
  }

  const totalSelected = selected.length;
  console.log(`\n Total: ${totalSelected}/200 packages`);

  if (totalSelected < 200) {
    console.warn(`\n [WARN] Only ${totalSelected} packages found. Re-run with different --seed or add more search keywords.`);
  }

  // Write output file.
  // Track already-written names so backfill entries are not duplicated.
  const writtenNames = new Set();
  const header = [
    '# MUAD\'DIB Benign Random Dataset — npm stratified random sample',
    `# Generated: ${new Date().toISOString()}`,
    `# Seed: ${seed}`,
    `# Total: ${totalSelected} packages`,
    '# Strata: small (<10 deps): 80, medium (10-50): 60, large (51-100): 40, vlarge (100+): 20',
    '# Backfill: unfilled large/vlarge slots redistributed to small/medium',
    '# Used by `muaddib evaluate` to measure FPR on representative npm sample',
    ''
  ];

  const lines = [];
  for (const [name, { quota }] of Object.entries(STRATA)) {
    const actual = Math.min(buckets[name].length, quota);
    lines.push(`# === ${name} (${actual}/${quota}) ===`);
    for (const pkg of buckets[name].slice(0, actual)) {
      lines.push(pkg.name);
      writtenNames.add(pkg.name);
    }
    lines.push('');
  }

  // Backfill section (additional packages from overflow)
  const backfillPkgs = selected.filter(p => !writtenNames.has(p.name));
  if (backfillPkgs.length > 0) {
    lines.push(`# === backfill (${backfillPkgs.length}) ===`);
    for (const pkg of backfillPkgs) {
      lines.push(pkg.name);
    }
    lines.push('');
  }

  fs.mkdirSync(path.dirname(outputPath), { recursive: true });
  fs.writeFileSync(outputPath, header.join('\n') + lines.join('\n'));
  console.log(` Written to: ${path.relative(ROOT, outputPath)}`);

  // Verify no overlap with curated corpus (sanity check on the filters)
  const overlap = selected.filter(p => curated.has(p.name));
  if (overlap.length > 0) {
    console.error(`\n [ERROR] ${overlap.length} packages overlap with curated corpus: ${overlap.map(p => p.name).join(', ')}`);
  } else {
    console.log(' No overlap with curated corpus');
  }
}
|
|
335
|
+
|
|
336
|
+
// Kick off the sampler; any unhandled error aborts with a non-zero exit code.
main().catch(err => {
  console.error(`[ERROR] ${err.message}`);
  process.exit(1);
});
|