ruvector 0.2.18 → 0.2.20
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +20 -7
- package/bin/cli.js +1569 -1553
- package/bin/mcp-server.js +508 -892
- package/package.json +23 -13
- package/src/decompiler/index.js +407 -0
- package/src/decompiler/metrics.js +86 -0
- package/src/decompiler/module-splitter.js +498 -0
- package/src/decompiler/module-tree.js +142 -0
- package/src/decompiler/name-predictor.js +400 -0
- package/src/decompiler/npm-fetch.js +176 -0
- package/src/decompiler/reconstructor.js +499 -0
- package/src/decompiler/reference-tracker.js +285 -0
- package/src/decompiler/statement-parser.js +285 -0
- package/src/decompiler/style-improver.js +438 -0
- package/src/decompiler/subcategories.js +339 -0
- package/src/decompiler/validator.js +379 -0
- package/src/decompiler/witness.js +140 -0
|
@@ -0,0 +1,498 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* module-splitter.js - Split a JavaScript bundle into logical modules.
|
|
3
|
+
*
|
|
4
|
+
* Splits at statement boundaries; classifies via fine-grained keyword scoring;
|
|
5
|
+
* sub-splits mega-statements at bundler wrapper boundaries; validates output.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
'use strict';
|
|
9
|
+
|
|
10
|
+
// ── Extracted modules ──────────────────────────────────────────────────────
|
|
11
|
+
const { SUBCATEGORIES, MODULE_KEYWORDS, STRING_PATTERNS } = require('./subcategories');
|
|
12
|
+
const { buildModuleTree } = require('./module-tree');
|
|
13
|
+
const { parseTopLevelStatements } = require('./statement-parser');
|
|
14
|
+
|
|
15
|
+
// Simple regex patterns for extracting declarations.
|
|
16
|
+
const SIMPLE_PATTERNS = {
|
|
17
|
+
'telemetry-events': /"tengu_[^"]*"/g,
|
|
18
|
+
'command-defs': /name:"[a-z][-a-z]*",description:"[^"]*"/g,
|
|
19
|
+
'class-hierarchy': /class \w+( extends \w+)?/g,
|
|
20
|
+
'env-vars': /CLAUDE_[A-Z_]+/g,
|
|
21
|
+
'api-endpoints': /\/v\d+\/[a-z][-a-z/]*/g,
|
|
22
|
+
};
|
|
23
|
+
|
|
24
|
+
// ── Statement Classifier ────────────────────────────────────────────────────
|
|
25
|
+
|
|
26
|
+
/**
 * Make an arbitrary string safe to embed in a `RegExp` source by
 * backslash-escaping every regular-expression metacharacter it contains.
 *
 * @param {string} s - literal text to be matched verbatim
 * @returns {string} the same text with metacharacters escaped
 */
function escapeRegex(s) {
  // Every character that has special meaning inside a pattern.
  const metaCharacters = /[.*+?^${}()|[\]\\]/g;
  // `$&` re-inserts the matched character after the added backslash.
  return s.replace(metaCharacters, '\\$&');
}
|
|
34
|
+
|
|
35
|
+
// Compiled classifier patterns, keyed by flags + pattern source. The keyword
// tables are consulted once per statement, so compiling each pattern a single
// time avoids (tables x statements) RegExp constructions on large bundles.
// A value of `null` records a pattern that failed to compile.
const classifierRegexCache = new Map();

/**
 * Return a cached RegExp for `source`/`flags`, or null when it is invalid.
 * @param {string} source - regular expression source text
 * @param {string} flags - RegExp flags ('' or 'g')
 * @returns {RegExp|null}
 */
function getClassifierRegex(source, flags) {
  const key = `${flags}\u0000${source}`;
  if (!classifierRegexCache.has(key)) {
    let compiled = null;
    try {
      compiled = new RegExp(source, flags);
    } catch {
      // Invalid pattern in the keyword table -- remember it as unusable.
      compiled = null;
    }
    classifierRegexCache.set(key, compiled);
  }
  return classifierRegexCache.get(key);
}

/**
 * Count non-overlapping literal occurrences of `literal` inside `code`.
 * An empty literal would match at every position and swamp the score,
 * so it counts as zero.
 * @param {string} code
 * @param {string} literal
 * @returns {number}
 */
function countLiteralOccurrences(code, literal) {
  if (literal === '') return 0;
  const pattern = getClassifierRegex(escapeRegex(literal), 'g');
  // String.prototype.match resets lastIndex for global patterns, so sharing
  // the cached RegExp between calls is safe.
  const hits = pattern ? code.match(pattern) : null;
  return hits ? hits.length : 0;
}

/**
 * Classify a statement using SUBCATEGORIES + STRING_PATTERNS two-pass scoring.
 *
 * Scoring per candidate module:
 *   - SUBCATEGORIES keyword containing `.*`: treated as a regex, +3 if it matches.
 *   - other SUBCATEGORIES keyword: +2 per literal occurrence.
 *   - STRING_PATTERNS entry: +3 per literal occurrence.
 * The highest-scoring module wins (first one wins ties); a total below 2
 * yields 'uncategorized'.
 *
 * @param {string} code - the complete statement text
 * @returns {string} hierarchical module name (e.g. 'tools/bash')
 */
function classifyStatement(code) {
  let bestModule = 'uncategorized';
  let bestScore = 0;

  // Collect all module names from both maps
  const allModules = new Set([
    ...Object.keys(SUBCATEGORIES),
    ...Object.keys(STRING_PATTERNS),
  ]);

  for (const modName of allModules) {
    let score = 0;

    // Pass 1: SUBCATEGORIES (identifier/keyword matching)
    const keywords = SUBCATEGORIES[modName];
    if (keywords) {
      for (const kw of keywords) {
        if (kw.includes('.*')) {
          // Keyword is itself a regex; an invalid one is skipped.
          const pattern = getClassifierRegex(kw, '');
          if (pattern && pattern.test(code)) score += 3;
        } else {
          score += countLiteralOccurrences(code, kw) * 2;
        }
      }
    }

    // Pass 2: STRING_PATTERNS (quoted string matching for minified code)
    const strPatterns = STRING_PATTERNS[modName];
    if (strPatterns) {
      for (const pat of strPatterns) {
        // Count occurrences -- string literals are strong signals
        score += countLiteralOccurrences(code, pat) * 3;
      }
    }

    if (score > bestScore) {
      bestScore = score;
      bestModule = modName;
    }
  }

  // Require a minimum score to avoid false positives
  return bestScore >= 2 ? bestModule : 'uncategorized';
}
|
|
95
|
+
|
|
96
|
+
// ── Syntax Validation ───────────────────────────────────────────────────────
|
|
97
|
+
|
|
98
|
+
/**
 * Heuristically decide whether `code` is syntactically valid JavaScript.
 *
 * Empty or whitespace-only input counts as valid. Otherwise the text is run
 * through stripESMStatements() and handed to the Function constructor (which
 * parses but never runs it) in three shapes: bare, inside an async function
 * wrapper, and with a "use strict" prologue. If none parse, balanced
 * brackets in the original text are accepted as a last resort.
 *
 * @param {string} code
 * @returns {boolean}
 */
function isSyntacticallyValid(code) {
  if (!code || code.trim().length === 0) return true;

  // import/export declarations cannot appear in a function body.
  const body = stripESMStatements(code);

  // Parse attempts, in order; each builds the text lazily.
  const attempts = [
    () => body,
    () => 'return async function _(){' + body + '}', // allows top-level await
    () => '"use strict";' + body,
  ];

  const parses = (build) => {
    try {
      new Function(build());
      return true;
    } catch {
      return false;
    }
  };

  if (attempts.some(parses)) return true;

  // Last resort: balanced delimiters are taken as "likely valid ESM".
  return hasBraceBalance(code) ? true : false;
}
|
|
139
|
+
|
|
140
|
+
/**
 * Rewrite ESM-only syntax so the result can be syntax-checked with
 * `new Function()` (which parses script code, not module code).
 *
 * The output is only meant for parsing, never for execution:
 *   - `import ... from "m"` / `import "m"` declarations (with or without
 *     whitespace, as produced by minifiers) become a comment.
 *   - `import.meta` and `import.meta.<prop>` become a string literal.
 *   - `export default <expr|declaration>` becomes an assignment so anonymous
 *     functions/classes remain parsable.
 *   - `export const|let|var|function|class|async ...` keeps the declaration
 *     and drops only the `export` keyword.
 *   - `export { ... } [from "m"]` and `export * [as ns] from "m"` become a
 *     comment.
 *
 * Declarations are only recognized at the start of a line.
 *
 * @param {string} code
 * @returns {string}
 */
function stripESMStatements(code) {
  return code
    // Import declarations. The lookahead keeps identifiers such as
    // `importScripts` and dynamic `import(...)` calls untouched.
    .replace(
      /^\s*import(?=[\s{*"'])\s*(?:[^;]*?\bfrom\b\s*)?["'][^"']*["']\s*;?/gm,
      '/* import stripped */'
    )
    // import.meta is a syntax error outside modules; stub it with a string.
    .replace(/import\.meta(?:\.\w+)?/g, '"import_meta_stub"')
    // `export default X` -> assignment, valid for expressions, classes and
    // (anonymous) functions alike.
    .replace(
      /^(\s*)export\s+default\b\s*/gm,
      '$1/* export stripped */ __esm_default__ = '
    )
    // Exported declarations: keep the declaration itself.
    .replace(
      /^(\s*)export\s+(?=(?:const|let|var|function|class|async)\b)/gm,
      '$1/* export stripped */ '
    )
    // Export lists and re-exports carry no code of their own.
    .replace(
      /^\s*export\s*(?:\{[^}]*\}|\*(?:\s*as\s+[\w$]+)?)\s*(?:from\s*["'][^"']*["'])?\s*;?/gm,
      '/* export stripped */'
    );
}
|
|
163
|
+
|
|
164
|
+
/**
 * Report whether `{}`, `()` and `[]` are balanced in `code`.
 *
 * Each delimiter kind is counted independently, so interleaved pairs are
 * not detected. Text between matching `"`, `'` or `` ` `` quotes is ignored,
 * and a backslash inside a quoted run skips the following character.
 * Serves as the last-resort validity heuristic for ESM code.
 *
 * @param {string} code
 * @returns {boolean} true when every counter ends at zero and none ever
 *   dropped below zero
 */
function hasBraceBalance(code) {
  let curly = 0;
  let round = 0;
  let square = 0;
  let openQuote = null; // delimiter of the string being skipped, if any

  for (let pos = 0; pos < code.length; pos += 1) {
    const c = code[pos];

    if (openQuote !== null) {
      if (c === '\\') {
        pos += 1; // step over the escaped character
      } else if (c === openQuote) {
        openQuote = null;
      }
      continue;
    }

    switch (c) {
      case '"':
      case "'":
      case '`':
        openQuote = c;
        break;
      case '{':
        curly += 1;
        break;
      case '(':
        round += 1;
        break;
      case '[':
        square += 1;
        break;
      case '}':
        curly -= 1;
        if (curly < 0) return false; // closer with no opener
        break;
      case ')':
        round -= 1;
        if (round < 0) return false;
        break;
      case ']':
        square -= 1;
        if (square < 0) return false;
        break;
      default:
        break;
    }
  }

  return curly === 0 && round === 0 && square === 0;
}
|
|
204
|
+
|
|
205
|
+
// ── Mega-Statement Sub-Splitter ─────────────────────────────────────────────
|
|
206
|
+
|
|
207
|
+
/**
 * Break one very large statement into smaller chunks at top-level
 * declaration boundaries.
 *
 * The text is scanned once, left to right, tracking a combined nesting depth
 * for `{}`, `()` and `[]` while skipping quoted strings, `//` line comments
 * and block comments. Whenever a `;` is met at depth 0 and the next
 * non-whitespace text starts with `var `, `let `, `const `, `function ` or
 * `class `, the text accumulated so far is emitted as a chunk, provided it
 * is longer than 50 characters after trimming.
 *
 * Inputs shorter than 200 characters, and inputs that yield fewer than two
 * chunks, are returned unchanged as a single-element array.
 *
 * @param {string} code - a very large statement
 * @returns {string[]} the chunks, or `[code]` when no useful split exists
 */
function splitMegaStatement(code) {
  const total = code.length;
  if (total < 200) return [code];

  const MIN_CHUNK_LENGTH = 50;
  const DECLARATION_STARTS = ['var ', 'let ', 'const ', 'function ', 'class '];
  const WHITESPACE = ' \n\r\t';

  const pieces = [];
  let nesting = 0;
  let pieceStart = 0;
  let quote = ''; // active string delimiter; empty while outside a string

  for (let pos = 0; pos < total; pos++) {
    const c = code[pos];

    // Inside a string: ignore everything until the closing delimiter.
    if (quote !== '') {
      if (c === '\\') {
        pos++; // also skip the escaped character
      } else if (c === quote) {
        quote = '';
      }
      continue;
    }
    if (c === '"' || c === "'" || c === '`') {
      quote = c;
      continue;
    }

    // Comments: jump to the last character of the comment; the loop
    // increment then lands on the first character after it.
    if (c === '/' && pos + 1 < total) {
      const next = code[pos + 1];
      if (next === '/') {
        const eol = code.indexOf('\n', pos + 2);
        pos = (eol === -1 ? total : eol + 1) - 1;
        continue;
      }
      if (next === '*') {
        const close = code.indexOf('*/', pos + 2);
        pos = (close === -1 ? total : close + 2) - 1;
        continue;
      }
    }

    if ('{(['.includes(c)) {
      nesting++;
    } else if ('})]'.includes(c)) {
      nesting = Math.max(0, nesting - 1); // never go negative on stray closers
    }

    // Only a top-level semicolon with some text left can be a boundary.
    if (c !== ';' || nesting !== 0 || pos + 5 >= total) continue;

    // Look past whitespace for the start of a new declaration.
    let peek = pos + 1;
    while (peek < total && WHITESPACE.includes(code[peek])) peek++;
    const startsDeclaration = DECLARATION_STARTS.some((kw) => code.startsWith(kw, peek));
    if (!startsDeclaration) continue;

    const piece = code.substring(pieceStart, pos + 1).trim();
    if (piece.length > MIN_CHUNK_LENGTH) {
      pieces.push(piece);
      pieceStart = pos + 1;
    }
    // A too-short piece is left to grow into the next boundary.
  }

  // Whatever follows the last boundary.
  const tail = code.substring(pieceStart).trim();
  if (tail.length > MIN_CHUNK_LENGTH) {
    pieces.push(tail);
  } else if (pieces.length > 0 && tail.length > 0) {
    pieces[pieces.length - 1] += tail;
  }

  return pieces.length >= 2 ? pieces : [code];
}
|
|
288
|
+
|
|
289
|
+
// ── Main API ────────────────────────────────────────────────────────────────
|
|
290
|
+
|
|
291
|
+
/**
 * Split source code into named modules at statement boundaries.
 *
 * Pipeline:
 *   1. Parse `source` into top-level statements; statements larger than
 *      100 KB are further divided with splitMegaStatement().
 *   2. Classify each statement with classifyStatement().
 *   3. Group statements per module; modules whose confidence falls below
 *      `minConfidence` are folded into the unclassified list.
 *   4. Add modules built from the simple regex extractions. These are
 *      heuristically validated with isSyntacticallyValid(); rejected ones go
 *      to the unclassified list. Modules built from parsed statements are
 *      accepted without re-validation.
 *   5. Append an 'uncategorized' module when anything was left unclassified.
 *   6. Build the module tree.
 *
 * @param {string} source - the full JavaScript source (ideally beautified)
 * @param {object} [options]
 * @param {number} [options.minConfidence=0.3] - minimum confidence to include a module
 * @returns {{modules: Array<{name: string, content: string, fragments: number, confidence: number}>, unclassified: string[], tree: object}}
 */
function splitModules(source, options = {}) {
  const { minConfidence = 0.3 } = options;

  // Step 1: Parse into top-level statements (never splits mid-expression).
  // Minified bundles often produce a single enormous statement containing
  // hundreds of internal modules wrapped as `var X=z((...)=>{...})`;
  // sub-splitting those gives finer granularity.
  const MEGA_THRESHOLD = 100 * 1024; // 100 KB
  const statements = [];
  for (const stmt of parseTopLevelStatements(source)) {
    const subs = stmt.code.length > MEGA_THRESHOLD ? splitMegaStatement(stmt.code) : null;
    if (subs && subs.length > 1) {
      for (const sub of subs) {
        // Sub-chunks inherit the offsets of the statement they came from.
        statements.push({ code: sub, start: stmt.start, end: stmt.end });
      }
    } else {
      statements.push(stmt);
    }
  }

  // Step 2: Classify each complete statement
  const classified = {}; // moduleName -> string[]
  const unclassifiedList = [];

  for (const stmt of statements) {
    if (stmt.code.length < 5) continue;

    const modName = classifyStatement(stmt.code);
    if (modName === 'uncategorized') {
      unclassifiedList.push(stmt.code);
    } else {
      if (!classified[modName]) classified[modName] = [];
      classified[modName].push(stmt.code);
    }
  }

  // Step 3: Build module objects from the classified statements.
  const totalStatements = statements.length;
  const validModules = [];

  for (const [name, fragments] of Object.entries(classified)) {
    // Confidence grows with the module's share of statements and saturates
    // once it holds a tenth of them.
    const confidence = Math.min(1, fragments.length / Math.max(1, totalStatements / 10));

    if (confidence >= minConfidence || minConfidence === 0) {
      validModules.push({
        name,
        content: fragments.join('\n\n'),
        fragments: fragments.length,
        confidence: parseFloat(confidence.toFixed(3)),
      });
    } else {
      // Below confidence threshold: merge into uncategorized. Pushed one at
      // a time because spreading a very large array into push() can exceed
      // the engine's argument limit.
      for (const fragment of fragments) {
        unclassifiedList.push(fragment);
      }
    }
  }

  // Step 4: Extract simple pattern matches as additional modules. Unlike the
  // statement-built modules above, these are raw regex hits, so they are
  // checked before being accepted.
  const simplePatterns = extractSimplePatterns(source);
  for (const [name, items] of Object.entries(simplePatterns)) {
    if (classified[name]) continue;

    const content = items.join('\n');
    if (isSyntacticallyValid(content)) {
      validModules.push({
        name,
        content,
        fragments: items.length,
        confidence: 0.5,
      });
    } else {
      unclassifiedList.push(content);
    }
  }

  // Step 5: Always include uncategorized for 100% coverage
  if (unclassifiedList.length > 0) {
    validModules.push({
      name: 'uncategorized',
      content: unclassifiedList.join('\n\n'),
      fragments: unclassifiedList.length,
      confidence: 0.1,
    });
  }

  // Step 6: Build hierarchical tree from co-reference density
  const tree = buildModuleTree(validModules, source);

  return { modules: validModules, unclassified: unclassifiedList, tree };
}
|
|
419
|
+
|
|
420
|
+
/**
 * Legacy entry point: return the text of every top-level statement in
 * `source`, as produced by parseTopLevelStatements().
 *
 * @param {string} source
 * @param {number} [maxChunk=2048] - accepted for backward compatibility; not used
 * @returns {string[]} statement texts in source order
 */
function splitStatements(source, maxChunk = 2048) {
  const texts = [];
  for (const statement of parseTopLevelStatements(source)) {
    texts.push(statement.code);
  }
  return texts;
}
|
|
432
|
+
|
|
433
|
+
/**
 * Legacy entry point: bucket statement texts by the module name that
 * classifyStatement() assigns to them.
 *
 * Statements shorter than 5 characters are dropped. Each kept statement is
 * stored trimmed. Statements classified as 'uncategorized' are collected
 * under the `_unclassified` key, which is only present when non-empty.
 *
 * @param {string[]} statements
 * @returns {Object<string, string[]>} module name -> trimmed statements
 */
function classifyStatements(statements) {
  const byModule = {};
  const leftovers = [];

  for (const text of statements) {
    if (text.length >= 5) {
      const label = classifyStatement(text);
      let bucket;
      if (label === 'uncategorized') {
        bucket = leftovers;
      } else {
        if (!byModule[label]) byModule[label] = [];
        bucket = byModule[label];
      }
      bucket.push(text.trim());
    }
  }

  if (leftovers.length > 0) {
    byModule['_unclassified'] = leftovers;
  }

  return byModule;
}
|
|
461
|
+
|
|
462
|
+
/**
 * Run every regex in SIMPLE_PATTERNS over `source` and collect the distinct
 * matches for each pattern name.
 *
 * Matches are trimmed, kept only when longer than 3 characters, and
 * de-duplicated while preserving first-seen order. Pattern names with no
 * surviving match are omitted from the result.
 *
 * @param {string} source
 * @returns {Object<string, string[]>} pattern name -> unique matched texts
 */
function extractSimplePatterns(source) {
  const found = {};

  Object.keys(SIMPLE_PATTERNS).forEach((label) => {
    const matcher = SIMPLE_PATTERNS[label];
    // The patterns are shared global regexes; rewind before scanning.
    matcher.lastIndex = 0;

    const unique = new Set();
    for (let hit = matcher.exec(source); hit !== null; hit = matcher.exec(source)) {
      const text = hit[0].trim();
      if (text.length > 3) {
        unique.add(text);
      }
    }

    if (unique.size > 0) {
      found[label] = Array.from(unique);
    }
  });

  return found;
}
|
|
485
|
+
|
|
486
|
+
module.exports = {
|
|
487
|
+
splitModules,
|
|
488
|
+
splitStatements,
|
|
489
|
+
classifyStatements,
|
|
490
|
+
extractSimplePatterns,
|
|
491
|
+
buildModuleTree,
|
|
492
|
+
parseTopLevelStatements,
|
|
493
|
+
classifyStatement,
|
|
494
|
+
isSyntacticallyValid,
|
|
495
|
+
hasBraceBalance,
|
|
496
|
+
MODULE_KEYWORDS,
|
|
497
|
+
SUBCATEGORIES,
|
|
498
|
+
};
|
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* module-tree.js - Hierarchical module tree builder.
|
|
3
|
+
*
|
|
4
|
+
* Builds a tree from co-reference density between modules using
|
|
5
|
+
* agglomerative clustering and discriminative token naming.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
'use strict';
|
|
9
|
+
|
|
10
|
+
/**
 * Build a two-level module tree from co-reference density.
 *
 * 1. Collect, per module, the set of short identifier-like string literals
 *    (3-31 word characters, quoted) found in its content.
 * 2. Weight each module pair by the number of tokens they share.
 * 3. Greedily merge the pair of clusters with the highest size-normalized
 *    weight until at most 3 clusters remain or no pair shares a token.
 * 4. Name each cluster with inferGroupName(); names that repeat among
 *    siblings get a numeric suffix (`_2`, `_3`, ...) so that every child
 *    `path` is unique.
 *
 * With zero or one module the root node holds the modules directly and has
 * no children.
 *
 * @param {Array<{name: string, content: string, fragments: number, confidence: number}>} modules
 * @param {string} source - full source text; currently not consulted
 * @returns {{name: string, path: string, modules: Array, children: Array, depth: number}}
 */
function buildModuleTree(modules, source) {
  if (modules.length <= 1) {
    return {
      name: 'src',
      path: 'src',
      modules,
      children: [],
      depth: 0,
    };
  }

  // Extract string tokens from each module's content.
  const moduleTokens = modules.map((m) => {
    const tokens = new Set();
    const re = /["']([a-zA-Z_]\w{2,30})["']/g;
    let match;
    while ((match = re.exec(m.content)) !== null) {
      tokens.add(match[1]);
    }
    return tokens;
  });

  // Build adjacency: weight = number of shared tokens. Keys are "i:j" with i < j.
  const weights = new Map();
  for (let i = 0; i < modules.length; i++) {
    for (let j = i + 1; j < modules.length; j++) {
      // Walk the smaller set and probe the larger one; the count is the same
      // either way.
      const iIsSmaller = moduleTokens[i].size <= moduleTokens[j].size;
      const smaller = iIsSmaller ? moduleTokens[i] : moduleTokens[j];
      const larger = iIsSmaller ? moduleTokens[j] : moduleTokens[i];
      let shared = 0;
      for (const tok of smaller) {
        if (larger.has(tok)) shared++;
      }
      if (shared > 0) {
        weights.set(`${i}:${j}`, shared);
      }
    }
  }

  // Agglomerative clustering: start with one cluster per module.
  const clusters = modules.map((_, i) => [i]);

  while (clusters.length > 3) {
    let bestI = 0;
    let bestJ = 1;
    let bestW = -1;
    for (let i = 0; i < clusters.length; i++) {
      for (let j = i + 1; j < clusters.length; j++) {
        const w = clusterWeight(clusters[i], clusters[j], weights);
        // Normalize by combined size so large clusters do not absorb everything.
        const norm = w / (clusters[i].length + clusters[j].length);
        if (norm > bestW) {
          bestW = norm;
          bestI = i;
          bestJ = j;
        }
      }
    }
    if (bestW <= 0) break; // nothing left that shares a token
    const merged = [...clusters[bestI], ...clusters[bestJ]];
    // Remove the higher index first so the lower index stays valid.
    clusters.splice(bestJ, 1);
    clusters.splice(bestI, 1);
    clusters.push(merged);
  }

  // Name each cluster from discriminative tokens, keeping sibling names unique.
  const usedNames = new Set();
  const children = clusters.map((group) => {
    const baseName = inferGroupName(group, moduleTokens, modules);
    let name = baseName;
    for (let suffix = 2; usedNames.has(name); suffix++) {
      name = `${baseName}_${suffix}`;
    }
    usedNames.add(name);

    return {
      name,
      path: `src/${name}`,
      modules: group.map((i) => modules[i]),
      children: [],
      depth: 1,
    };
  });

  return {
    name: 'src',
    path: 'src',
    modules: [],
    children,
    depth: 0,
  };
}
|
|
101
|
+
|
|
102
|
+
/**
 * Sum the pairwise shared-token weights between two clusters.
 *
 * @param {number[]} a - module indices of the first cluster
 * @param {number[]} b - module indices of the second cluster
 * @param {Map<string, number>} weights - pair weights keyed as "low:high"
 * @returns {number} total weight over every (a, b) index pair; missing
 *   pairs contribute 0
 */
function clusterWeight(a, b, weights) {
  return a.reduce((sum, left) => {
    let subtotal = sum;
    for (const right of b) {
      // Keys always list the smaller index first.
      const pairKey = left < right ? `${left}:${right}` : `${right}:${left}`;
      subtotal += weights.get(pairKey) || 0;
    }
    return subtotal;
  }, 0);
}
|
|
113
|
+
|
|
114
|
+
/**
 * Pick a name for a cluster of modules from its most discriminative token.
 *
 * Every token is scored as (count in group / (count overall + 1)) *
 * ln(count in group + 1), where counts are numbers of modules containing the
 * token. The first token with the highest score and at least 3 characters
 * wins; it is lower-cased and any character outside `[a-z0-9_-]` becomes `_`.
 * Without a usable token the name of the group's first module is returned,
 * or 'group' for an empty group.
 *
 * @param {number[]} group - module indices in the cluster
 * @param {Array<Set<string>>} moduleTokens - token set per module index
 * @param {Array<{name: string}>} modules - module descriptors, same indexing
 * @returns {string}
 */
function inferGroupName(group, moduleTokens, modules) {
  // Count, for each token, how many of the given sets contain it.
  const tally = (tokenSets) => {
    const counts = new Map();
    for (const tokenSet of tokenSets) {
      for (const token of tokenSet) {
        counts.set(token, (counts.get(token) || 0) + 1);
      }
    }
    return counts;
  };

  const inGroup = tally(group.map((index) => moduleTokens[index]));
  const overall = tally(moduleTokens);

  let winner = null;
  let winnerScore = -1;
  inGroup.forEach((count, token) => {
    const everywhere = overall.get(token) || 0;
    const score = (count / (everywhere + 1)) * Math.log(count + 1);
    if (score > winnerScore && token.length >= 3) {
      winnerScore = score;
      winner = token;
    }
  });

  if (winner) {
    return winner.toLowerCase().replace(/[^a-z0-9_-]/g, '_');
  }
  return group.length > 0 ? modules[group[0]].name : 'group';
}
|
|
141
|
+
|
|
142
|
+
module.exports = { buildModuleTree };
|