graphifyy 0.3.17 → 0.3.28
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.ja-JP.md +60 -17
- package/README.md +41 -13
- package/README.zh-CN.md +54 -17
- package/dist/cli.js +862 -369
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +1070 -598
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +36 -6
- package/dist/index.d.ts +36 -6
- package/dist/index.js +1092 -614
- package/dist/index.js.map +1 -1
- package/dist/skill-runtime.js +1182 -669
- package/dist/skill-runtime.js.map +1 -1
- package/package.json +14 -4
- package/src/skills/skill-claw.md +1 -0
- package/src/skills/skill-codex.md +69 -11
- package/src/skills/skill-droid.md +73 -6
- package/src/skills/skill-gemini.toml +207 -0
- package/src/skills/skill-opencode.md +73 -6
- package/src/skills/skill-trae.md +1 -0
- package/src/skills/skill-windows.md +76 -5
- package/src/skills/skill.md +82 -8
package/dist/skill-runtime.js
CHANGED
|
@@ -7,14 +7,13 @@ var __require = /* @__PURE__ */ ((x) => typeof require !== "undefined" ? require
|
|
|
7
7
|
|
|
8
8
|
// src/skill-runtime.ts
|
|
9
9
|
import { Command } from "commander";
|
|
10
|
-
import Graph3 from "graphology";
|
|
11
10
|
import {
|
|
12
|
-
existsSync as
|
|
13
|
-
mkdirSync as
|
|
11
|
+
existsSync as existsSync8,
|
|
12
|
+
mkdirSync as mkdirSync5,
|
|
14
13
|
readFileSync as readFileSync5,
|
|
15
|
-
writeFileSync as
|
|
14
|
+
writeFileSync as writeFileSync6
|
|
16
15
|
} from "fs";
|
|
17
|
-
import { dirname as
|
|
16
|
+
import { dirname as dirname4, join as join5, resolve as resolve7 } from "path";
|
|
18
17
|
import { fileURLToPath } from "url";
|
|
19
18
|
|
|
20
19
|
// src/analyze.ts
|
|
@@ -36,10 +35,75 @@ function toNumericMap(value) {
|
|
|
36
35
|
|
|
37
36
|
// src/cluster.ts
|
|
38
37
|
import louvain from "graphology-communities-louvain";
|
|
38
|
+
|
|
39
|
+
// src/graph.ts
|
|
40
|
+
import Graph from "graphology";
|
|
41
|
+
/**
 * Create an empty graphology graph that does not allow parallel edges.
 *
 * @param {boolean} [directed=false] - when truthy the graph type is "directed", otherwise "undirected".
 * @returns {Graph} a new, empty graph instance.
 */
function createGraph(directed = false) {
  const type = directed ? "directed" : "undirected";
  return new Graph({ type, multi: false });
}
|
|
44
|
+
/**
 * Tell whether a graph reports itself as directed.
 *
 * @param {{type: string}} G - graph whose `type` property is inspected.
 * @returns {boolean} true only when `G.type` is exactly "directed".
 */
function isDirectedGraph(G) {
  const { type } = G;
  return type === "directed";
}
|
|
47
|
+
/**
 * Rebuild a graph from its serialized (node-link style) form.
 *
 * Reads `raw.directed`, `raw.graph` (graph-level attributes), `raw.nodes`,
 * `raw.links` (falling back to `raw.edges`) and `raw.hyperedges`.
 *
 * @param {object} raw - parsed graph JSON.
 * @returns {Graph} the populated graph.
 */
function loadGraphFromData(raw) {
  const graph = createGraph(raw.directed === true);

  const graphAttrs = raw.graph ?? {};
  for (const name of Object.keys(graphAttrs)) {
    graph.setAttribute(name, graphAttrs[name]);
  }

  for (const { id, ...nodeAttrs } of raw.nodes ?? []) {
    graph.mergeNode(id, nodeAttrs);
  }

  for (const { source, target, ...edgeAttrs } of raw.links ?? raw.edges ?? []) {
    // Edges that point at nodes missing from the node list are dropped.
    if (graph.hasNode(source) && graph.hasNode(target)) {
      try {
        graph.mergeEdge(source, target, edgeAttrs);
      } catch {
        // Best effort: an edge the graph rejects is skipped, not fatal.
      }
    }
  }

  if (raw.hyperedges?.length > 0) {
    graph.setAttribute("hyperedges", raw.hyperedges);
  }
  return graph;
}
|
|
69
|
+
/**
 * Produce an undirected version of a graph without touching the input.
 *
 * A graph that is not directed is simply copied. A directed graph is rebuilt
 * into a fresh undirected graph carrying the same graph attributes, nodes
 * and edges.
 *
 * @param {Graph} G - source graph.
 * @returns {Graph} a new undirected graph.
 */
function toUndirectedGraph(G) {
  if (G.type !== "directed") {
    return G.copy();
  }

  const undirected = createGraph(false);

  const graphAttrs = G.getAttributes();
  for (const name of Object.keys(graphAttrs)) {
    undirected.setAttribute(name, graphAttrs[name]);
  }

  G.forEachNode((id, nodeAttrs) => {
    undirected.mergeNode(id, nodeAttrs);
  });

  G.forEachEdge((_key, edgeAttrs, from, to) => {
    if (undirected.hasNode(from) && undirected.hasNode(to)) {
      try {
        undirected.mergeEdge(from, to, edgeAttrs);
      } catch {
        // Best effort: an edge the undirected graph rejects is skipped.
      }
    }
  });

  return undirected;
}
|
|
87
|
+
/**
 * Invoke `callback` for every neighbor a traversal may step to from `node`.
 *
 * On a directed graph only outbound neighbors are visited; on any other
 * graph all neighbors are visited.
 *
 * @param {Graph} G - graph being traversed.
 * @param {string} node - key of the node whose neighbors are enumerated.
 * @param {Function} callback - forwarded unchanged to the graph iterator.
 * @returns {void}
 */
function forEachTraversalNeighbor(G, node, callback) {
  if (!isDirectedGraph(G)) {
    G.forEachNeighbor(node, callback);
    return;
  }
  G.forEachOutboundNeighbor(node, callback);
}
|
|
94
|
+
/**
 * Collect the neighbors reported by `forEachTraversalNeighbor` into an array.
 *
 * @param {Graph} G - graph being traversed.
 * @param {string} node - key of the node whose neighbors are wanted.
 * @returns {string[]} neighbor keys in iteration order.
 */
function traversalNeighbors(G, node) {
  const collected = [];
  const collect = (neighborKey) => {
    collected.push(neighborKey);
  };
  forEachTraversalNeighbor(G, node, collect);
  return collected;
}
|
|
101
|
+
|
|
102
|
+
// src/cluster.ts
|
|
39
103
|
var MAX_COMMUNITY_FRACTION = 0.25;
|
|
40
104
|
var MIN_SPLIT_SIZE = 10;
|
|
41
105
|
function partition(G) {
|
|
42
|
-
const result = louvain(G);
|
|
106
|
+
const result = louvain(G.type === "directed" ? toUndirectedGraph(G) : G);
|
|
43
107
|
const map = /* @__PURE__ */ new Map();
|
|
44
108
|
for (const [node, cid] of Object.entries(result)) {
|
|
45
109
|
map.set(node, cid);
|
|
@@ -142,93 +206,435 @@ function scoreAll(G, communities) {
|
|
|
142
206
|
return result;
|
|
143
207
|
}
|
|
144
208
|
|
|
145
|
-
// src/
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
209
|
+
// src/detect.ts
|
|
210
|
+
import {
|
|
211
|
+
readdirSync,
|
|
212
|
+
readFileSync,
|
|
213
|
+
writeFileSync,
|
|
214
|
+
statSync,
|
|
215
|
+
existsSync,
|
|
216
|
+
mkdirSync,
|
|
217
|
+
lstatSync
|
|
218
|
+
} from "fs";
|
|
219
|
+
import { join, resolve, extname, basename, relative, sep, dirname } from "path";
|
|
220
|
+
import { createHash } from "crypto";
|
|
221
|
+
var MANIFEST_PATH = "graphify-out/manifest.json";
|
|
222
|
+
var CODE_EXTENSIONS = /* @__PURE__ */ new Set([
|
|
223
|
+
".py",
|
|
224
|
+
".ts",
|
|
225
|
+
".js",
|
|
226
|
+
".jsx",
|
|
227
|
+
".tsx",
|
|
228
|
+
".go",
|
|
229
|
+
".rs",
|
|
230
|
+
".java",
|
|
231
|
+
".cpp",
|
|
232
|
+
".cc",
|
|
233
|
+
".cxx",
|
|
234
|
+
".c",
|
|
235
|
+
".h",
|
|
236
|
+
".hpp",
|
|
237
|
+
".rb",
|
|
238
|
+
".swift",
|
|
239
|
+
".kt",
|
|
240
|
+
".kts",
|
|
241
|
+
".cs",
|
|
242
|
+
".scala",
|
|
243
|
+
".php",
|
|
244
|
+
".lua",
|
|
245
|
+
".toc",
|
|
246
|
+
".zig",
|
|
247
|
+
".ps1",
|
|
248
|
+
".ex",
|
|
249
|
+
".exs",
|
|
250
|
+
".m",
|
|
251
|
+
".mm",
|
|
252
|
+
".jl"
|
|
253
|
+
]);
|
|
254
|
+
var DOC_EXTENSIONS = /* @__PURE__ */ new Set([".md", ".txt", ".rst"]);
|
|
255
|
+
var PAPER_EXTENSIONS = /* @__PURE__ */ new Set([".pdf"]);
|
|
256
|
+
var IMAGE_EXTENSIONS = /* @__PURE__ */ new Set([".png", ".jpg", ".jpeg", ".gif", ".webp", ".svg"]);
|
|
257
|
+
var OFFICE_EXTENSIONS = /* @__PURE__ */ new Set([".docx", ".xlsx"]);
|
|
258
|
+
var VIDEO_EXTENSIONS = /* @__PURE__ */ new Set([
|
|
259
|
+
".mp4",
|
|
260
|
+
".mov",
|
|
261
|
+
".webm",
|
|
262
|
+
".mkv",
|
|
263
|
+
".avi",
|
|
264
|
+
".m4v",
|
|
265
|
+
".mp3",
|
|
266
|
+
".wav",
|
|
267
|
+
".m4a",
|
|
268
|
+
".ogg"
|
|
269
|
+
]);
|
|
270
|
+
var CORPUS_WARN_THRESHOLD = 5e4;
|
|
271
|
+
var CORPUS_UPPER_THRESHOLD = 5e5;
|
|
272
|
+
var FILE_COUNT_UPPER = 200;
|
|
273
|
+
var SENSITIVE_PATTERNS = [
|
|
274
|
+
/(^|[\\/])\.(env|envrc)(\.|$)/i,
|
|
275
|
+
/\.(pem|key|p12|pfx|cert|crt|der|p8)$/i,
|
|
276
|
+
/(credential|secret|passwd|password|token|private_key)/i,
|
|
277
|
+
/(id_rsa|id_dsa|id_ecdsa|id_ed25519)(\.pub)?$/,
|
|
278
|
+
/(\.netrc|\.pgpass|\.htpasswd)$/i,
|
|
279
|
+
/(aws_credentials|gcloud_credentials|service.account)/i
|
|
280
|
+
];
|
|
281
|
+
var PAPER_SIGNALS = [
|
|
282
|
+
/\barxiv\b/i,
|
|
283
|
+
/\bdoi\s*:/i,
|
|
284
|
+
/\babstract\b/i,
|
|
285
|
+
/\bproceedings\b/i,
|
|
286
|
+
/\bjournal\b/i,
|
|
287
|
+
/\bpreprint\b/i,
|
|
288
|
+
/\\cite\{/,
|
|
289
|
+
/\[\d+\]/,
|
|
290
|
+
/\[\n\d+\n\]/,
|
|
291
|
+
/eq\.\s*\d+|equation\s+\d+/i,
|
|
292
|
+
/\d{4}\.\d{4,5}/,
|
|
293
|
+
/\bwe propose\b/i,
|
|
294
|
+
/\bliterature\b/i
|
|
295
|
+
];
|
|
296
|
+
var PAPER_SIGNAL_THRESHOLD = 3;
|
|
297
|
+
/**
 * Decide whether a path looks like it holds secrets or credentials.
 *
 * Each entry of SENSITIVE_PATTERNS is tried against the base name first and
 * then against the full path; the first hit wins.
 *
 * @param {string} filePath - path to examine.
 * @returns {boolean} true when any pattern matches.
 */
function isSensitive(filePath) {
  const fileName = basename(filePath);
  for (const pattern of SENSITIVE_PATTERNS) {
    if (pattern.test(fileName) || pattern.test(filePath)) {
      return true;
    }
  }
  return false;
}
|
|
301
|
+
function looksLikePaper(filePath) {
|
|
302
|
+
try {
|
|
303
|
+
const text = readFileSync(filePath, "utf-8").slice(0, 3e3);
|
|
304
|
+
const hits = PAPER_SIGNALS.filter((p) => p.test(text)).length;
|
|
305
|
+
return hits >= PAPER_SIGNAL_THRESHOLD;
|
|
306
|
+
} catch {
|
|
307
|
+
return false;
|
|
151
308
|
}
|
|
152
|
-
return result;
|
|
153
309
|
}
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
const
|
|
157
|
-
if (
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
310
|
+
var ASSET_DIR_MARKERS = /* @__PURE__ */ new Set([".imageset", ".xcassets", ".appiconset", ".colorset", ".launchimage"]);
|
|
311
|
+
function classifyFile(filePath) {
|
|
312
|
+
const ext = extname(filePath).toLowerCase();
|
|
313
|
+
if (CODE_EXTENSIONS.has(ext)) return "code" /* CODE */;
|
|
314
|
+
if (PAPER_EXTENSIONS.has(ext)) {
|
|
315
|
+
const parts = filePath.split(sep);
|
|
316
|
+
if (parts.some((p) => [...ASSET_DIR_MARKERS].some((m) => p.endsWith(m)))) return null;
|
|
317
|
+
return "paper" /* PAPER */;
|
|
162
318
|
}
|
|
163
|
-
if (
|
|
164
|
-
if (
|
|
165
|
-
|
|
319
|
+
if (IMAGE_EXTENSIONS.has(ext)) return "image" /* IMAGE */;
|
|
320
|
+
if (VIDEO_EXTENSIONS.has(ext)) return "video" /* VIDEO */;
|
|
321
|
+
if (DOC_EXTENSIONS.has(ext)) {
|
|
322
|
+
if (looksLikePaper(filePath)) return "paper" /* PAPER */;
|
|
323
|
+
return "document" /* DOCUMENT */;
|
|
324
|
+
}
|
|
325
|
+
if (OFFICE_EXTENSIONS.has(ext)) return "document" /* DOCUMENT */;
|
|
326
|
+
return null;
|
|
166
327
|
}
|
|
167
|
-
function
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
328
|
+
/**
 * Count whitespace-separated tokens in a file read as UTF-8.
 *
 * @param {string} filePath - file to read.
 * @returns {number} number of non-whitespace runs, or 0 when the file cannot be read.
 */
function countWords(filePath) {
  let contents;
  try {
    contents = readFileSync(filePath, "utf-8");
  } catch {
    // Unreadable files contribute nothing to the corpus size.
    return 0;
  }
  const tokens = contents.match(/\S+/g);
  return tokens === null ? 0 : tokens.length;
}
|
|
175
|
-
var
|
|
176
|
-
"
|
|
177
|
-
"
|
|
178
|
-
"
|
|
179
|
-
"
|
|
180
|
-
"
|
|
181
|
-
"
|
|
182
|
-
"
|
|
183
|
-
"
|
|
184
|
-
"
|
|
185
|
-
"
|
|
186
|
-
"
|
|
187
|
-
"
|
|
188
|
-
"
|
|
189
|
-
"
|
|
190
|
-
"
|
|
336
|
+
var SKIP_DIRS = /* @__PURE__ */ new Set([
|
|
337
|
+
"venv",
|
|
338
|
+
".venv",
|
|
339
|
+
"env",
|
|
340
|
+
".env",
|
|
341
|
+
"node_modules",
|
|
342
|
+
"__pycache__",
|
|
343
|
+
".git",
|
|
344
|
+
"dist",
|
|
345
|
+
"build",
|
|
346
|
+
"target",
|
|
347
|
+
"out",
|
|
348
|
+
"site-packages",
|
|
349
|
+
"lib64",
|
|
350
|
+
".pytest_cache",
|
|
351
|
+
".mypy_cache",
|
|
352
|
+
".ruff_cache",
|
|
353
|
+
".tox",
|
|
354
|
+
".eggs"
|
|
191
355
|
]);
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
if (PAPER_EXTENSIONS.has(ext)) return "paper";
|
|
198
|
-
if (IMAGE_EXTENSIONS.has(ext)) return "image";
|
|
199
|
-
return "doc";
|
|
200
|
-
}
|
|
201
|
-
function topLevelDir(path) {
|
|
202
|
-
return path.includes("/") ? path.split("/")[0] : path;
|
|
356
|
+
/**
 * Tell whether a directory name should be skipped during the corpus walk.
 *
 * A name is noise when it is listed in SKIP_DIRS or ends with "_venv",
 * "_env" or ".egg-info".
 *
 * @param {string} part - a single directory name (not a full path).
 * @returns {boolean} true when the directory should be skipped.
 */
function isNoiseDir(part) {
  return (
    SKIP_DIRS.has(part) ||
    part.endsWith("_venv") ||
    part.endsWith("_env") ||
    part.endsWith(".egg-info")
  );
}
|
|
204
|
-
function
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
362
|
+
function loadGraphifyignore(root) {
|
|
363
|
+
const patterns = [];
|
|
364
|
+
let current = resolve(root);
|
|
365
|
+
while (true) {
|
|
366
|
+
const ignoreFile = join(current, ".graphifyignore");
|
|
367
|
+
if (existsSync(ignoreFile)) {
|
|
368
|
+
for (let line of readFileSync(ignoreFile, "utf-8").split("\n")) {
|
|
369
|
+
line = line.trim();
|
|
370
|
+
if (line && !line.startsWith("#")) {
|
|
371
|
+
patterns.push(line);
|
|
372
|
+
}
|
|
373
|
+
}
|
|
374
|
+
}
|
|
375
|
+
if (existsSync(join(current, ".git"))) {
|
|
376
|
+
break;
|
|
377
|
+
}
|
|
378
|
+
const parent = dirname(current);
|
|
379
|
+
if (parent === current) {
|
|
380
|
+
break;
|
|
381
|
+
}
|
|
382
|
+
current = parent;
|
|
218
383
|
}
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
384
|
+
return patterns;
|
|
385
|
+
}
|
|
386
|
+
// Compiled glob patterns, keyed by the raw pattern text. The same handful of
// ignore patterns is tested against every path segment of every file, so the
// regex is built once per pattern instead of once per call.
const GLOB_REGEX_CACHE = new Map();

/**
 * Test `text` against a simple glob pattern.
 *
 * Supported wildcards: `*` (any run of characters, including "/") and `?`
 * (exactly one character). Every other regex metacharacter in the pattern is
 * matched literally. The whole text must match.
 *
 * @param {string} text - candidate string (a path or a path segment).
 * @param {string} pattern - glob pattern.
 * @returns {boolean} true when the entire text matches the pattern.
 */
function matchGlob(text, pattern) {
  let compiled = GLOB_REGEX_CACHE.get(pattern);
  if (compiled === undefined) {
    const source = pattern
      .replace(/[.+^${}()|[\]\\]/g, "\\$&")
      .replace(/\*/g, ".*")
      .replace(/\?/g, ".");
    // dotAll ("s") lets the wildcards also cover a newline inside a file name.
    compiled = new RegExp(`^${source}$`, "s");
    GLOB_REGEX_CACHE.set(pattern, compiled);
  }
  return compiled.test(text);
}
|
|
390
|
+
function isIgnored(filePath, root, patterns) {
|
|
391
|
+
if (patterns.length === 0) return false;
|
|
392
|
+
let rel;
|
|
393
|
+
try {
|
|
394
|
+
rel = relative(root, filePath).replace(/\\/g, "/");
|
|
395
|
+
} catch {
|
|
396
|
+
return false;
|
|
222
397
|
}
|
|
223
|
-
const
|
|
224
|
-
const
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
398
|
+
const parts = rel.split("/");
|
|
399
|
+
for (const pattern of patterns) {
|
|
400
|
+
const p = pattern.replace(/^\/+|\/+$/g, "");
|
|
401
|
+
if (!p) continue;
|
|
402
|
+
if (matchGlob(rel, p)) return true;
|
|
403
|
+
if (matchGlob(basename(filePath), p)) return true;
|
|
404
|
+
for (let i = 0; i < parts.length; i++) {
|
|
405
|
+
if (matchGlob(parts[i], p)) return true;
|
|
406
|
+
if (matchGlob(parts.slice(0, i + 1).join("/"), p)) return true;
|
|
407
|
+
}
|
|
228
408
|
}
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
409
|
+
return false;
|
|
410
|
+
}
|
|
411
|
+
/**
 * Recursively list the regular files below `dir`.
 *
 * Files are appended to one shared array rather than spreading each
 * subtree's result into `push(...)`, which passes every path as a call
 * argument and can throw a RangeError on very large trees.
 *
 * @param {string} dir - directory to walk.
 * @param {string} root - corpus root, forwarded to `isIgnored`.
 * @param {string[]} ignorePatterns - ignore patterns, forwarded to `isIgnored`.
 * @param {boolean} followSymlinks - use `statSync` (follows links) instead of `lstatSync`.
 * @param {boolean} skipPrune - when true, no directory is pruned (dot-dirs, noise dirs and ignored dirs are all entered).
 * @returns {string[]} file paths in depth-first, directory-listing order.
 */
function walkDir(dir, root, ignorePatterns, followSymlinks, skipPrune) {
  const result = [];

  const visit = (current) => {
    let entries;
    try {
      entries = readdirSync(current);
    } catch {
      // Unreadable directory: contributes no files.
      return;
    }
    for (const entry of entries) {
      const full = join(current, entry);
      let stat;
      try {
        stat = followSymlinks ? statSync(full) : lstatSync(full);
      } catch {
        // Entry vanished or is a broken link: skip it.
        continue;
      }
      if (stat.isDirectory()) {
        if (!skipPrune) {
          if (entry.startsWith(".")) continue;
          if (isNoiseDir(entry)) continue;
          if (isIgnored(full, root, ignorePatterns)) continue;
        }
        visit(full);
      } else if (stat.isFile()) {
        result.push(full);
      }
    }
  };

  visit(dir);
  return result;
}
|
|
440
|
+
/**
 * Scan a corpus root and bucket its files by type.
 *
 * Walks `root` with pruning enabled, then additionally walks
 * `graphify-out/memory` without pruning when it exists. Each unique path is
 * filtered (dotfiles and `graphify-out/converted` outside the memory dir,
 * ignore patterns, sensitive names), classified, and counted.
 *
 * @param {string} root - corpus root directory.
 * @param {{followSymlinks?: boolean}} [options] - `followSymlinks` defaults to false.
 * @returns {object} `files` (paths per type), `total_files`, `total_words`,
 *   `needs_graph`, `warning` (string or null), `skipped_sensitive`,
 *   `graphifyignore_patterns` (pattern count).
 */
function detect(root, options) {
  const followSymlinks = options?.followSymlinks ?? false;
  const rootResolved = resolve(root);
  const ignorePatterns = loadGraphifyignore(rootResolved);
  const convertedDir = join(rootResolved, "graphify-out", "converted");
  const memoryDir = join(rootResolved, "graphify-out", "memory");
  // Separator-aware containment so a sibling such as "graphify-out/memory-old"
  // is not mistaken for the memory directory itself.
  const isInside = (filePath, dirPath) => filePath.startsWith(dirPath + sep);
  const files = {
    code: [],
    document: [],
    paper: [],
    image: [],
    video: []
  };
  let totalWords = 0;
  const skippedSensitive = [];
  const allFiles = walkDir(rootResolved, rootResolved, ignorePatterns, followSymlinks, false);
  // Checked once up front instead of once per file inside the loop.
  const hasMemoryDir = existsSync(memoryDir);
  if (hasMemoryDir) {
    for (const memoryFile of walkDir(memoryDir, rootResolved, ignorePatterns, followSymlinks, true)) {
      allFiles.push(memoryFile);
    }
  }
  // The memory dir can be reached by both walks, so paths are de-duplicated.
  const seen = /* @__PURE__ */ new Set();
  for (const p of allFiles) {
    if (seen.has(p)) continue;
    seen.add(p);
    const inMemory = hasMemoryDir && isInside(p, memoryDir);
    if (!inMemory) {
      if (basename(p).startsWith(".")) continue;
      if (isInside(p, convertedDir)) continue;
    }
    if (isIgnored(p, rootResolved, ignorePatterns)) continue;
    if (isSensitive(p)) {
      skippedSensitive.push(p);
      continue;
    }
    const ftype = classifyFile(p);
    if (!ftype) continue;
    if (OFFICE_EXTENSIONS.has(extname(p).toLowerCase())) {
      // Office files are reported through the skipped list with an explanatory suffix.
      skippedSensitive.push(p + " [office conversion requires async - use pipeline]");
      continue;
    }
    files[ftype].push(p);
    if (ftype !== "video" /* VIDEO */) {
      totalWords += countWords(p);
    }
  }
  const totalFiles = Object.values(files).reduce((s, v) => s + v.length, 0);
  const needsGraph = totalWords >= CORPUS_WARN_THRESHOLD;
  let warning = null;
  if (!needsGraph) {
    warning = `Corpus is ~${totalWords.toLocaleString()} words - fits in a single context window. You may not need a graph.`;
  } else if (totalWords >= CORPUS_UPPER_THRESHOLD || totalFiles >= FILE_COUNT_UPPER) {
    warning = `Large corpus: ${totalFiles} files \xB7 ~${totalWords.toLocaleString()} words. Semantic extraction will be expensive (many Claude tokens). Consider running on a subfolder, or use --no-semantic to run AST-only.`;
  }
  return {
    files,
    total_files: totalFiles,
    total_words: totalWords,
    needs_graph: needsGraph,
    warning,
    skipped_sensitive: skippedSensitive,
    graphifyignore_patterns: ignorePatterns.length
  };
}
|
|
502
|
+
/**
 * Load the path → mtime manifest written by `saveManifest`.
 *
 * @param {string} [manifestPath=MANIFEST_PATH] - manifest file location.
 * @returns {Object<string, number>} the parsed manifest, or an empty object
 *   when the file is missing, unreadable, not valid JSON, or valid JSON that
 *   is not a plain object (e.g. `null` or an array), so callers can always
 *   use `Object.keys` on the result.
 */
function loadManifest(manifestPath = MANIFEST_PATH) {
  let parsed;
  try {
    parsed = JSON.parse(readFileSync(manifestPath, "utf-8"));
  } catch {
    return {};
  }
  const isRecord = typeof parsed === "object" && parsed !== null && !Array.isArray(parsed);
  return isRecord ? parsed : {};
}
|
|
509
|
+
/**
 * Persist the modification time of every listed file as a JSON manifest.
 *
 * @param {Object<string, string[]>} files - file paths grouped by type; only the paths are used.
 * @param {string} [manifestPath=MANIFEST_PATH] - where to write the manifest; its parent directory is created if needed.
 * @returns {void}
 */
function saveManifest(files, manifestPath = MANIFEST_PATH) {
  const mtimeByPath = {};
  Object.keys(files).forEach((category) => {
    for (const filePath of files[category]) {
      let info;
      try {
        info = statSync(filePath);
      } catch {
        // A file that cannot be stat'ed is simply left out of the manifest.
        continue;
      }
      mtimeByPath[filePath] = info.mtimeMs;
    }
  });

  const parentDir = join(manifestPath, "..");
  mkdirSync(parentDir, { recursive: true });
  const serialized = JSON.stringify(mtimeByPath, null, 2);
  writeFileSync(manifestPath, serialized);
}
|
|
523
|
+
/**
 * Run `detect` and split its files into new/changed versus unchanged,
 * relative to the manifest from a previous run.
 *
 * A file counts as new when it has no manifest entry or its current mtime is
 * greater than the stored one. With an empty manifest every file is new.
 * Both branches return the same keys, so `deleted_files` is always an array.
 *
 * @param {string} root - corpus root directory.
 * @param {string} [manifestPath=MANIFEST_PATH] - manifest file location.
 * @returns {object} the `detect` result plus `incremental`, `new_files`,
 *   `unchanged_files`, `new_total` and `deleted_files`.
 */
function detectIncremental(root, manifestPath = MANIFEST_PATH) {
  const full = detect(root);
  const manifest = loadManifest(manifestPath);
  const emptyBuckets = () => Object.fromEntries(Object.keys(full.files).map((k) => [k, []]));

  if (Object.keys(manifest).length === 0) {
    return {
      ...full,
      incremental: true,
      new_files: full.files,
      unchanged_files: emptyBuckets(),
      new_total: full.total_files,
      // Nothing was tracked before, so nothing can have been deleted.
      deleted_files: []
    };
  }

  const newFiles = emptyBuckets();
  const unchangedFiles = emptyBuckets();
  const currentFiles = /* @__PURE__ */ new Set();
  for (const [ftype, fileList] of Object.entries(full.files)) {
    for (const f of fileList) {
      currentFiles.add(f);
      const storedMtime = manifest[f];
      let currentMtime = 0;
      try {
        currentMtime = statSync(f).mtimeMs;
      } catch {
        // Stat failure leaves the mtime at 0, so a tracked file reads as unchanged.
      }
      if (storedMtime === void 0 || currentMtime > storedMtime) {
        newFiles[ftype].push(f);
      } else {
        unchangedFiles[ftype].push(f);
      }
    }
  }

  const deletedFiles = Object.keys(manifest).filter((f) => !currentFiles.has(f));
  const newTotal = Object.values(newFiles).reduce((s, v) => s + v.length, 0);
  return {
    ...full,
    incremental: true,
    new_files: newFiles,
    unchanged_files: unchangedFiles,
    new_total: newTotal,
    deleted_files: deletedFiles
  };
}
|
|
568
|
+
|
|
569
|
+
// src/analyze.ts
|
|
570
|
+
/**
 * Invert a community → members mapping into a node → community lookup.
 *
 * @param {*} communities - community data in whatever shape `toNumericMap` accepts.
 * @returns {Map} map from node key to the id of the community listing it;
 *   a node listed more than once keeps the last id seen.
 */
function nodeCommunityMap(communities) {
  const membership = /* @__PURE__ */ new Map();
  for (const [communityId, members] of toNumericMap(communities)) {
    for (const member of members) {
      membership.set(member, communityId);
    }
  }
  return membership;
}
|
|
578
|
+
/**
 * Heuristically flag nodes whose label is a file name or a bare method stub.
 *
 * Returns true when the label equals the base name of `source_file`, when
 * the label looks like ".name()", or when it ends in "()" and the node has
 * degree at most 1. The base name is taken after the last "/" or "\" so
 * Windows-style `source_file` values are recognised as well.
 *
 * @param {Graph} G - graph providing `getNodeAttributes` and `degree`.
 * @param {string} nodeId - key of the node to test.
 * @returns {boolean} true when one of the rules above applies.
 */
function isFileNode(G, nodeId) {
  const attrs = G.getNodeAttributes(nodeId);
  const label = attrs.label ?? "";
  if (!label) return false;

  const sourceFile = attrs.source_file ?? "";
  if (sourceFile && label === sourceFile.split(/[\\/]/).pop()) return true;

  // Both remaining rules require a call-like label.
  if (!label.endsWith("()")) return false;
  return label.startsWith(".") || G.degree(nodeId) <= 1;
}
|
|
591
|
+
/**
 * Flag nodes that do not appear to originate from a concrete file.
 *
 * A node counts as a concept when it has no `source_file`, or when the last
 * path segment of `source_file` contains no "." (no extension). Segments are
 * split on both "/" and "\" so Windows-style paths are judged by their final
 * component rather than by the whole string.
 *
 * @param {Graph} G - graph providing `getNodeAttributes`.
 * @param {string} nodeId - key of the node to test.
 * @returns {boolean} true when the node looks like a concept.
 */
function isConceptNode(G, nodeId) {
  const source = G.getNodeAttributes(nodeId).source_file ?? "";
  if (!source) return true;
  const lastPart = source.split(/[\\/]/).pop();
  return !lastPart.includes(".");
}
|
|
599
|
+
/**
 * Map a path to a coarse category based on its extension.
 *
 * The extension is the lower-cased text after the last "." in the path (or
 * empty when the path has no "."). Anything that is not a code, paper or
 * image extension, including the document extensions, is reported as "doc".
 *
 * @param {string} path - file path or name.
 * @returns {"code"|"paper"|"image"|"doc"} the category.
 */
function fileCategory(path) {
  let ext = "";
  if (path.includes(".")) {
    const tail = path.split(".").pop();
    ext = `.${tail?.toLowerCase() ?? ""}`;
  }

  const categories = [
    [CODE_EXTENSIONS, "code"],
    [PAPER_EXTENSIONS, "paper"],
    [IMAGE_EXTENSIONS, "image"]
  ];
  for (const [extensions, category] of categories) {
    if (extensions.has(ext)) return category;
  }
  return "doc";
}
|
|
607
|
+
/**
 * Return the first segment of a path.
 *
 * Segments are split on "/" or "\" so backslash-separated paths yield their
 * leading directory instead of the whole string. A path with no separator is
 * returned unchanged.
 *
 * @param {string} path - relative path.
 * @returns {string} the text before the first separator.
 */
function topLevelDir(path) {
  return path.split(/[\\/]/)[0];
}
|
|
610
|
+
function surpriseScore(G, u, v, data, nodeCommunity, uSource, vSource) {
|
|
611
|
+
let score = 0;
|
|
612
|
+
const reasons = [];
|
|
613
|
+
const conf = data.confidence ?? "EXTRACTED";
|
|
614
|
+
const confBonus = { AMBIGUOUS: 3, INFERRED: 2, EXTRACTED: 1 };
|
|
615
|
+
score += confBonus[conf] ?? 1;
|
|
616
|
+
if (conf === "AMBIGUOUS" || conf === "INFERRED") {
|
|
617
|
+
reasons.push(`${conf.toLowerCase()} connection - not explicitly stated in source`);
|
|
618
|
+
}
|
|
619
|
+
const catU = fileCategory(uSource);
|
|
620
|
+
const catV = fileCategory(vSource);
|
|
621
|
+
if (catU !== catV) {
|
|
622
|
+
score += 2;
|
|
623
|
+
reasons.push(`crosses file types (${catU} \u2194 ${catV})`);
|
|
624
|
+
}
|
|
625
|
+
if (topLevelDir(uSource) !== topLevelDir(vSource)) {
|
|
626
|
+
score += 2;
|
|
627
|
+
reasons.push("connects across different repos/directories");
|
|
628
|
+
}
|
|
629
|
+
const cidU = nodeCommunity.get(u);
|
|
630
|
+
const cidV = nodeCommunity.get(v);
|
|
631
|
+
if (cidU !== void 0 && cidV !== void 0 && cidU !== cidV) {
|
|
632
|
+
score += 1;
|
|
633
|
+
reasons.push("bridges separate communities");
|
|
634
|
+
}
|
|
635
|
+
if (data.relation === "semantically_similar_to") {
|
|
636
|
+
score = Math.floor(score * 1.5);
|
|
637
|
+
reasons.push("semantically similar concepts with no structural link");
|
|
232
638
|
}
|
|
233
639
|
const degU = G.degree(u);
|
|
234
640
|
const degV = G.degree(v);
|
|
@@ -389,10 +795,10 @@ function suggestQuestions(G, communities, communityLabels, topN = 7) {
|
|
|
389
795
|
const cid = nodeCommunity.get(nodeId);
|
|
390
796
|
const commLabel = cid !== void 0 ? labelMap.get(cid) ?? `Community ${cid}` : "unknown";
|
|
391
797
|
const neighborComms = /* @__PURE__ */ new Set();
|
|
392
|
-
G
|
|
798
|
+
for (const n of traversalNeighbors(G, nodeId)) {
|
|
393
799
|
const nc = nodeCommunity.get(n);
|
|
394
800
|
if (nc !== void 0 && nc !== cid) neighborComms.add(nc);
|
|
395
|
-
}
|
|
801
|
+
}
|
|
396
802
|
if (neighborComms.size > 0) {
|
|
397
803
|
const otherLabels = [...neighborComms].map((c) => labelMap.get(c) ?? `Community ${c}`);
|
|
398
804
|
questions.push({
|
|
@@ -523,8 +929,7 @@ function graphDiff(GOld, GNew) {
|
|
|
523
929
|
}
|
|
524
930
|
|
|
525
931
|
// src/benchmark.ts
|
|
526
|
-
import { readFileSync, existsSync } from "fs";
|
|
527
|
-
import Graph from "graphology";
|
|
932
|
+
import { readFileSync as readFileSync2, existsSync as existsSync2 } from "fs";
|
|
528
933
|
var CHARS_PER_TOKEN = 4;
|
|
529
934
|
function estimateTokens(text) {
|
|
530
935
|
return Math.max(1, Math.floor(text.length / CHARS_PER_TOKEN));
|
|
@@ -546,7 +951,7 @@ function querySubgraphTokens(G, question, depth = 3) {
|
|
|
546
951
|
for (let d = 0; d < depth; d++) {
|
|
547
952
|
const nextFrontier = /* @__PURE__ */ new Set();
|
|
548
953
|
for (const n of frontier) {
|
|
549
|
-
G
|
|
954
|
+
forEachTraversalNeighbor(G, n, (neighbor) => {
|
|
550
955
|
if (!visited.has(neighbor)) {
|
|
551
956
|
nextFrontier.add(neighbor);
|
|
552
957
|
edgesSeen.push([n, neighbor]);
|
|
@@ -580,26 +985,12 @@ var SAMPLE_QUESTIONS = [
|
|
|
580
985
|
"what are the core abstractions"
|
|
581
986
|
];
|
|
582
987
|
function loadGraph(graphPath) {
|
|
583
|
-
const raw = JSON.parse(
|
|
584
|
-
|
|
585
|
-
for (const node of raw.nodes ?? []) {
|
|
586
|
-
const { id, ...attrs } = node;
|
|
587
|
-
G.mergeNode(id, attrs);
|
|
588
|
-
}
|
|
589
|
-
for (const link of raw.links ?? []) {
|
|
590
|
-
const { source, target, ...attrs } = link;
|
|
591
|
-
if (G.hasNode(source) && G.hasNode(target)) {
|
|
592
|
-
try {
|
|
593
|
-
G.mergeEdge(source, target, attrs);
|
|
594
|
-
} catch {
|
|
595
|
-
}
|
|
596
|
-
}
|
|
597
|
-
}
|
|
598
|
-
return G;
|
|
988
|
+
const raw = JSON.parse(readFileSync2(graphPath, "utf-8"));
|
|
989
|
+
return loadGraphFromData(raw);
|
|
599
990
|
}
|
|
600
991
|
function runBenchmark(graphPath = "graphify-out/graph.json", corpusWordsOrOptions, questions) {
|
|
601
992
|
const options = typeof corpusWordsOrOptions === "number" ? { corpusWords: corpusWordsOrOptions, questions } : corpusWordsOrOptions ?? {};
|
|
602
|
-
if (!
|
|
993
|
+
if (!existsSync2(graphPath)) {
|
|
603
994
|
return { error: `Graph file not found: ${graphPath}. Build the graph first.` };
|
|
604
995
|
}
|
|
605
996
|
const G = loadGraph(graphPath);
|
|
@@ -658,21 +1049,33 @@ graphify token reduction benchmark`);
|
|
|
658
1049
|
}
|
|
659
1050
|
|
|
660
1051
|
// src/cache.ts
|
|
661
|
-
import { createHash } from "crypto";
|
|
662
|
-
import { readFileSync as
|
|
663
|
-
import { join, resolve } from "path";
|
|
1052
|
+
import { createHash as createHash2 } from "crypto";
|
|
1053
|
+
import { readFileSync as readFileSync3, writeFileSync as writeFileSync2, mkdirSync as mkdirSync2, readdirSync as readdirSync2, unlinkSync, renameSync, existsSync as existsSync3 } from "fs";
|
|
1054
|
+
import { extname as extname2, join as join2, resolve as resolve2 } from "path";
|
|
1055
|
+
/**
 * Strip a leading "---" front-matter section from file content.
 *
 * When the text starts with "---" and a later "\n---" is found, everything
 * after that closing marker is returned as a new Buffer. Otherwise the
 * original `content` is returned untouched.
 *
 * @param {Buffer} content - raw file bytes.
 * @returns {Buffer} the content without its front matter, or `content` itself.
 */
function bodyContent(content) {
  const decoded = content.toString("utf-8");
  // Search from offset 3 so the opening marker is not taken as the closing one.
  const closing = decoded.startsWith("---") ? decoded.indexOf("\n---", 3) : -1;
  return closing === -1
    ? content
    : Buffer.from(decoded.slice(closing + 4), "utf-8");
}
|
|
664
1066
|
function fileHash(filePath) {
|
|
665
|
-
const
|
|
666
|
-
const
|
|
667
|
-
const
|
|
1067
|
+
const raw = readFileSync3(filePath);
|
|
1068
|
+
const content = extname2(filePath).toLowerCase() === ".md" ? bodyContent(raw) : raw;
|
|
1069
|
+
const resolved = resolve2(filePath);
|
|
1070
|
+
const h = createHash2("sha256");
|
|
668
1071
|
h.update(content);
|
|
669
1072
|
h.update("\0");
|
|
670
1073
|
h.update(resolved);
|
|
671
1074
|
return h.digest("hex");
|
|
672
1075
|
}
|
|
673
1076
|
function cacheDir(root = ".") {
|
|
674
|
-
const d =
|
|
675
|
-
|
|
1077
|
+
const d = join2(root, "graphify-out", "cache");
|
|
1078
|
+
mkdirSync2(d, { recursive: true });
|
|
676
1079
|
return d;
|
|
677
1080
|
}
|
|
678
1081
|
function loadCached(filePath, root = ".") {
|
|
@@ -682,20 +1085,20 @@ function loadCached(filePath, root = ".") {
|
|
|
682
1085
|
} catch {
|
|
683
1086
|
return null;
|
|
684
1087
|
}
|
|
685
|
-
const entry =
|
|
686
|
-
if (!
|
|
1088
|
+
const entry = join2(cacheDir(root), `${h}.json`);
|
|
1089
|
+
if (!existsSync3(entry)) return null;
|
|
687
1090
|
try {
|
|
688
|
-
return JSON.parse(
|
|
1091
|
+
return JSON.parse(readFileSync3(entry, "utf-8"));
|
|
689
1092
|
} catch {
|
|
690
1093
|
return null;
|
|
691
1094
|
}
|
|
692
1095
|
}
|
|
693
1096
|
function saveCached(filePath, result, root = ".") {
|
|
694
1097
|
const h = fileHash(filePath);
|
|
695
|
-
const entry =
|
|
1098
|
+
const entry = join2(cacheDir(root), `${h}.json`);
|
|
696
1099
|
const tmp = entry + ".tmp";
|
|
697
1100
|
try {
|
|
698
|
-
|
|
1101
|
+
writeFileSync2(tmp, JSON.stringify(result));
|
|
699
1102
|
renameSync(tmp, entry);
|
|
700
1103
|
} catch {
|
|
701
1104
|
try {
|
|
@@ -745,8 +1148,8 @@ function saveSemanticCache(nodes, edges, hyperedges = null, root = ".") {
|
|
|
745
1148
|
}
|
|
746
1149
|
let saved = 0;
|
|
747
1150
|
for (const [fpath, result] of byFile) {
|
|
748
|
-
const p =
|
|
749
|
-
if (
|
|
1151
|
+
const p = resolve2(root, fpath);
|
|
1152
|
+
if (existsSync3(p)) {
|
|
750
1153
|
saveCached(p, result, root);
|
|
751
1154
|
saved++;
|
|
752
1155
|
}
|
|
@@ -754,9 +1157,6 @@ function saveSemanticCache(nodes, edges, hyperedges = null, root = ".") {
|
|
|
754
1157
|
return saved;
|
|
755
1158
|
}
|
|
756
1159
|
|
|
757
|
-
// src/build.ts
|
|
758
|
-
import Graph2 from "graphology";
|
|
759
|
-
|
|
760
1160
|
// src/validate.ts
|
|
761
1161
|
var VALID_FILE_TYPES = /* @__PURE__ */ new Set(["code", "document", "paper", "image", "rationale"]);
|
|
762
1162
|
var VALID_CONFIDENCES = /* @__PURE__ */ new Set(["EXTRACTED", "INFERRED", "AMBIGUOUS"]);
|
|
@@ -784,417 +1184,85 @@ function validateExtraction(data) {
|
|
|
784
1184
|
errors.push(
|
|
785
1185
|
`Node ${i} (id=${JSON.stringify(node.id ?? "?")}) missing required field '${field}'`
|
|
786
1186
|
);
|
|
787
|
-
}
|
|
788
|
-
}
|
|
789
|
-
if ("file_type" in node && !VALID_FILE_TYPES.has(node.file_type)) {
|
|
790
|
-
errors.push(
|
|
791
|
-
`Node ${i} (id=${JSON.stringify(node.id ?? "?")}) has invalid file_type '${node.file_type}' - must be one of ${JSON.stringify([...VALID_FILE_TYPES].sort())}`
|
|
792
|
-
);
|
|
793
|
-
}
|
|
794
|
-
}
|
|
795
|
-
}
|
|
796
|
-
if (!("edges" in d)) {
|
|
797
|
-
errors.push("Missing required key 'edges'");
|
|
798
|
-
} else if (!Array.isArray(d.edges)) {
|
|
799
|
-
errors.push("'edges' must be a list");
|
|
800
|
-
} else {
|
|
801
|
-
const nodeIds = /* @__PURE__ */ new Set();
|
|
802
|
-
if (Array.isArray(d.nodes)) {
|
|
803
|
-
for (const n of d.nodes) {
|
|
804
|
-
if (typeof n === "object" && n !== null && "id" in n) {
|
|
805
|
-
nodeIds.add(n.id);
|
|
806
|
-
}
|
|
807
|
-
}
|
|
808
|
-
}
|
|
809
|
-
for (let i = 0; i < d.edges.length; i++) {
|
|
810
|
-
const edge = d.edges[i];
|
|
811
|
-
if (typeof edge !== "object" || edge === null || Array.isArray(edge)) {
|
|
812
|
-
errors.push(`Edge ${i} must be an object`);
|
|
813
|
-
continue;
|
|
814
|
-
}
|
|
815
|
-
for (const field of REQUIRED_EDGE_FIELDS) {
|
|
816
|
-
if (!(field in edge)) {
|
|
817
|
-
errors.push(`Edge ${i} missing required field '${field}'`);
|
|
818
|
-
}
|
|
819
|
-
}
|
|
820
|
-
if ("confidence" in edge && !VALID_CONFIDENCES.has(edge.confidence)) {
|
|
821
|
-
errors.push(
|
|
822
|
-
`Edge ${i} has invalid confidence '${edge.confidence}' - must be one of ${JSON.stringify([...VALID_CONFIDENCES].sort())}`
|
|
823
|
-
);
|
|
824
|
-
}
|
|
825
|
-
if ("source" in edge && nodeIds.size > 0 && !nodeIds.has(edge.source)) {
|
|
826
|
-
errors.push(`Edge ${i} source '${edge.source}' does not match any node id`);
|
|
827
|
-
}
|
|
828
|
-
if ("target" in edge && nodeIds.size > 0 && !nodeIds.has(edge.target)) {
|
|
829
|
-
errors.push(`Edge ${i} target '${edge.target}' does not match any node id`);
|
|
830
|
-
}
|
|
831
|
-
}
|
|
832
|
-
}
|
|
833
|
-
return errors;
|
|
834
|
-
}
|
|
835
|
-
|
|
836
|
-
// src/build.ts
|
|
837
|
-
function buildFromJson(extraction) {
|
|
838
|
-
const errors = validateExtraction(extraction);
|
|
839
|
-
const realErrors = errors.filter((e) => !e.includes("does not match any node id"));
|
|
840
|
-
if (realErrors.length > 0) {
|
|
841
|
-
console.error(
|
|
842
|
-
`[graphify] Extraction warning (${realErrors.length} issues): ${realErrors[0]}`
|
|
843
|
-
);
|
|
844
|
-
}
|
|
845
|
-
const G = new Graph2({ type: "undirected", multi: false });
|
|
846
|
-
for (const node of extraction.nodes ?? []) {
|
|
847
|
-
const { id, ...attrs } = node;
|
|
848
|
-
G.mergeNode(id, attrs);
|
|
849
|
-
}
|
|
850
|
-
const nodeSet = new Set(G.nodes());
|
|
851
|
-
for (const edge of extraction.edges ?? []) {
|
|
852
|
-
const { source, target, ...attrs } = edge;
|
|
853
|
-
if (!nodeSet.has(source) || !nodeSet.has(target)) continue;
|
|
854
|
-
attrs._src = source;
|
|
855
|
-
attrs._tgt = target;
|
|
856
|
-
try {
|
|
857
|
-
G.mergeEdge(source, target, attrs);
|
|
858
|
-
} catch {
|
|
859
|
-
}
|
|
860
|
-
}
|
|
861
|
-
const hyperedges = extraction.hyperedges ?? [];
|
|
862
|
-
if (hyperedges.length > 0) {
|
|
863
|
-
G.setAttribute("hyperedges", hyperedges);
|
|
864
|
-
}
|
|
865
|
-
return G;
|
|
866
|
-
}
|
|
867
|
-
|
|
868
|
-
// src/detect.ts
|
|
869
|
-
import {
|
|
870
|
-
readdirSync as readdirSync2,
|
|
871
|
-
readFileSync as readFileSync3,
|
|
872
|
-
writeFileSync as writeFileSync2,
|
|
873
|
-
statSync,
|
|
874
|
-
existsSync as existsSync3,
|
|
875
|
-
mkdirSync as mkdirSync2,
|
|
876
|
-
lstatSync
|
|
877
|
-
} from "fs";
|
|
878
|
-
import { join as join2, resolve as resolve2, extname, basename, relative, sep } from "path";
|
|
879
|
-
import { createHash as createHash2 } from "crypto";
|
|
880
|
-
var MANIFEST_PATH = "graphify-out/manifest.json";
|
|
881
|
-
var CODE_EXTENSIONS2 = /* @__PURE__ */ new Set([
|
|
882
|
-
".py",
|
|
883
|
-
".ts",
|
|
884
|
-
".js",
|
|
885
|
-
".jsx",
|
|
886
|
-
".tsx",
|
|
887
|
-
".go",
|
|
888
|
-
".rs",
|
|
889
|
-
".java",
|
|
890
|
-
".cpp",
|
|
891
|
-
".cc",
|
|
892
|
-
".cxx",
|
|
893
|
-
".c",
|
|
894
|
-
".h",
|
|
895
|
-
".hpp",
|
|
896
|
-
".rb",
|
|
897
|
-
".swift",
|
|
898
|
-
".kt",
|
|
899
|
-
".kts",
|
|
900
|
-
".cs",
|
|
901
|
-
".scala",
|
|
902
|
-
".php",
|
|
903
|
-
".lua",
|
|
904
|
-
".toc",
|
|
905
|
-
".zig",
|
|
906
|
-
".ps1",
|
|
907
|
-
".ex",
|
|
908
|
-
".exs",
|
|
909
|
-
".m",
|
|
910
|
-
".mm",
|
|
911
|
-
".jl"
|
|
912
|
-
]);
|
|
913
|
-
var DOC_EXTENSIONS = /* @__PURE__ */ new Set([".md", ".txt", ".rst"]);
|
|
914
|
-
var PAPER_EXTENSIONS2 = /* @__PURE__ */ new Set([".pdf"]);
|
|
915
|
-
var IMAGE_EXTENSIONS2 = /* @__PURE__ */ new Set([".png", ".jpg", ".jpeg", ".gif", ".webp", ".svg"]);
|
|
916
|
-
var OFFICE_EXTENSIONS = /* @__PURE__ */ new Set([".docx", ".xlsx"]);
|
|
917
|
-
var CORPUS_WARN_THRESHOLD = 5e4;
|
|
918
|
-
var CORPUS_UPPER_THRESHOLD = 5e5;
|
|
919
|
-
var FILE_COUNT_UPPER = 200;
|
|
920
|
-
var SENSITIVE_PATTERNS = [
|
|
921
|
-
/(^|[\\/])\.(env|envrc)(\.|$)/i,
|
|
922
|
-
/\.(pem|key|p12|pfx|cert|crt|der|p8)$/i,
|
|
923
|
-
/(credential|secret|passwd|password|token|private_key)/i,
|
|
924
|
-
/(id_rsa|id_dsa|id_ecdsa|id_ed25519)(\.pub)?$/,
|
|
925
|
-
/(\.netrc|\.pgpass|\.htpasswd)$/i,
|
|
926
|
-
/(aws_credentials|gcloud_credentials|service.account)/i
|
|
927
|
-
];
|
|
928
|
-
var PAPER_SIGNALS = [
|
|
929
|
-
/\barxiv\b/i,
|
|
930
|
-
/\bdoi\s*:/i,
|
|
931
|
-
/\babstract\b/i,
|
|
932
|
-
/\bproceedings\b/i,
|
|
933
|
-
/\bjournal\b/i,
|
|
934
|
-
/\bpreprint\b/i,
|
|
935
|
-
/\\cite\{/,
|
|
936
|
-
/\[\d+\]/,
|
|
937
|
-
/\[\n\d+\n\]/,
|
|
938
|
-
/eq\.\s*\d+|equation\s+\d+/i,
|
|
939
|
-
/\d{4}\.\d{4,5}/,
|
|
940
|
-
/\bwe propose\b/i,
|
|
941
|
-
/\bliterature\b/i
|
|
942
|
-
];
|
|
943
|
-
var PAPER_SIGNAL_THRESHOLD = 3;
|
|
944
|
-
function isSensitive(filePath) {
|
|
945
|
-
const name = basename(filePath);
|
|
946
|
-
return SENSITIVE_PATTERNS.some((p) => p.test(name) || p.test(filePath));
|
|
947
|
-
}
|
|
948
|
-
function looksLikePaper(filePath) {
|
|
949
|
-
try {
|
|
950
|
-
const text = readFileSync3(filePath, "utf-8").slice(0, 3e3);
|
|
951
|
-
const hits = PAPER_SIGNALS.filter((p) => p.test(text)).length;
|
|
952
|
-
return hits >= PAPER_SIGNAL_THRESHOLD;
|
|
953
|
-
} catch {
|
|
954
|
-
return false;
|
|
955
|
-
}
|
|
956
|
-
}
|
|
957
|
-
var ASSET_DIR_MARKERS = /* @__PURE__ */ new Set([".imageset", ".xcassets", ".appiconset", ".colorset", ".launchimage"]);
|
|
958
|
-
function classifyFile(filePath) {
|
|
959
|
-
const ext = extname(filePath).toLowerCase();
|
|
960
|
-
if (CODE_EXTENSIONS2.has(ext)) return "code" /* CODE */;
|
|
961
|
-
if (PAPER_EXTENSIONS2.has(ext)) {
|
|
962
|
-
const parts = filePath.split(sep);
|
|
963
|
-
if (parts.some((p) => [...ASSET_DIR_MARKERS].some((m) => p.endsWith(m)))) return null;
|
|
964
|
-
return "paper" /* PAPER */;
|
|
965
|
-
}
|
|
966
|
-
if (IMAGE_EXTENSIONS2.has(ext)) return "image" /* IMAGE */;
|
|
967
|
-
if (DOC_EXTENSIONS.has(ext)) {
|
|
968
|
-
if (looksLikePaper(filePath)) return "paper" /* PAPER */;
|
|
969
|
-
return "document" /* DOCUMENT */;
|
|
970
|
-
}
|
|
971
|
-
if (OFFICE_EXTENSIONS.has(ext)) return "document" /* DOCUMENT */;
|
|
972
|
-
return null;
|
|
973
|
-
}
|
|
974
|
-
function countWords(filePath) {
|
|
975
|
-
try {
|
|
976
|
-
const text = readFileSync3(filePath, "utf-8");
|
|
977
|
-
return text.split(/\s+/).filter(Boolean).length;
|
|
978
|
-
} catch {
|
|
979
|
-
return 0;
|
|
980
|
-
}
|
|
981
|
-
}
|
|
982
|
-
var SKIP_DIRS = /* @__PURE__ */ new Set([
|
|
983
|
-
"venv",
|
|
984
|
-
".venv",
|
|
985
|
-
"env",
|
|
986
|
-
".env",
|
|
987
|
-
"node_modules",
|
|
988
|
-
"__pycache__",
|
|
989
|
-
".git",
|
|
990
|
-
"dist",
|
|
991
|
-
"build",
|
|
992
|
-
"target",
|
|
993
|
-
"out",
|
|
994
|
-
"site-packages",
|
|
995
|
-
"lib64",
|
|
996
|
-
".pytest_cache",
|
|
997
|
-
".mypy_cache",
|
|
998
|
-
".ruff_cache",
|
|
999
|
-
".tox",
|
|
1000
|
-
".eggs"
|
|
1001
|
-
]);
|
|
1002
|
-
function isNoiseDir(part) {
|
|
1003
|
-
if (SKIP_DIRS.has(part)) return true;
|
|
1004
|
-
if (part.endsWith("_venv") || part.endsWith("_env")) return true;
|
|
1005
|
-
if (part.endsWith(".egg-info")) return true;
|
|
1006
|
-
return false;
|
|
1007
|
-
}
|
|
1008
|
-
function loadGraphifyignore(root) {
|
|
1009
|
-
const ignoreFile = join2(root, ".graphifyignore");
|
|
1010
|
-
if (!existsSync3(ignoreFile)) return [];
|
|
1011
|
-
const patterns = [];
|
|
1012
|
-
for (let line of readFileSync3(ignoreFile, "utf-8").split("\n")) {
|
|
1013
|
-
line = line.trim();
|
|
1014
|
-
if (line && !line.startsWith("#")) {
|
|
1015
|
-
patterns.push(line);
|
|
1016
|
-
}
|
|
1017
|
-
}
|
|
1018
|
-
return patterns;
|
|
1019
|
-
}
|
|
1020
|
-
function matchGlob(text, pattern) {
|
|
1021
|
-
const regex = pattern.replace(/[.+^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*").replace(/\?/g, ".");
|
|
1022
|
-
return new RegExp(`^${regex}$`).test(text);
|
|
1023
|
-
}
|
|
1024
|
-
function isIgnored(filePath, root, patterns) {
|
|
1025
|
-
if (patterns.length === 0) return false;
|
|
1026
|
-
let rel;
|
|
1027
|
-
try {
|
|
1028
|
-
rel = relative(root, filePath).replace(/\\/g, "/");
|
|
1029
|
-
} catch {
|
|
1030
|
-
return false;
|
|
1031
|
-
}
|
|
1032
|
-
const parts = rel.split("/");
|
|
1033
|
-
for (const pattern of patterns) {
|
|
1034
|
-
const p = pattern.replace(/^\/+|\/+$/g, "");
|
|
1035
|
-
if (!p) continue;
|
|
1036
|
-
if (matchGlob(rel, p)) return true;
|
|
1037
|
-
if (matchGlob(basename(filePath), p)) return true;
|
|
1038
|
-
for (let i = 0; i < parts.length; i++) {
|
|
1039
|
-
if (matchGlob(parts[i], p)) return true;
|
|
1040
|
-
if (matchGlob(parts.slice(0, i + 1).join("/"), p)) return true;
|
|
1041
|
-
}
|
|
1042
|
-
}
|
|
1043
|
-
return false;
|
|
1044
|
-
}
|
|
1045
|
-
function walkDir(dir, root, ignorePatterns, followSymlinks, skipPrune) {
|
|
1046
|
-
const result = [];
|
|
1047
|
-
let entries;
|
|
1048
|
-
try {
|
|
1049
|
-
entries = readdirSync2(dir);
|
|
1050
|
-
} catch {
|
|
1051
|
-
return result;
|
|
1052
|
-
}
|
|
1053
|
-
for (const entry of entries) {
|
|
1054
|
-
const full = join2(dir, entry);
|
|
1055
|
-
let stat;
|
|
1056
|
-
try {
|
|
1057
|
-
stat = followSymlinks ? statSync(full) : lstatSync(full);
|
|
1058
|
-
} catch {
|
|
1059
|
-
continue;
|
|
1060
|
-
}
|
|
1061
|
-
if (stat.isDirectory()) {
|
|
1062
|
-
if (!skipPrune) {
|
|
1063
|
-
if (entry.startsWith(".")) continue;
|
|
1064
|
-
if (isNoiseDir(entry)) continue;
|
|
1065
|
-
if (isIgnored(full, root, ignorePatterns)) continue;
|
|
1066
|
-
}
|
|
1067
|
-
result.push(...walkDir(full, root, ignorePatterns, followSymlinks, skipPrune));
|
|
1068
|
-
} else if (stat.isFile()) {
|
|
1069
|
-
result.push(full);
|
|
1070
|
-
}
|
|
1071
|
-
}
|
|
1072
|
-
return result;
|
|
1073
|
-
}
|
|
1074
|
-
function detect(root, options) {
|
|
1075
|
-
const followSymlinks = options?.followSymlinks ?? false;
|
|
1076
|
-
const rootResolved = resolve2(root);
|
|
1077
|
-
const ignorePatterns = loadGraphifyignore(rootResolved);
|
|
1078
|
-
const convertedDir = join2(rootResolved, "graphify-out", "converted");
|
|
1079
|
-
const memoryDir = join2(rootResolved, "graphify-out", "memory");
|
|
1080
|
-
const files = {
|
|
1081
|
-
code: [],
|
|
1082
|
-
document: [],
|
|
1083
|
-
paper: [],
|
|
1084
|
-
image: []
|
|
1085
|
-
};
|
|
1086
|
-
let totalWords = 0;
|
|
1087
|
-
const skippedSensitive = [];
|
|
1088
|
-
const allFiles = walkDir(rootResolved, rootResolved, ignorePatterns, followSymlinks, false);
|
|
1089
|
-
if (existsSync3(memoryDir)) {
|
|
1090
|
-
allFiles.push(...walkDir(memoryDir, rootResolved, ignorePatterns, followSymlinks, true));
|
|
1091
|
-
}
|
|
1092
|
-
const seen = /* @__PURE__ */ new Set();
|
|
1093
|
-
for (const p of allFiles) {
|
|
1094
|
-
if (seen.has(p)) continue;
|
|
1095
|
-
seen.add(p);
|
|
1096
|
-
const inMemory = existsSync3(memoryDir) && p.startsWith(memoryDir);
|
|
1097
|
-
if (!inMemory) {
|
|
1098
|
-
if (basename(p).startsWith(".")) continue;
|
|
1099
|
-
if (p.startsWith(convertedDir)) continue;
|
|
1100
|
-
}
|
|
1101
|
-
if (isIgnored(p, rootResolved, ignorePatterns)) continue;
|
|
1102
|
-
if (isSensitive(p)) {
|
|
1103
|
-
skippedSensitive.push(p);
|
|
1104
|
-
continue;
|
|
1105
|
-
}
|
|
1106
|
-
const ftype = classifyFile(p);
|
|
1107
|
-
if (!ftype) continue;
|
|
1108
|
-
if (OFFICE_EXTENSIONS.has(extname(p).toLowerCase())) {
|
|
1109
|
-
skippedSensitive.push(p + " [office conversion requires async - use pipeline]");
|
|
1110
|
-
continue;
|
|
1111
|
-
}
|
|
1112
|
-
files[ftype].push(p);
|
|
1113
|
-
totalWords += countWords(p);
|
|
1114
|
-
}
|
|
1115
|
-
const totalFiles = Object.values(files).reduce((s, v) => s + v.length, 0);
|
|
1116
|
-
const needsGraph = totalWords >= CORPUS_WARN_THRESHOLD;
|
|
1117
|
-
let warning = null;
|
|
1118
|
-
if (!needsGraph) {
|
|
1119
|
-
warning = `Corpus is ~${totalWords.toLocaleString()} words - fits in a single context window. You may not need a graph.`;
|
|
1120
|
-
} else if (totalWords >= CORPUS_UPPER_THRESHOLD || totalFiles >= FILE_COUNT_UPPER) {
|
|
1121
|
-
warning = `Large corpus: ${totalFiles} files \xB7 ~${totalWords.toLocaleString()} words. Semantic extraction will be expensive (many Claude tokens). Consider running on a subfolder, or use --no-semantic to run AST-only.`;
|
|
1122
|
-
}
|
|
1123
|
-
return {
|
|
1124
|
-
files,
|
|
1125
|
-
total_files: totalFiles,
|
|
1126
|
-
total_words: totalWords,
|
|
1127
|
-
needs_graph: needsGraph,
|
|
1128
|
-
warning,
|
|
1129
|
-
skipped_sensitive: skippedSensitive,
|
|
1130
|
-
graphifyignore_patterns: ignorePatterns.length
|
|
1131
|
-
};
|
|
1132
|
-
}
|
|
1133
|
-
function loadManifest(manifestPath = MANIFEST_PATH) {
|
|
1134
|
-
try {
|
|
1135
|
-
return JSON.parse(readFileSync3(manifestPath, "utf-8"));
|
|
1136
|
-
} catch {
|
|
1137
|
-
return {};
|
|
1187
|
+
}
|
|
1188
|
+
}
|
|
1189
|
+
if ("file_type" in node && !VALID_FILE_TYPES.has(node.file_type)) {
|
|
1190
|
+
errors.push(
|
|
1191
|
+
`Node ${i} (id=${JSON.stringify(node.id ?? "?")}) has invalid file_type '${node.file_type}' - must be one of ${JSON.stringify([...VALID_FILE_TYPES].sort())}`
|
|
1192
|
+
);
|
|
1193
|
+
}
|
|
1194
|
+
}
|
|
1138
1195
|
}
|
|
1139
|
-
|
|
1140
|
-
|
|
1141
|
-
|
|
1142
|
-
|
|
1143
|
-
|
|
1144
|
-
|
|
1145
|
-
|
|
1146
|
-
|
|
1196
|
+
if (!("edges" in d)) {
|
|
1197
|
+
errors.push("Missing required key 'edges'");
|
|
1198
|
+
} else if (!Array.isArray(d.edges)) {
|
|
1199
|
+
errors.push("'edges' must be a list");
|
|
1200
|
+
} else {
|
|
1201
|
+
const nodeIds = /* @__PURE__ */ new Set();
|
|
1202
|
+
if (Array.isArray(d.nodes)) {
|
|
1203
|
+
for (const n of d.nodes) {
|
|
1204
|
+
if (typeof n === "object" && n !== null && "id" in n) {
|
|
1205
|
+
nodeIds.add(n.id);
|
|
1206
|
+
}
|
|
1207
|
+
}
|
|
1208
|
+
}
|
|
1209
|
+
for (let i = 0; i < d.edges.length; i++) {
|
|
1210
|
+
const edge = d.edges[i];
|
|
1211
|
+
if (typeof edge !== "object" || edge === null || Array.isArray(edge)) {
|
|
1212
|
+
errors.push(`Edge ${i} must be an object`);
|
|
1213
|
+
continue;
|
|
1214
|
+
}
|
|
1215
|
+
for (const field of REQUIRED_EDGE_FIELDS) {
|
|
1216
|
+
if (!(field in edge)) {
|
|
1217
|
+
errors.push(`Edge ${i} missing required field '${field}'`);
|
|
1218
|
+
}
|
|
1219
|
+
}
|
|
1220
|
+
if ("confidence" in edge && !VALID_CONFIDENCES.has(edge.confidence)) {
|
|
1221
|
+
errors.push(
|
|
1222
|
+
`Edge ${i} has invalid confidence '${edge.confidence}' - must be one of ${JSON.stringify([...VALID_CONFIDENCES].sort())}`
|
|
1223
|
+
);
|
|
1224
|
+
}
|
|
1225
|
+
if ("source" in edge && nodeIds.size > 0 && !nodeIds.has(edge.source)) {
|
|
1226
|
+
errors.push(`Edge ${i} source '${edge.source}' does not match any node id`);
|
|
1227
|
+
}
|
|
1228
|
+
if ("target" in edge && nodeIds.size > 0 && !nodeIds.has(edge.target)) {
|
|
1229
|
+
errors.push(`Edge ${i} target '${edge.target}' does not match any node id`);
|
|
1147
1230
|
}
|
|
1148
1231
|
}
|
|
1149
1232
|
}
|
|
1150
|
-
|
|
1151
|
-
mkdirSync2(dir, { recursive: true });
|
|
1152
|
-
writeFileSync2(manifestPath, JSON.stringify(manifest, null, 2));
|
|
1233
|
+
return errors;
|
|
1153
1234
|
}
|
|
1154
|
-
|
|
1155
|
-
|
|
1156
|
-
|
|
1157
|
-
|
|
1158
|
-
|
|
1159
|
-
|
|
1160
|
-
|
|
1161
|
-
|
|
1162
|
-
|
|
1163
|
-
new_total: full.total_files
|
|
1164
|
-
};
|
|
1235
|
+
|
|
1236
|
+
// src/build.ts
|
|
1237
|
+
function buildFromJson(extraction, options) {
|
|
1238
|
+
const errors = validateExtraction(extraction);
|
|
1239
|
+
const realErrors = errors.filter((e) => !e.includes("does not match any node id"));
|
|
1240
|
+
if (realErrors.length > 0) {
|
|
1241
|
+
console.error(
|
|
1242
|
+
`[graphify] Extraction warning (${realErrors.length} issues): ${realErrors[0]}`
|
|
1243
|
+
);
|
|
1165
1244
|
}
|
|
1166
|
-
const
|
|
1167
|
-
const
|
|
1168
|
-
|
|
1169
|
-
|
|
1170
|
-
unchangedFiles[k] = [];
|
|
1245
|
+
const G = createGraph(options?.directed === true);
|
|
1246
|
+
for (const node of extraction.nodes ?? []) {
|
|
1247
|
+
const { id, ...attrs } = node;
|
|
1248
|
+
G.mergeNode(id, attrs);
|
|
1171
1249
|
}
|
|
1172
|
-
|
|
1173
|
-
|
|
1174
|
-
|
|
1175
|
-
|
|
1176
|
-
|
|
1177
|
-
|
|
1178
|
-
|
|
1179
|
-
|
|
1180
|
-
|
|
1181
|
-
newFiles[ftype].push(f);
|
|
1182
|
-
} else {
|
|
1183
|
-
unchangedFiles[ftype].push(f);
|
|
1184
|
-
}
|
|
1250
|
+
const nodeSet = new Set(G.nodes());
|
|
1251
|
+
for (const edge of extraction.edges ?? []) {
|
|
1252
|
+
const { source, target, ...attrs } = edge;
|
|
1253
|
+
if (!nodeSet.has(source) || !nodeSet.has(target)) continue;
|
|
1254
|
+
attrs._src = source;
|
|
1255
|
+
attrs._tgt = target;
|
|
1256
|
+
try {
|
|
1257
|
+
G.mergeEdge(source, target, attrs);
|
|
1258
|
+
} catch {
|
|
1185
1259
|
}
|
|
1186
1260
|
}
|
|
1187
|
-
const
|
|
1188
|
-
|
|
1189
|
-
|
|
1190
|
-
|
|
1191
|
-
|
|
1192
|
-
incremental: true,
|
|
1193
|
-
new_files: newFiles,
|
|
1194
|
-
unchanged_files: unchangedFiles,
|
|
1195
|
-
new_total: newTotal,
|
|
1196
|
-
deleted_files: deletedFiles
|
|
1197
|
-
};
|
|
1261
|
+
const hyperedges = extraction.hyperedges ?? [];
|
|
1262
|
+
if (hyperedges.length > 0) {
|
|
1263
|
+
G.setAttribute("hyperedges", hyperedges);
|
|
1264
|
+
}
|
|
1265
|
+
return G;
|
|
1198
1266
|
}
|
|
1199
1267
|
|
|
1200
1268
|
// src/export.ts
|
|
@@ -1360,14 +1428,17 @@ function normalizeCommunityLabels(labelsOrOptions) {
|
|
|
1360
1428
|
}
|
|
1361
1429
|
return toNumericMap(labelsOrOptions.communityLabels);
|
|
1362
1430
|
}
|
|
1363
|
-
function toJson(G, communities, outputPath) {
|
|
1431
|
+
function toJson(G, communities, outputPath, communityLabelsOrOptions) {
|
|
1364
1432
|
const nodeComm = nodeCommunityMap2(communities);
|
|
1433
|
+
const communityLabels = normalizeCommunityLabels(communityLabelsOrOptions);
|
|
1365
1434
|
const nodes = [];
|
|
1366
1435
|
G.forEachNode((nodeId, attrs) => {
|
|
1436
|
+
const communityId = nodeComm.get(nodeId) ?? null;
|
|
1367
1437
|
nodes.push({
|
|
1368
1438
|
id: nodeId,
|
|
1369
1439
|
...attrs,
|
|
1370
|
-
community:
|
|
1440
|
+
community: communityId,
|
|
1441
|
+
community_name: communityId !== null ? sanitizeLabel(communityLabels?.get(communityId) ?? `Community ${communityId}`) : null
|
|
1371
1442
|
});
|
|
1372
1443
|
});
|
|
1373
1444
|
const links = [];
|
|
@@ -1384,10 +1455,15 @@ function toJson(G, communities, outputPath) {
|
|
|
1384
1455
|
links.push(link);
|
|
1385
1456
|
});
|
|
1386
1457
|
const hyperedges = G.getAttribute("hyperedges") ?? [];
|
|
1458
|
+
const communityLabelsObject = communityLabels ? Object.fromEntries(
|
|
1459
|
+
[...communityLabels.entries()].sort((a, b) => a[0] - b[0]).map(([cid, label]) => [String(cid), sanitizeLabel(label)])
|
|
1460
|
+
) : {};
|
|
1387
1461
|
const output = {
|
|
1388
|
-
directed:
|
|
1462
|
+
directed: isDirectedGraph(G),
|
|
1389
1463
|
multigraph: false,
|
|
1390
|
-
graph: {
|
|
1464
|
+
graph: {
|
|
1465
|
+
community_labels: communityLabelsObject
|
|
1466
|
+
},
|
|
1391
1467
|
nodes,
|
|
1392
1468
|
links,
|
|
1393
1469
|
hyperedges
|
|
@@ -1653,9 +1729,24 @@ function focusNode(nodeId) {
|
|
|
1653
1729
|
showInfo(nodeId);
|
|
1654
1730
|
}
|
|
1655
1731
|
|
|
1732
|
+
let hoveredNodeId = null;
|
|
1733
|
+
network.on('hoverNode', params => {
|
|
1734
|
+
hoveredNodeId = params.node;
|
|
1735
|
+
container.style.cursor = 'pointer';
|
|
1736
|
+
});
|
|
1737
|
+
network.on('blurNode', () => {
|
|
1738
|
+
hoveredNodeId = null;
|
|
1739
|
+
container.style.cursor = 'default';
|
|
1740
|
+
});
|
|
1741
|
+
container.addEventListener('click', () => {
|
|
1742
|
+
if (hoveredNodeId !== null) {
|
|
1743
|
+
showInfo(hoveredNodeId);
|
|
1744
|
+
network.selectNodes([hoveredNodeId]);
|
|
1745
|
+
}
|
|
1746
|
+
});
|
|
1656
1747
|
network.on('click', params => {
|
|
1657
1748
|
if (params.nodes.length > 0) showInfo(params.nodes[0]);
|
|
1658
|
-
else document.getElementById('info-content').innerHTML = '<span class="empty">Click a node to inspect it</span>';
|
|
1749
|
+
else if (hoveredNodeId === null) document.getElementById('info-content').innerHTML = '<span class="empty">Click a node to inspect it</span>';
|
|
1659
1750
|
});
|
|
1660
1751
|
|
|
1661
1752
|
const searchInput = document.getElementById('search');
|
|
@@ -1826,7 +1917,7 @@ function toGraphml(G, communities, outputPath) {
|
|
|
1826
1917
|
lines.push(' <key id="community" for="node" attr.name="community" attr.type="int"/>');
|
|
1827
1918
|
lines.push(' <key id="relation" for="edge" attr.name="relation" attr.type="string"/>');
|
|
1828
1919
|
lines.push(' <key id="confidence" for="edge" attr.name="confidence" attr.type="string"/>');
|
|
1829
|
-
lines.push(
|
|
1920
|
+
lines.push(` <graph id="G" edgedefault="${isDirectedGraph(G) ? "directed" : "undirected"}">`);
|
|
1830
1921
|
G.forEachNode((nodeId, data) => {
|
|
1831
1922
|
lines.push(` <node id="${xmlEsc(nodeId)}">`);
|
|
1832
1923
|
lines.push(` <data key="label">${xmlEsc(data.label ?? nodeId)}</data>`);
|
|
@@ -1921,7 +2012,7 @@ function toSvg(G, communities, outputPath, communityLabelsOrOptions, figsize = [
|
|
|
1921
2012
|
|
|
1922
2013
|
// src/extract.ts
|
|
1923
2014
|
import { readFileSync as readFileSync4, readdirSync as readdirSync3, lstatSync as lstatSync2, realpathSync, existsSync as existsSync5 } from "fs";
|
|
1924
|
-
import { resolve as resolve3, basename as basename2, extname as
|
|
2015
|
+
import { resolve as resolve3, basename as basename2, extname as extname3, dirname as dirname2, join as join3, sep as sep2 } from "path";
|
|
1925
2016
|
import { createRequire } from "module";
|
|
1926
2017
|
import * as TreeSitter from "web-tree-sitter";
|
|
1927
2018
|
var Parser2 = TreeSitter.Parser ?? TreeSitter.default;
|
|
@@ -2562,7 +2653,7 @@ async function _extractGeneric(filePath, config) {
|
|
|
2562
2653
|
return { nodes: [], edges: [], error: String(e) };
|
|
2563
2654
|
}
|
|
2564
2655
|
const root = tree.rootNode;
|
|
2565
|
-
const stem = basename2(filePath,
|
|
2656
|
+
const stem = basename2(filePath, extname3(filePath));
|
|
2566
2657
|
const strPath = filePath;
|
|
2567
2658
|
const nodes = [];
|
|
2568
2659
|
const edges = [];
|
|
@@ -2931,10 +3022,10 @@ async function _extractGeneric(filePath, config) {
|
|
|
2931
3022
|
source: callerNid,
|
|
2932
3023
|
target: tgtNid,
|
|
2933
3024
|
relation: "calls",
|
|
2934
|
-
confidence: "
|
|
3025
|
+
confidence: "EXTRACTED",
|
|
2935
3026
|
source_file: strPath,
|
|
2936
3027
|
source_location: `L${line}`,
|
|
2937
|
-
weight:
|
|
3028
|
+
weight: 1
|
|
2938
3029
|
});
|
|
2939
3030
|
}
|
|
2940
3031
|
}
|
|
@@ -2979,7 +3070,7 @@ async function _extractPythonRationale(filePath, result) {
|
|
|
2979
3070
|
} catch {
|
|
2980
3071
|
return;
|
|
2981
3072
|
}
|
|
2982
|
-
const stem = basename2(filePath,
|
|
3073
|
+
const stem = basename2(filePath, extname3(filePath));
|
|
2983
3074
|
const strPath = filePath;
|
|
2984
3075
|
const { nodes, edges } = result;
|
|
2985
3076
|
const seenIds = new Set(nodes.map((n) => n.id));
|
|
@@ -3075,7 +3166,7 @@ async function extractPython(filePath) {
|
|
|
3075
3166
|
return result;
|
|
3076
3167
|
}
|
|
3077
3168
|
async function extractJs(filePath) {
|
|
3078
|
-
const ext =
|
|
3169
|
+
const ext = extname3(filePath);
|
|
3079
3170
|
const config = ext === ".ts" || ext === ".tsx" ? _TS_CONFIG : _JS_CONFIG;
|
|
3080
3171
|
return _extractGeneric(filePath, config);
|
|
3081
3172
|
}
|
|
@@ -3126,7 +3217,7 @@ async function extractJulia(filePath) {
|
|
|
3126
3217
|
return { nodes: [], edges: [], error: String(e) };
|
|
3127
3218
|
}
|
|
3128
3219
|
const root = tree.rootNode;
|
|
3129
|
-
const stem = basename2(filePath,
|
|
3220
|
+
const stem = basename2(filePath, extname3(filePath));
|
|
3130
3221
|
const strPath = filePath;
|
|
3131
3222
|
const nodes = [];
|
|
3132
3223
|
const edges = [];
|
|
@@ -3322,8 +3413,8 @@ async function extractGo(filePath) {
|
|
|
3322
3413
|
return { nodes: [], edges: [], error: String(e) };
|
|
3323
3414
|
}
|
|
3324
3415
|
const root = tree.rootNode;
|
|
3325
|
-
const stem = basename2(filePath,
|
|
3326
|
-
const pkgScope =
|
|
3416
|
+
const stem = basename2(filePath, extname3(filePath));
|
|
3417
|
+
const pkgScope = dirname2(filePath).split(sep2).pop() || stem;
|
|
3327
3418
|
const strPath = filePath;
|
|
3328
3419
|
const nodes = [];
|
|
3329
3420
|
const edges = [];
|
|
@@ -3466,10 +3557,10 @@ async function extractGo(filePath) {
|
|
|
3466
3557
|
source: callerNid,
|
|
3467
3558
|
target: tgtNid,
|
|
3468
3559
|
relation: "calls",
|
|
3469
|
-
confidence: "
|
|
3560
|
+
confidence: "EXTRACTED",
|
|
3470
3561
|
source_file: strPath,
|
|
3471
3562
|
source_location: `L${line}`,
|
|
3472
|
-
weight:
|
|
3563
|
+
weight: 1
|
|
3473
3564
|
});
|
|
3474
3565
|
}
|
|
3475
3566
|
}
|
|
@@ -3504,7 +3595,7 @@ async function extractRust(filePath) {
|
|
|
3504
3595
|
return { nodes: [], edges: [], error: String(e) };
|
|
3505
3596
|
}
|
|
3506
3597
|
const root = tree.rootNode;
|
|
3507
|
-
const stem = basename2(filePath,
|
|
3598
|
+
const stem = basename2(filePath, extname3(filePath));
|
|
3508
3599
|
const strPath = filePath;
|
|
3509
3600
|
const nodes = [];
|
|
3510
3601
|
const edges = [];
|
|
@@ -3621,10 +3712,10 @@ async function extractRust(filePath) {
|
|
|
3621
3712
|
source: callerNid,
|
|
3622
3713
|
target: tgtNid,
|
|
3623
3714
|
relation: "calls",
|
|
3624
|
-
confidence: "
|
|
3715
|
+
confidence: "EXTRACTED",
|
|
3625
3716
|
source_file: strPath,
|
|
3626
3717
|
source_location: `L${line}`,
|
|
3627
|
-
weight:
|
|
3718
|
+
weight: 1
|
|
3628
3719
|
});
|
|
3629
3720
|
}
|
|
3630
3721
|
}
|
|
@@ -3659,7 +3750,7 @@ async function extractZig(filePath) {
|
|
|
3659
3750
|
return { nodes: [], edges: [], error: String(e) };
|
|
3660
3751
|
}
|
|
3661
3752
|
const root = tree.rootNode;
|
|
3662
|
-
const stem = basename2(filePath,
|
|
3753
|
+
const stem = basename2(filePath, extname3(filePath));
|
|
3663
3754
|
const strPath = filePath;
|
|
3664
3755
|
const nodes = [];
|
|
3665
3756
|
const edges = [];
|
|
@@ -3783,7 +3874,7 @@ async function extractZig(filePath) {
|
|
|
3783
3874
|
const pair = `${callerNid}|${tgtNid}`;
|
|
3784
3875
|
if (!seenCallPairs.has(pair)) {
|
|
3785
3876
|
seenCallPairs.add(pair);
|
|
3786
|
-
addEdge(callerNid, tgtNid, "calls", node.startPosition.row + 1, "
|
|
3877
|
+
addEdge(callerNid, tgtNid, "calls", node.startPosition.row + 1, "EXTRACTED", 1);
|
|
3787
3878
|
}
|
|
3788
3879
|
}
|
|
3789
3880
|
}
|
|
@@ -3817,7 +3908,7 @@ async function extractPowershell(filePath) {
|
|
|
3817
3908
|
return { nodes: [], edges: [], error: String(e) };
|
|
3818
3909
|
}
|
|
3819
3910
|
const root = tree.rootNode;
|
|
3820
|
-
const stem = basename2(filePath,
|
|
3911
|
+
const stem = basename2(filePath, extname3(filePath));
|
|
3821
3912
|
const strPath = filePath;
|
|
3822
3913
|
const nodes = [];
|
|
3823
3914
|
const edges = [];
|
|
@@ -3966,7 +4057,7 @@ async function extractPowershell(filePath) {
|
|
|
3966
4057
|
const pair = `${callerNid}|${tgtNid}`;
|
|
3967
4058
|
if (!seenCallPairs.has(pair)) {
|
|
3968
4059
|
seenCallPairs.add(pair);
|
|
3969
|
-
addEdge(callerNid, tgtNid, "calls", node.startPosition.row + 1, "
|
|
4060
|
+
addEdge(callerNid, tgtNid, "calls", node.startPosition.row + 1, "EXTRACTED", 1);
|
|
3970
4061
|
}
|
|
3971
4062
|
}
|
|
3972
4063
|
}
|
|
@@ -4001,7 +4092,7 @@ async function extractObjc(filePath) {
|
|
|
4001
4092
|
return { nodes: [], edges: [], error: String(e) };
|
|
4002
4093
|
}
|
|
4003
4094
|
const root = tree.rootNode;
|
|
4004
|
-
const stem = basename2(filePath,
|
|
4095
|
+
const stem = basename2(filePath, extname3(filePath));
|
|
4005
4096
|
const strPath = filePath;
|
|
4006
4097
|
const nodes = [];
|
|
4007
4098
|
const edges = [];
|
|
@@ -4172,7 +4263,7 @@ async function extractObjc(filePath) {
|
|
|
4172
4263
|
const pair = `${callerNid}|${candidate}`;
|
|
4173
4264
|
if (!seenCalls.has(pair) && callerNid !== candidate) {
|
|
4174
4265
|
seenCalls.add(pair);
|
|
4175
|
-
addEdge(callerNid, candidate, "calls", bodyNode.startPosition.row + 1, "
|
|
4266
|
+
addEdge(callerNid, candidate, "calls", bodyNode.startPosition.row + 1, "EXTRACTED", 1);
|
|
4176
4267
|
}
|
|
4177
4268
|
}
|
|
4178
4269
|
}
|
|
@@ -4204,7 +4295,7 @@ async function extractElixir(filePath) {
|
|
|
4204
4295
|
return { nodes: [], edges: [], error: String(e) };
|
|
4205
4296
|
}
|
|
4206
4297
|
const root = tree.rootNode;
|
|
4207
|
-
const stem = basename2(filePath,
|
|
4298
|
+
const stem = basename2(filePath, extname3(filePath));
|
|
4208
4299
|
const strPath = filePath;
|
|
4209
4300
|
const nodes = [];
|
|
4210
4301
|
const edges = [];
|
|
@@ -4363,7 +4454,7 @@ async function extractElixir(filePath) {
|
|
|
4363
4454
|
const pair = `${callerNid}|${tgtNid}`;
|
|
4364
4455
|
if (!seenCallPairs.has(pair)) {
|
|
4365
4456
|
seenCallPairs.add(pair);
|
|
4366
|
-
addEdge(callerNid, tgtNid, "calls", node.startPosition.row + 1, "
|
|
4457
|
+
addEdge(callerNid, tgtNid, "calls", node.startPosition.row + 1, "EXTRACTED", 1);
|
|
4367
4458
|
}
|
|
4368
4459
|
}
|
|
4369
4460
|
}
|
|
@@ -4390,7 +4481,7 @@ async function _resolveCrossFileImports(perFile, paths) {
|
|
|
4390
4481
|
for (const node of fileResult.nodes ?? []) {
|
|
4391
4482
|
const src = node.source_file ?? "";
|
|
4392
4483
|
if (!src) continue;
|
|
4393
|
-
const fileStem = basename2(src,
|
|
4484
|
+
const fileStem = basename2(src, extname3(src));
|
|
4394
4485
|
const label = node.label ?? "";
|
|
4395
4486
|
const nid = node.id ?? "";
|
|
4396
4487
|
if (label && !label.endsWith(")") && !label.endsWith(".py") && !label.startsWith("_")) {
|
|
@@ -4402,7 +4493,7 @@ async function _resolveCrossFileImports(perFile, paths) {
|
|
|
4402
4493
|
const newEdges = [];
|
|
4403
4494
|
const stemToPath = /* @__PURE__ */ new Map();
|
|
4404
4495
|
for (const p of paths) {
|
|
4405
|
-
stemToPath.set(basename2(p,
|
|
4496
|
+
stemToPath.set(basename2(p, extname3(p)), p);
|
|
4406
4497
|
}
|
|
4407
4498
|
for (let idx = 0; idx < perFile.length; idx++) {
|
|
4408
4499
|
let walkImports = function(node) {
|
|
@@ -4466,7 +4557,7 @@ async function _resolveCrossFileImports(perFile, paths) {
|
|
|
4466
4557
|
};
|
|
4467
4558
|
const fileResult = perFile[idx];
|
|
4468
4559
|
const filePath = paths[idx];
|
|
4469
|
-
const fileStem = basename2(filePath,
|
|
4560
|
+
const fileStem = basename2(filePath, extname3(filePath));
|
|
4470
4561
|
const strPath = filePath;
|
|
4471
4562
|
const localClasses = fileResult.nodes.filter(
|
|
4472
4563
|
(n) => n.source_file === strPath && !n.label.endsWith(")") && !n.label.endsWith(".py") && n.id !== _makeId(fileStem)
|
|
@@ -4524,7 +4615,7 @@ async function extractWithDiagnostics(paths) {
|
|
|
4524
4615
|
if (paths.length === 0) {
|
|
4525
4616
|
root = ".";
|
|
4526
4617
|
} else if (paths.length === 1) {
|
|
4527
|
-
root =
|
|
4618
|
+
root = dirname2(paths[0]);
|
|
4528
4619
|
} else {
|
|
4529
4620
|
const parts = paths.map((p) => p.split(sep2));
|
|
4530
4621
|
const minLen = Math.min(...parts.map((p) => p.length));
|
|
@@ -4547,7 +4638,7 @@ async function extractWithDiagnostics(paths) {
|
|
|
4547
4638
|
`);
|
|
4548
4639
|
}
|
|
4549
4640
|
const filePath = paths[i];
|
|
4550
|
-
const ext =
|
|
4641
|
+
const ext = extname3(filePath);
|
|
4551
4642
|
const extractor = _DISPATCH[ext];
|
|
4552
4643
|
if (!extractor) continue;
|
|
4553
4644
|
const cached = loadCached(filePath, root);
|
|
@@ -4573,9 +4664,9 @@ async function extractWithDiagnostics(paths) {
|
|
|
4573
4664
|
allNodes.push(...result.nodes ?? []);
|
|
4574
4665
|
allEdges.push(...result.edges ?? []);
|
|
4575
4666
|
}
|
|
4576
|
-
const pyPaths = paths.filter((p) =>
|
|
4667
|
+
const pyPaths = paths.filter((p) => extname3(p) === ".py");
|
|
4577
4668
|
if (pyPaths.length > 0) {
|
|
4578
|
-
const pyResults = perFile.filter((_r, i) =>
|
|
4669
|
+
const pyResults = perFile.filter((_r, i) => extname3(paths[i]) === ".py");
|
|
4579
4670
|
try {
|
|
4580
4671
|
const crossFileEdges = await _resolveCrossFileImports(pyResults, pyPaths);
|
|
4581
4672
|
allEdges.push(...crossFileEdges);
|
|
@@ -4594,8 +4685,404 @@ async function extractWithDiagnostics(paths) {
|
|
|
4594
4685
|
}
|
|
4595
4686
|
|
|
4596
4687
|
// src/ingest.ts
|
|
4597
|
-
import { existsSync as
|
|
4598
|
-
import { resolve as pathResolve2, basename as
|
|
4688
|
+
import { existsSync as existsSync7, mkdirSync as mkdirSync4, writeFileSync as writeFileSync5 } from "fs";
|
|
4689
|
+
import { resolve as pathResolve2, basename as basename4, extname as extname5 } from "path";
|
|
4690
|
+
|
|
4691
|
+
// src/transcribe.ts
|
|
4692
|
+
import * as childProcess from "child_process";
|
|
4693
|
+
import { createHash as createHash3 } from "crypto";
|
|
4694
|
+
import {
|
|
4695
|
+
cpSync,
|
|
4696
|
+
createWriteStream,
|
|
4697
|
+
existsSync as existsSync6,
|
|
4698
|
+
mkdirSync as mkdirSync3,
|
|
4699
|
+
mkdtempSync,
|
|
4700
|
+
readdirSync as readdirSync4,
|
|
4701
|
+
renameSync as renameSync2,
|
|
4702
|
+
rmSync,
|
|
4703
|
+
writeFileSync as writeFileSync4
|
|
4704
|
+
} from "fs";
|
|
4705
|
+
import { homedir, platform, tmpdir } from "os";
|
|
4706
|
+
import { basename as basename3, dirname as dirname3, extname as extname4, join as join4, resolve as resolve5 } from "path";
|
|
4707
|
+
import { Readable } from "stream";
|
|
4708
|
+
import { pipeline } from "stream/promises";
|
|
4709
|
+
// Prefixes that make isUrl() treat an input as a remote URL instead of a local path.
var URL_PREFIXES = ["http://", "https://", "www."];
// Extensions probed, in this order, when looking for an already-downloaded audio file.
var CACHED_AUDIO_EXTENSIONS = [".m4a", ".opus", ".mp3", ".ogg", ".wav", ".webm"];
// Model used when neither the caller nor GRAPHIFY_WHISPER_MODEL names one.
var DEFAULT_MODEL = "base";
// Default output directory for transcribe().
var TRANSCRIPTS_DIR = "graphify-out/transcripts";
// Prompt used when no override and no labels are available.
var FALLBACK_PROMPT = "Use proper punctuation and paragraph breaks.";
// Base URL that model archive names are appended to when downloading.
var SHERPA_RELEASE_BASE = "https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models";
// Sample rate (Hz) handed to both ffmpeg and the recognizer feature config.
var AUDIO_SAMPLE_RATE = 16000;
// Model names accepted by resolveRequestedModel() after alias resolution.
var SUPPORTED_MODELS = /* @__PURE__ */ new Set([
  "tiny", "tiny.en",
  "base", "base.en",
  "small", "small.en",
  "medium", "medium.en",
  "large-v1", "large-v2", "large-v3",
  "turbo",
  "distil-small.en", "distil-medium.en",
  "distil-large-v2", "distil-large-v3", "distil-large-v3.5"
]);
// Short names mapped onto an entry of SUPPORTED_MODELS.
var MODEL_ALIASES = { large: "large-v3" };
// Model directory -> promise of a recognizer, filled in by getRecognizer().
var recognizerCache = /* @__PURE__ */ new Map();
// Memoized dynamic import of sherpa-onnx-node; reset to null when the import fails.
var sherpaModulePromise = null;
|
|
4740
|
+
/**
 * Run an external command synchronously and return the `spawnSync` result.
 *
 * Output is decoded as UTF-8 unless `options` overrides `encoding`.
 *
 * @param {string} command - Executable to run.
 * @param {string[]} args - Arguments passed to the executable.
 * @param {object} [options] - Extra `spawnSync` options; they win over the defaults.
 * @returns {object} The `spawnSync` result when the exit status is 0.
 * @throws {Error} The spawn error if the process could not be started, otherwise an
 *   Error carrying trimmed stderr, trimmed stdout, or "<command> failed".
 */
function runCommand(command, args, options) {
  const spawnOptions = { encoding: "utf-8", ...options };
  const outcome = childProcess.spawnSync(command, args, spawnOptions);
  if (outcome.error) {
    throw outcome.error;
  }
  if (outcome.status === 0) {
    return outcome;
  }
  // Prefer stderr, then stdout, then a generic message when both are blank.
  const message = outcome.stderr?.trim() || outcome.stdout?.trim() || `${command} failed`;
  throw new Error(message);
}
|
|
4753
|
+
/**
 * Resolve the directory where downloaded Whisper model files are cached.
 *
 * Order of precedence:
 *   1. GRAPHIFY_WHISPER_CACHE_DIR (resolved to an absolute path),
 *   2. on win32: %LOCALAPPDATA%\graphify\whisper, defaulting to ~\AppData\Local,
 *   3. elsewhere: $XDG_CACHE_HOME/graphify/whisper, defaulting to ~/.cache.
 *
 * Environment variables that are set but empty are treated as unset. The XDG
 * Base Directory spec requires this for XDG_CACHE_HOME, and joining an empty
 * base would otherwise yield a path relative to the current working directory.
 *
 * @returns {string} Path of the cache directory (not created here).
 */
function defaultWhisperCacheDir() {
  const explicitDir = process.env.GRAPHIFY_WHISPER_CACHE_DIR;
  if (explicitDir) {
    return resolve5(explicitDir);
  }
  if (platform() === "win32") {
    // `||` rather than `??`: an empty LOCALAPPDATA must fall back to the default.
    const localAppData = process.env.LOCALAPPDATA || join4(homedir(), "AppData", "Local");
    return join4(localAppData, "graphify", "whisper");
  }
  // `||` rather than `??`: an empty XDG_CACHE_HOME must fall back to ~/.cache.
  const xdgCacheHome = process.env.XDG_CACHE_HOME || join4(homedir(), ".cache");
  return join4(xdgCacheHome, "graphify", "whisper");
}
|
|
4766
|
+
/**
 * Name or path of the ffmpeg executable to spawn.
 *
 * GRAPHIFY_FFMPEG_BIN overrides the default. An empty override is ignored,
 * because an empty string is not a usable executable name.
 *
 * @returns {string} The override when non-empty, otherwise "ffmpeg".
 */
function ffmpegBinary() {
  return process.env.GRAPHIFY_FFMPEG_BIN || "ffmpeg";
}
|
|
4769
|
+
/**
 * Name or path of the tar executable to spawn.
 *
 * GRAPHIFY_TAR_BIN overrides the default. An empty override is ignored,
 * because an empty string is not a usable executable name.
 *
 * @returns {string} The override when non-empty, otherwise "tar".
 */
function tarBinary() {
  return process.env.GRAPHIFY_TAR_BIN || "tar";
}
|
|
4772
|
+
/**
 * Work out which Whisper model to use and validate it.
 *
 * The requested name comes from `modelName`, then GRAPHIFY_WHISPER_MODEL, then
 * DEFAULT_MODEL. Aliases in MODEL_ALIASES are expanded before validation.
 *
 * @param {string} [modelName] - Explicit model name, if any.
 * @returns {{requested: string, resolved: string}} Name as requested and after alias expansion.
 * @throws {Error} When the resolved name is not in SUPPORTED_MODELS.
 */
function resolveRequestedModel(modelName) {
  const wanted = modelName ?? process.env.GRAPHIFY_WHISPER_MODEL ?? DEFAULT_MODEL;
  const canonical = MODEL_ALIASES[wanted] ?? wanted;
  if (SUPPORTED_MODELS.has(canonical)) {
    return { requested: wanted, resolved: canonical };
  }
  const supportedList = [...SUPPORTED_MODELS].sort().join(", ");
  throw new Error(
    `Unsupported GRAPHIFY_WHISPER_MODEL "${wanted}". Supported local TS models: ${supportedList}`
  );
}
|
|
4782
|
+
/**
 * Recursively list every non-directory entry below `dir`.
 *
 * @param {string} dir - Directory to scan.
 * @returns {string[]} Paths (each prefixed with `dir`) in directory-listing order,
 *   with a subdirectory's contents placed where the subdirectory was listed.
 *   Empty when `dir` does not exist.
 */
function walkFiles(dir) {
  if (!existsSync6(dir)) return [];
  return readdirSync4(dir, { withFileTypes: true }).flatMap((dirent) => {
    const childPath = join4(dir, dirent.name);
    return dirent.isDirectory() ? walkFiles(childPath) : [childPath];
  });
}
|
|
4795
|
+
/**
 * Look below `dir` for the three files a Whisper recognizer needs.
 *
 * For the encoder and decoder the int8 variant is preferred over the plain
 * `.onnx` file. Files are matched by name suffix anywhere under `dir`.
 *
 * @param {string} dir - Directory to search recursively.
 * @returns {{modelDir: string, encoderPath: string, decoderPath: string, tokensPath: string} | null}
 *   The located paths, or null when any of the three is missing.
 */
function findArtifactsIn(dir) {
  const candidates = walkFiles(dir);
  const firstWithSuffix = (suffix) => candidates.find((file) => file.endsWith(suffix));
  const encoderPath = firstWithSuffix("-encoder.int8.onnx") ?? firstWithSuffix("-encoder.onnx");
  const decoderPath = firstWithSuffix("-decoder.int8.onnx") ?? firstWithSuffix("-decoder.onnx");
  const tokensPath = firstWithSuffix("-tokens.txt");
  if (encoderPath && decoderPath && tokensPath) {
    return { modelDir: dir, encoderPath, decoderPath, tokensPath };
  }
  return null;
}
|
|
4810
|
+
/**
 * Append a hint to model-download error text that mentions "404".
 *
 * @param {string} detail - Original error message.
 * @returns {string} `detail` unchanged, or `detail` followed by ". " and a hint
 *   that the release asset was not found, when it contains "404".
 */
function normalizeModelError(detail) {
  const hint = "The local sherpa-onnx release asset was not found for this Whisper model name.";
  return detail.includes("404") ? `${detail}. ${hint}` : detail;
}
|
|
4816
|
+
/**
 * Stream the body of a fetch `Response` into a file.
 *
 * @param {Response} response - Response whose body should be saved.
 * @param {string} destination - Path of the file to write.
 * @returns {Promise<void>} Resolves once the pipeline has finished.
 * @throws {Error} "HTTP <status> while downloading <url>" when the response is
 *   not ok or has no body.
 */
async function writeResponseToFile(response, destination) {
  const hasPayload = response.ok && Boolean(response.body);
  if (!hasPayload) {
    throw new Error(`HTTP ${response.status} while downloading ${response.url}`);
  }
  const source = Readable.fromWeb(response.body);
  const sink = createWriteStream(destination);
  await pipeline(source, sink);
}
|
|
4822
|
+
/**
 * Make sure the model files for a Whisper model exist in the local cache,
 * downloading and unpacking the release archive when they do not.
 *
 * @param {string} [modelName] - Model name; see resolveRequestedModel for defaults.
 * @returns {Promise<object>} `requestedModel`, `resolvedModel` and the fields
 *   returned by findArtifactsIn for the cached model directory.
 * @throws {Error} For an unsupported model name (thrown unwrapped), or any
 *   download/extraction failure, with the message passed through normalizeModelError.
 */
async function ensureWhisperArtifacts(modelName) {
  const { requested, resolved } = resolveRequestedModel(modelName);
  const withModelNames = (found) => ({ requestedModel: requested, resolvedModel: resolved, ...found });

  const cacheRoot = defaultWhisperCacheDir();
  mkdirSync3(cacheRoot, { recursive: true });
  const modelDir = join4(cacheRoot, `sherpa-onnx-whisper-${resolved}`);
  const alreadyCached = findArtifactsIn(modelDir);
  if (alreadyCached) {
    return withModelNames(alreadyCached);
  }

  // Download and unpack in a scratch directory that is always removed afterwards.
  const scratchDir = mkdtempSync(join4(tmpdir(), "graphify-whisper-model-"));
  const unpackDir = join4(scratchDir, "extract");
  const archiveName = `sherpa-onnx-whisper-${resolved}.tar.bz2`;
  const archivePath = join4(scratchDir, archiveName);
  mkdirSync3(unpackDir, { recursive: true });
  try {
    const url = `${SHERPA_RELEASE_BASE}/${archiveName}`;
    console.log(`  downloading whisper model: ${resolved}`);
    const response = await fetch(url);
    await writeResponseToFile(response, archivePath);
    runCommand(tarBinary(), ["-xjf", archivePath, "-C", unpackDir]);

    const holdsArtifacts = (candidate) => findArtifactsIn(candidate) !== null;
    // First try the parent directory of each extracted file ...
    let sourceDir = walkFiles(unpackDir).map((file) => dirname3(file)).find(holdsArtifacts);
    if (sourceDir == null) {
      // ... then fall back to the top-level directories of the archive.
      sourceDir = readdirSync4(unpackDir, { withFileTypes: true })
        .filter((dirent) => dirent.isDirectory())
        .map((dirent) => join4(unpackDir, dirent.name))
        .find(holdsArtifacts);
    }
    if (!sourceDir) {
      throw new Error(`Downloaded archive for ${resolved} but could not locate Whisper model files`);
    }

    // Replace whatever incomplete cache entry was there before.
    if (existsSync6(modelDir)) {
      rmSync(modelDir, { recursive: true, force: true });
    }
    try {
      renameSync2(sourceDir, modelDir);
    } catch {
      // If the rename fails, copy the directory instead.
      cpSync(sourceDir, modelDir, { recursive: true });
    }

    const installed = findArtifactsIn(modelDir);
    if (!installed) {
      throw new Error(`Model cache for ${resolved} is incomplete after extraction`);
    }
    return withModelNames(installed);
  } catch (error) {
    const detail = error instanceof Error ? error.message : String(error);
    throw new Error(normalizeModelError(detail));
  } finally {
    rmSync(scratchDir, { recursive: true, force: true });
  }
}
|
|
4867
|
+
/**
 * Dynamically import `sherpa-onnx-node`, memoizing the in-flight promise.
 *
 * The module's `default` export is returned when it has one, otherwise the
 * module namespace itself. A failed import clears the memo so that a later
 * call tries again.
 *
 * @returns {Promise<object>} The sherpa-onnx-node module.
 * @throws {Error} With installation guidance when the import fails.
 */
async function loadSherpaModule() {
  if (sherpaModulePromise) {
    return sherpaModulePromise;
  }
  const unwrapDefault = (imported) =>
    Reflect.has(imported, "default") ? Reflect.get(imported, "default") : imported;
  const explainFailure = (error) => {
    sherpaModulePromise = null;
    const detail = error instanceof Error ? error.message : String(error);
    throw new Error(
      `Video transcription requires the optional dependency sherpa-onnx-node. Install it locally, then retry. ${detail}`
    );
  };
  sherpaModulePromise = import("sherpa-onnx-node").then(unwrapDefault).catch(explainFailure);
  return sherpaModulePromise;
}
|
|
4879
|
+
/**
 * Get an offline recognizer for a Whisper model, creating it at most once per
 * cached model directory.
 *
 * The creation promise is stored in `recognizerCache` and shared between
 * callers. If creation fails the cache entry is removed again.
 *
 * @param {string} [modelName] - Model name passed to ensureWhisperArtifacts.
 * @param {object} [sherpa] - Already loaded sherpa-onnx-node module; loaded on demand when omitted.
 * @returns {Promise<{recognizer: object, artifacts: object}>} The recognizer and the model artifacts used.
 */
async function getRecognizer(modelName, sherpa) {
  const artifacts = await ensureWhisperArtifacts(modelName);
  const cacheKey = artifacts.modelDir;

  let pending = recognizerCache.get(cacheKey);
  if (!pending) {
    const recognizerConfig = {
      featConfig: {
        sampleRate: AUDIO_SAMPLE_RATE,
        featureDim: 80
      },
      modelConfig: {
        whisper: {
          encoder: artifacts.encoderPath,
          decoder: artifacts.decoderPath,
          task: "transcribe"
        },
        tokens: artifacts.tokensPath,
        numThreads: 1,
        provider: "cpu",
        debug: 0
      }
    };
    const creation = (async () => {
      const runtime = sherpa ?? await loadSherpaModule();
      return runtime.OfflineRecognizer.createAsync(recognizerConfig);
    })();
    // Drop the entry on failure so the rejected promise is not reused.
    pending = creation.catch((error) => {
      recognizerCache.delete(cacheKey);
      throw error;
    });
    recognizerCache.set(cacheKey, pending);
  }
  return { recognizer: await pending, artifacts };
}
|
|
4915
|
+
/**
 * Convert a media file to a mono, AUDIO_SAMPLE_RATE Hz, 16-bit PCM WAV using ffmpeg.
 *
 * @param {string} audioPath - Input media file.
 * @param {string} workingDir - Directory that receives `<input stem>.wav`.
 * @returns {string} Path of the WAV file that was written.
 * @throws {Error} With ffmpeg installation guidance when the command fails.
 */
function normalizeToWave(audioPath, workingDir) {
  const stem = basename3(audioPath, extname4(audioPath));
  const wavPath = join4(workingDir, `${stem}.wav`);
  const ffmpegArgs = [
    "-y",
    "-i", audioPath,
    "-vn",
    "-ac", "1",
    "-ar", String(AUDIO_SAMPLE_RATE),
    "-c:a", "pcm_s16le",
    wavPath
  ];
  try {
    runCommand(ffmpegBinary(), ffmpegArgs);
  } catch (error) {
    const detail = error instanceof Error ? error.message : String(error);
    throw new Error(
      `Video transcription requires ffmpeg in PATH. Install ffmpeg locally, then retry. ${detail}`
    );
  }
  return wavPath;
}
|
|
4939
|
+
/**
 * Pull the transcript text out of a recognizer result.
 *
 * @param {{text?: unknown}} result - Result object.
 * @returns {string} `result.text` converted to a string and trimmed; "" when it is null or undefined.
 */
function extractTranscriptText(result) {
  const rawText = result.text ?? "";
  return String(rawText).trim();
}
|
|
4942
|
+
/**
 * Tell whether an input should be treated as a URL rather than a local path.
 *
 * @param {string} pathLike - Path or URL.
 * @returns {boolean} True when it starts with one of URL_PREFIXES.
 */
function isUrl(pathLike) {
  for (const prefix of URL_PREFIXES) {
    if (pathLike.startsWith(prefix)) {
      return true;
    }
  }
  return false;
}
|
|
4945
|
+
/**
 * Download the audio track of a remote video with yt-dlp, reusing a previous
 * download of the same URL when one exists.
 *
 * Files are named `yt_<first 12 hex chars of sha1(url)>.<ext>` inside `outputDir`.
 *
 * @param {string} url - Remote URL to download.
 * @param {string} outputDir - Directory for downloaded audio; created if missing.
 * @returns {string} Path of the cached or freshly downloaded audio file.
 * @throws {Error} When yt-dlp fails or cannot be run, or finishes without
 *   leaving a matching file in `outputDir`.
 */
function downloadAudio(url, outputDir) {
  mkdirSync3(outputDir, { recursive: true });
  const urlHash = createHash3("sha1").update(url).digest("hex").slice(0, 12);
  const fileStem = `yt_${urlHash}`;

  // Reuse an earlier download; extensions are probed in CACHED_AUDIO_EXTENSIONS order.
  for (const ext of CACHED_AUDIO_EXTENSIONS) {
    const candidate = join4(outputDir, `${fileStem}${ext}`);
    if (existsSync6(candidate)) {
      console.log(`  cached audio: ${basename3(candidate)}`);
      return candidate;
    }
  }

  const outTemplate = join4(outputDir, `${fileStem}.%(ext)s`);
  try {
    console.log(`  downloading audio: ${url.slice(0, 80)} ...`);
    runCommand("yt-dlp", [
      "-f",
      "bestaudio[ext=m4a]/bestaudio/best",
      "-o",
      outTemplate,
      "--quiet",
      "--no-warnings",
      "--no-playlist",
      // End of options: the URL comes from outside this function, so make sure a
      // value beginning with "-" can never be parsed as a yt-dlp option.
      "--",
      url
    ]);
  } catch (error) {
    const detail = error instanceof Error ? error.message : String(error);
    throw new Error(
      `YouTube/URL download requires yt-dlp. Install yt-dlp to enable video ingestion. ${detail}`
    );
  }

  // yt-dlp picks the extension, so look the file up by its stem.
  const produced = readdirSync4(outputDir).find((entry) => entry.startsWith(`${fileStem}.`));
  if (produced) {
    return join4(outputDir, produced);
  }
  throw new Error(`yt-dlp finished without producing an audio file for ${url}`);
}
|
|
4981
|
+
/**
 * Build the initial prompt handed to the recognizer.
 *
 * GRAPHIFY_WHISPER_PROMPT, when set and non-empty, is returned as is. Otherwise
 * the first five non-empty node labels are listed ahead of FALLBACK_PROMPT,
 * and FALLBACK_PROMPT alone is used when there are none.
 *
 * @param {Array<{label?: string}>} godNodes2 - Nodes whose labels seed the prompt.
 * @returns {string} The prompt text.
 */
function buildWhisperPrompt(godNodes2) {
  const override = process.env.GRAPHIFY_WHISPER_PROMPT;
  if (override) {
    return override;
  }
  const nonEmptyLabels = godNodes2
    .map((node) => node.label ?? "")
    .filter((label) => Boolean(label));
  const topLabels = nonEmptyLabels.slice(0, 5);
  return topLabels.length === 0
    ? FALLBACK_PROMPT
    : `Technical discussion about ${topLabels.join(", ")}. ${FALLBACK_PROMPT}`;
}
|
|
4990
|
+
/**
 * Transcribe one video/audio source to a text file.
 *
 * URL inputs are downloaded into `<outputDir>/downloads` first. The transcript
 * is written to `<outputDir>/<audio file stem>.txt`. An existing transcript is
 * returned untouched unless `force` is set.
 *
 * @param {string} videoPath - Local path or URL.
 * @param {string} [outputDir=TRANSCRIPTS_DIR] - Directory for transcripts.
 * @param {string} [initialPrompt] - Prompt; falls back to GRAPHIFY_WHISPER_PROMPT, then FALLBACK_PROMPT.
 * @param {boolean} [force=false] - Re-transcribe even when a transcript already exists.
 * @returns {Promise<string>} Path of the transcript file.
 * @throws {Error} The unsupported-model error as is; every other failure is wrapped
 *   in a message that names the required toolchain.
 */
async function transcribe(videoPath, outputDir = TRANSCRIPTS_DIR, initialPrompt, force = false) {
  const outDir = resolve5(outputDir);
  mkdirSync3(outDir, { recursive: true });

  let audioPath;
  if (isUrl(videoPath)) {
    audioPath = downloadAudio(videoPath, join4(outDir, "downloads"));
  } else {
    audioPath = resolve5(videoPath);
  }
  const stem = basename3(audioPath, extname4(audioPath));
  const transcriptPath = join4(outDir, `${stem}.txt`);
  if (!force && existsSync6(transcriptPath)) {
    return transcriptPath;
  }

  const prompt = initialPrompt ?? process.env.GRAPHIFY_WHISPER_PROMPT ?? FALLBACK_PROMPT;
  const requestedModel = process.env.GRAPHIFY_WHISPER_MODEL ?? DEFAULT_MODEL;
  const scratchDir = mkdtempSync(join4(tmpdir(), "graphify-transcribe-"));
  try {
    console.log(`  transcribing ${basename3(audioPath)} (model=${requestedModel}) ...`);
    const wavPath = normalizeToWave(audioPath, scratchDir);
    const sherpa = await loadSherpaModule();
    const { recognizer, artifacts } = await getRecognizer(requestedModel, sherpa);
    const wave = sherpa.readWave(wavPath);
    const stream = recognizer.createStream();
    const canSetPrompt = prompt && typeof stream.setOption === "function";
    if (canSetPrompt) {
      try {
        stream.setOption("prompt", prompt);
      } catch {
        // Best effort: a failure to set the prompt is ignored.
      }
    }
    stream.acceptWaveform({ samples: wave.samples, sampleRate: wave.sampleRate });
    const decoded = await recognizer.decodeAsync(stream);
    writeFileSync4(transcriptPath, extractTranscriptText(decoded), "utf-8");
    const { requestedModel: askedFor, resolvedModel: usedModel } = artifacts;
    if (askedFor !== usedModel) {
      console.log(`  model alias: ${askedFor} -> ${usedModel}`);
    }
  } catch (error) {
    const isError = error instanceof Error;
    // Keep the unsupported-model message intact.
    if (isError && error.message.startsWith("Unsupported GRAPHIFY_WHISPER_MODEL")) {
      throw error;
    }
    const detail = isError ? error.message : String(error);
    throw new Error(
      `Video transcription requires the local TypeScript toolchain: sherpa-onnx-node + ffmpeg. Retry after installing them. ${detail}`
    );
  } finally {
    rmSync(scratchDir, { recursive: true, force: true });
  }
  return transcriptPath;
}
|
|
5034
|
+
/**
 * Transcribe several sources one after another, skipping those that fail.
 *
 * A failure is logged as a warning and does not stop the remaining files.
 *
 * @param {string[]} videoFiles - Local paths or URLs.
 * @param {string} [outputDir] - Passed through to transcribe().
 * @param {string} [initialPrompt] - Passed through to transcribe().
 * @param {boolean} [force=false] - Passed through to transcribe().
 * @returns {Promise<string[]>} Transcript paths of the sources that succeeded, in input order.
 */
async function transcribeAll(videoFiles, outputDir, initialPrompt, force = false) {
  const transcriptPaths = [];
  if (videoFiles.length === 0) {
    return transcriptPaths;
  }
  for (const videoFile of videoFiles) {
    let transcriptPath;
    try {
      transcriptPath = await transcribe(videoFile, outputDir, initialPrompt, force);
    } catch (error) {
      const detail = error instanceof Error ? error.message : String(error);
      console.log(`  warning: could not transcribe ${videoFile}: ${detail}`);
      continue;
    }
    transcriptPaths.push(transcriptPath);
  }
  return transcriptPaths;
}
|
|
5049
|
+
/**
 * Deep-copy a detection object through a JSON round trip.
 *
 * @param {object} detection - JSON-serializable detection result.
 * @returns {object} An independent copy containing only what survives JSON serialization.
 */
function cloneDetection(detection) {
  const serialized = JSON.stringify(detection);
  return JSON.parse(serialized);
}
|
|
5052
|
+
/**
 * Transcribe the video files of a detection result and list the transcripts
 * as documents in a copy of that detection.
 *
 * In incremental mode (and when `new_files` exists) the videos are read from,
 * and the transcripts appended to, `new_files`; otherwise `files` is used.
 * The input detection is never modified. While transcribing,
 * GRAPHIFY_WHISPER_MODEL is temporarily set to `options.whisperModel` and
 * restored afterwards.
 *
 * @param {object} detection - Detection result to augment.
 * @param {object} [options] - `incremental`, `initialPrompt`, `godNodes`, `outputDir`, `whisperModel`.
 * @returns {Promise<{detection: object, transcriptPaths: string[], prompt: string}>}
 */
async function augmentDetectionWithTranscripts(detection, options) {
  const nextDetection = cloneDetection(detection);
  const useNewFiles = options?.incremental && nextDetection.new_files;
  const source = useNewFiles ? nextDetection.new_files : nextDetection.files;
  const videoFiles = [...(source.video ?? [])];
  const prompt = options?.initialPrompt ?? buildWhisperPrompt(options?.godNodes ?? []);
  if (videoFiles.length === 0) {
    return { detection: nextDetection, transcriptPaths: [], prompt };
  }

  // transcribe() reads the model from the environment, so override it for the duration.
  const previousModel = process.env.GRAPHIFY_WHISPER_MODEL;
  if (options?.whisperModel) {
    process.env.GRAPHIFY_WHISPER_MODEL = options.whisperModel;
  }
  try {
    const forceRetranscribe = options?.incremental === true;
    const transcriptPaths = await transcribeAll(videoFiles, options?.outputDir, prompt, forceRetranscribe);
    source.document = [...(source.document ?? []), ...transcriptPaths];
    return { detection: nextDetection, transcriptPaths, prompt };
  } finally {
    if (options?.whisperModel) {
      if (previousModel === void 0) {
        delete process.env.GRAPHIFY_WHISPER_MODEL;
      } else {
        process.env.GRAPHIFY_WHISPER_MODEL = previousModel;
      }
    }
  }
}
|
|
5084
|
+
|
|
5085
|
+
// src/ingest.ts
|
|
4599
5086
|
/**
 * Escape a string for use inside a double-quoted YAML scalar.
 *
 * Backslashes and double quotes are backslash-escaped; each line feed and
 * carriage return becomes a single space.
 *
 * @param {string} s - Raw text.
 * @returns {string} Escaped text.
 */
function yamlStr(s) {
  const replacements = { "\\": "\\\\", '"': '\\"', "\n": " ", "\r": " " };
  // One pass is enough: no replacement text is ever rescanned.
  return s.replace(/[\\"\n\r]/g, (ch) => replacements[ch]);
}
|
|
@@ -4763,7 +5250,7 @@ async function downloadBinary(url, suffix, targetDir) {
|
|
|
4763
5250
|
const filename = safeFilename(url, suffix);
|
|
4764
5251
|
const outPath = pathResolve2(targetDir, filename);
|
|
4765
5252
|
const data = await safeFetch(url);
|
|
4766
|
-
|
|
5253
|
+
writeFileSync5(outPath, data);
|
|
4767
5254
|
return outPath;
|
|
4768
5255
|
}
|
|
4769
5256
|
function normalizeIngestOptions(authorOrOptions, contributor) {
|
|
@@ -4779,7 +5266,7 @@ function normalizeIngestOptions(authorOrOptions, contributor) {
|
|
|
4779
5266
|
};
|
|
4780
5267
|
}
|
|
4781
5268
|
async function ingest(url, targetDir, authorOrOptions = null, contributor = null) {
|
|
4782
|
-
|
|
5269
|
+
mkdirSync4(targetDir, { recursive: true });
|
|
4783
5270
|
const urlType = detectUrlType(url);
|
|
4784
5271
|
const { author, contributor: normalizedContributor } = normalizeIngestOptions(
|
|
4785
5272
|
authorOrOptions,
|
|
@@ -4790,7 +5277,7 @@ async function ingest(url, targetDir, authorOrOptions = null, contributor = null
|
|
|
4790
5277
|
let filename;
|
|
4791
5278
|
if (urlType === "pdf") {
|
|
4792
5279
|
const out = await downloadBinary(url, ".pdf", targetDir);
|
|
4793
|
-
console.log(`Downloaded PDF: ${
|
|
5280
|
+
console.log(`Downloaded PDF: ${basename4(out)}`);
|
|
4794
5281
|
return out;
|
|
4795
5282
|
}
|
|
4796
5283
|
if (urlType === "image") {
|
|
@@ -4800,9 +5287,14 @@ async function ingest(url, targetDir, authorOrOptions = null, contributor = null
|
|
|
4800
5287
|
} catch {
|
|
4801
5288
|
throw new Error(`Invalid URL: ${url}`);
|
|
4802
5289
|
}
|
|
4803
|
-
const suffix =
|
|
5290
|
+
const suffix = extname5(parsed.pathname) || ".jpg";
|
|
4804
5291
|
const out = await downloadBinary(url, suffix, targetDir);
|
|
4805
|
-
console.log(`Downloaded image: ${
|
|
5292
|
+
console.log(`Downloaded image: ${basename4(out)}`);
|
|
5293
|
+
return out;
|
|
5294
|
+
}
|
|
5295
|
+
if (urlType === "youtube") {
|
|
5296
|
+
const out = downloadAudio(url, targetDir);
|
|
5297
|
+
console.log(`Downloaded audio: ${basename4(out)}`);
|
|
4806
5298
|
return out;
|
|
4807
5299
|
}
|
|
4808
5300
|
if (urlType === "tweet") {
|
|
@@ -4814,13 +5306,13 @@ async function ingest(url, targetDir, authorOrOptions = null, contributor = null
|
|
|
4814
5306
|
}
|
|
4815
5307
|
let outPath = pathResolve2(targetDir, filename);
|
|
4816
5308
|
let counter = 1;
|
|
4817
|
-
while (
|
|
5309
|
+
while (existsSync7(outPath)) {
|
|
4818
5310
|
const stem = filename.replace(/\.md$/, "");
|
|
4819
5311
|
outPath = pathResolve2(targetDir, `${stem}_${counter}.md`);
|
|
4820
5312
|
counter++;
|
|
4821
5313
|
}
|
|
4822
|
-
|
|
4823
|
-
console.log(`Saved ${urlType}: ${
|
|
5314
|
+
writeFileSync5(outPath, content, "utf-8");
|
|
5315
|
+
console.log(`Saved ${urlType}: ${basename4(outPath)}`);
|
|
4824
5316
|
return outPath;
|
|
4825
5317
|
}
|
|
4826
5318
|
function saveQueryResult(questionOrOptions, answer, memoryDir, queryType = "query", sourceNodes = null) {
|
|
@@ -4840,7 +5332,7 @@ function saveQueryResult(questionOrOptions, answer, memoryDir, queryType = "quer
|
|
|
4840
5332
|
if (!payload.question) throw new Error("saveQueryResult requires a question");
|
|
4841
5333
|
if (!payload.memoryDir) throw new Error("saveQueryResult requires a memoryDir");
|
|
4842
5334
|
const effectiveAnswer = payload.answer ?? "";
|
|
4843
|
-
|
|
5335
|
+
mkdirSync4(payload.memoryDir, { recursive: true });
|
|
4844
5336
|
const now = /* @__PURE__ */ new Date();
|
|
4845
5337
|
const slug = payload.question.toLowerCase().replace(/[^\w]/g, "_").slice(0, 50).replace(/_+$/, "");
|
|
4846
5338
|
const ts = now.toISOString().replace(/[-:]/g, "").replace("T", "_").slice(0, 15);
|
|
@@ -4873,10 +5365,10 @@ function saveQueryResult(questionOrOptions, answer, memoryDir, queryType = "quer
|
|
|
4873
5365
|
}
|
|
4874
5366
|
const content = [...frontmatterLines, ...bodyLines].join("\n");
|
|
4875
5367
|
const outPath = pathResolve2(payload.memoryDir, filename);
|
|
4876
|
-
|
|
5368
|
+
writeFileSync5(outPath, content, "utf-8");
|
|
4877
5369
|
return outPath;
|
|
4878
5370
|
}
|
|
4879
|
-
var isDirectExecution = typeof process !== "undefined" && typeof process.argv[1] === "string" && /^ingest\.(?:js|mjs|cjs|ts)$/.test(
|
|
5371
|
+
var isDirectExecution = typeof process !== "undefined" && typeof process.argv[1] === "string" && /^ingest\.(?:js|mjs|cjs|ts)$/.test(basename4(process.argv[1]));
|
|
4880
5372
|
if (isDirectExecution) {
|
|
4881
5373
|
const url = process.argv[2];
|
|
4882
5374
|
const targetDir = process.argv[3] ?? "./raw";
|
|
@@ -5047,18 +5539,18 @@ function generate(G, communities, cohesionScores, communityLabels, godNodeList,
|
|
|
5047
5539
|
|
|
5048
5540
|
// src/skill-runtime.ts
|
|
5049
5541
|
var __filename = fileURLToPath(import.meta.url);
|
|
5050
|
-
var __dirname =
|
|
5542
|
+
var __dirname = dirname4(__filename);
|
|
5051
5543
|
function readJson(path) {
|
|
5052
|
-
return JSON.parse(readFileSync5(
|
|
5544
|
+
return JSON.parse(readFileSync5(resolve7(path), "utf-8"));
|
|
5053
5545
|
}
|
|
5054
5546
|
function writeJson(path, value) {
|
|
5055
|
-
const resolved =
|
|
5056
|
-
|
|
5057
|
-
|
|
5547
|
+
const resolved = resolve7(path);
|
|
5548
|
+
mkdirSync5(dirname4(resolved), { recursive: true });
|
|
5549
|
+
writeFileSync6(resolved, JSON.stringify(value, null, 2), "utf-8");
|
|
5058
5550
|
}
|
|
5059
5551
|
function getVersion() {
|
|
5060
5552
|
try {
|
|
5061
|
-
const pkg = JSON.parse(readFileSync5(
|
|
5553
|
+
const pkg = JSON.parse(readFileSync5(join5(__dirname, "..", "package.json"), "utf-8"));
|
|
5062
5554
|
return pkg.version ?? "unknown";
|
|
5063
5555
|
} catch {
|
|
5064
5556
|
return "unknown";
|
|
@@ -5094,23 +5586,10 @@ function ensureExtractionShape(value) {
|
|
|
5094
5586
|
}
|
|
5095
5587
|
function loadGraph2(graphPath) {
|
|
5096
5588
|
const raw = readJson(graphPath);
|
|
5097
|
-
|
|
5098
|
-
|
|
5099
|
-
|
|
5100
|
-
|
|
5101
|
-
}
|
|
5102
|
-
for (const link of raw.links ?? []) {
|
|
5103
|
-
const { source, target, ...attrs } = link;
|
|
5104
|
-
if (!G.hasNode(source) || !G.hasNode(target)) continue;
|
|
5105
|
-
try {
|
|
5106
|
-
G.mergeEdge(source, target, attrs);
|
|
5107
|
-
} catch {
|
|
5108
|
-
}
|
|
5109
|
-
}
|
|
5110
|
-
if (raw.hyperedges && raw.hyperedges.length > 0) {
|
|
5111
|
-
G.setAttribute("hyperedges", raw.hyperedges);
|
|
5112
|
-
}
|
|
5113
|
-
return G;
|
|
5589
|
+
return loadGraphFromData(raw);
|
|
5590
|
+
}
|
|
5591
|
+
/**
 * Decide whether a graph should be built as directed.
 *
 * @param {{directed?: boolean}} opts - Command options; only `directed === true` counts.
 * @param {object} [existingGraph] - Graph whose directedness is checked with isDirectedGraph.
 * @returns {boolean} True when requested via `opts`; otherwise the result of
 *   isDirectedGraph for `existingGraph`, or false when no graph is given.
 */
function shouldBuildDirected(opts, existingGraph) {
  if (opts.directed === true) {
    return true;
  }
  if (!existingGraph) {
    return false;
  }
  return isDirectedGraph(existingGraph);
}
|
|
5115
5594
|
function mergeHyperedges(existing = [], incoming = []) {
|
|
5116
5595
|
const seen = /* @__PURE__ */ new Set();
|
|
@@ -5182,11 +5661,11 @@ function analyzeGraph(G, detection, root, tokenCost, labelsOverride) {
|
|
|
5182
5661
|
}
|
|
5183
5662
|
function placeholderDetection(root = ".") {
|
|
5184
5663
|
return {
|
|
5185
|
-
files: { code: [], document: [], paper: [], image: [] },
|
|
5664
|
+
files: { code: [], document: [], paper: [], image: [], video: [] },
|
|
5186
5665
|
total_files: 0,
|
|
5187
5666
|
total_words: 0,
|
|
5188
5667
|
needs_graph: true,
|
|
5189
|
-
warning: `Reused existing graph at ${
|
|
5668
|
+
warning: `Reused existing graph at ${resolve7(root)} without re-running corpus detection.`,
|
|
5190
5669
|
skipped_sensitive: [],
|
|
5191
5670
|
graphifyignore_patterns: 0
|
|
5192
5671
|
};
|
|
@@ -5234,8 +5713,8 @@ function updateCostFile(extractionInput, detection, outPath) {
|
|
|
5234
5713
|
total_input_tokens: 0,
|
|
5235
5714
|
total_output_tokens: 0
|
|
5236
5715
|
};
|
|
5237
|
-
const resolved =
|
|
5238
|
-
if (
|
|
5716
|
+
const resolved = resolve7(outPath);
|
|
5717
|
+
if (existsSync8(resolved)) {
|
|
5239
5718
|
cost = readJson(resolved);
|
|
5240
5719
|
}
|
|
5241
5720
|
const input = extraction.input_tokens ?? 0;
|
|
@@ -5270,8 +5749,8 @@ function runtimeInfo() {
|
|
|
5270
5749
|
version: getVersion(),
|
|
5271
5750
|
node: process.execPath,
|
|
5272
5751
|
script: __filename,
|
|
5273
|
-
module:
|
|
5274
|
-
cli:
|
|
5752
|
+
module: join5(__dirname, "index.js"),
|
|
5753
|
+
cli: join5(__dirname, "cli.js")
|
|
5275
5754
|
};
|
|
5276
5755
|
}
|
|
5277
5756
|
async function main() {
|
|
@@ -5281,23 +5760,37 @@ async function main() {
|
|
|
5281
5760
|
console.log(JSON.stringify(runtimeInfo(), null, 2));
|
|
5282
5761
|
});
|
|
5283
5762
|
program.command("detect").argument("<inputPath>").option("--out <path>").action((inputPath, opts) => {
|
|
5284
|
-
const result = detect(
|
|
5763
|
+
const result = detect(resolve7(inputPath));
|
|
5285
5764
|
if (opts.out) {
|
|
5286
5765
|
writeJson(opts.out, result);
|
|
5287
|
-
console.log(`Detected ${result.total_files} files in ${
|
|
5766
|
+
console.log(`Detected ${result.total_files} files in ${resolve7(inputPath)}`);
|
|
5288
5767
|
} else {
|
|
5289
5768
|
console.log(JSON.stringify(result, null, 2));
|
|
5290
5769
|
}
|
|
5291
5770
|
});
|
|
5292
5771
|
program.command("detect-incremental").argument("<inputPath>").option("--manifest <path>", "Path to manifest.json", "graphify-out/manifest.json").option("--out <path>").action((inputPath, opts) => {
|
|
5293
|
-
const result = detectIncremental(
|
|
5772
|
+
const result = detectIncremental(resolve7(inputPath), resolve7(opts.manifest));
|
|
5294
5773
|
if (opts.out) {
|
|
5295
5774
|
writeJson(opts.out, result);
|
|
5296
|
-
console.log(`${result.new_total ?? 0} new/changed file(s) under ${
|
|
5775
|
+
console.log(`${result.new_total ?? 0} new/changed file(s) under ${resolve7(inputPath)}`);
|
|
5297
5776
|
} else {
|
|
5298
5777
|
console.log(JSON.stringify(result, null, 2));
|
|
5299
5778
|
}
|
|
5300
5779
|
});
|
|
5780
|
+
program.command("prepare-semantic-detect").requiredOption("--detect <path>", "Path to the base detection JSON").requiredOption("--out <path>", "Path to the augmented semantic detection JSON").requiredOption("--transcripts-out <path>", "Path to the transcript path list JSON").option("--analysis <path>", "Optional analysis JSON from a previous run").option("--incremental", "Use detection.new_files.video and force retranscription").option("--whisper-model <name>", "Whisper model override for local transcription").action(async (opts) => {
|
|
5781
|
+
const detection = readJson(opts.detect);
|
|
5782
|
+
const analysis = opts.analysis && existsSync8(resolve7(opts.analysis)) ? readJson(opts.analysis) : null;
|
|
5783
|
+
const transcriptsDir = join5(dirname4(resolve7(opts.out)), "transcripts");
|
|
5784
|
+
const { detection: semanticDetection, transcriptPaths } = await augmentDetectionWithTranscripts(detection, {
|
|
5785
|
+
outputDir: transcriptsDir,
|
|
5786
|
+
godNodes: analysis?.gods,
|
|
5787
|
+
incremental: opts.incremental,
|
|
5788
|
+
whisperModel: opts.whisperModel
|
|
5789
|
+
});
|
|
5790
|
+
writeJson(opts.out, semanticDetection);
|
|
5791
|
+
writeJson(opts.transcriptsOut, transcriptPaths);
|
|
5792
|
+
console.log(`Transcribed ${transcriptPaths.length} video file(s) -> treating as docs`);
|
|
5793
|
+
});
|
|
5301
5794
|
program.command("extract-ast").requiredOption("--detect <path>", "Path to detection JSON").requiredOption("--out <path>", "Path to AST extraction JSON").option("--incremental", "Use detection.new_files.code instead of detection.files.code").action(async (opts) => {
|
|
5302
5795
|
const detection = readJson(opts.detect);
|
|
5303
5796
|
const codeFiles = opts.incremental ? detection.new_files?.code ?? [] : detection.files.code ?? [];
|
|
@@ -5325,15 +5818,15 @@ async function main() {
|
|
|
5325
5818
|
];
|
|
5326
5819
|
const [cachedNodes, cachedEdges, cachedHyperedges, uncached] = checkSemanticCache(
|
|
5327
5820
|
allFiles,
|
|
5328
|
-
|
|
5821
|
+
resolve7(opts.root)
|
|
5329
5822
|
);
|
|
5330
5823
|
writeJson(opts.cachedOut, {
|
|
5331
5824
|
nodes: cachedNodes,
|
|
5332
5825
|
edges: cachedEdges,
|
|
5333
5826
|
hyperedges: cachedHyperedges
|
|
5334
5827
|
});
|
|
5335
|
-
|
|
5336
|
-
|
|
5828
|
+
mkdirSync5(dirname4(resolve7(opts.uncachedOut)), { recursive: true });
|
|
5829
|
+
writeFileSync6(resolve7(opts.uncachedOut), uncached.join("\n"), "utf-8");
|
|
5337
5830
|
console.log(`Cache: ${allFiles.length - uncached.length} files hit, ${uncached.length} files need extraction`);
|
|
5338
5831
|
});
|
|
5339
5832
|
program.command("save-semantic-cache").requiredOption("--input <path>", "Path to semantic extraction JSON").option("--root <path>", "Graph root for cache resolution", ".").action((opts) => {
|
|
@@ -5342,7 +5835,7 @@ async function main() {
|
|
|
5342
5835
|
extraction.nodes,
|
|
5343
5836
|
extraction.edges,
|
|
5344
5837
|
extraction.hyperedges ?? [],
|
|
5345
|
-
|
|
5838
|
+
resolve7(opts.root)
|
|
5346
5839
|
);
|
|
5347
5840
|
console.log(`Cached ${saved} files`);
|
|
5348
5841
|
});
|
|
@@ -5390,11 +5883,11 @@ async function main() {
|
|
|
5390
5883
|
`Merged: ${merged.nodes.length} nodes, ${merged.edges.length} edges (${ast.nodes.length} AST + ${semantic.nodes.length} semantic)`
|
|
5391
5884
|
);
|
|
5392
5885
|
});
|
|
5393
|
-
program.command("finalize-build").requiredOption("--detect <path>").requiredOption("--ast <path>").requiredOption("--root <path>").requiredOption("--graph-out <path>").requiredOption("--report-out <path>").requiredOption("--analysis-out <path>").requiredOption("--cost-out <path>").option("--cached <path>", "Optional cached semantic JSON").option("--semantic-new <path>", "Optional fresh semantic JSON").option("--html-out <path>", "Optional graph.html output path").action((opts) => {
|
|
5886
|
+
program.command("finalize-build").requiredOption("--detect <path>").requiredOption("--ast <path>").requiredOption("--root <path>").requiredOption("--graph-out <path>").requiredOption("--report-out <path>").requiredOption("--analysis-out <path>").requiredOption("--cost-out <path>").option("--directed", "Build a directed graph (preserves source->target)").option("--cached <path>", "Optional cached semantic JSON").option("--semantic-new <path>", "Optional fresh semantic JSON").option("--html-out <path>", "Optional graph.html output path").action((opts) => {
|
|
5394
5887
|
const detection = readJson(opts.detect);
|
|
5395
5888
|
const ast = ensureExtractionShape(readJson(opts.ast));
|
|
5396
|
-
const cached = opts.cached &&
|
|
5397
|
-
const semanticNew = opts.semanticNew &&
|
|
5889
|
+
const cached = opts.cached && existsSync8(resolve7(opts.cached)) ? readJson(opts.cached) : null;
|
|
5890
|
+
const semanticNew = opts.semanticNew && existsSync8(resolve7(opts.semanticNew)) ? readJson(opts.semanticNew) : null;
|
|
5398
5891
|
if (semanticNew) {
|
|
5399
5892
|
saveSemanticCache(
|
|
5400
5893
|
semanticNew.nodes,
|
|
@@ -5405,35 +5898,37 @@ async function main() {
|
|
|
5405
5898
|
}
|
|
5406
5899
|
const semantic = mergeSemanticArtifacts(cached, semanticNew);
|
|
5407
5900
|
const extraction = mergeAstAndSemantic(ast, semantic);
|
|
5408
|
-
const G = buildFromJson(extraction);
|
|
5901
|
+
const G = buildFromJson(extraction, { directed: shouldBuildDirected(opts) });
|
|
5409
5902
|
if (G.order === 0) {
|
|
5410
5903
|
throw new Error("Graph is empty - extraction produced no nodes.");
|
|
5411
5904
|
}
|
|
5412
5905
|
const analyzed = analyzeGraph(
|
|
5413
5906
|
G,
|
|
5414
5907
|
detection,
|
|
5415
|
-
|
|
5908
|
+
resolve7(opts.root),
|
|
5416
5909
|
{ input: extraction.input_tokens ?? 0, output: extraction.output_tokens ?? 0 }
|
|
5417
5910
|
);
|
|
5418
|
-
toJson(G, analyzed.communities,
|
|
5419
|
-
|
|
5911
|
+
toJson(G, analyzed.communities, resolve7(opts.graphOut), {
|
|
5912
|
+
communityLabels: analyzed.labels
|
|
5913
|
+
});
|
|
5914
|
+
writeFileSync6(resolve7(opts.reportOut), analyzed.report, "utf-8");
|
|
5420
5915
|
writeJson(opts.analysisOut, analyzed.analysis);
|
|
5421
5916
|
if (opts.htmlOut) {
|
|
5422
|
-
toHtml(G, analyzed.communities,
|
|
5917
|
+
toHtml(G, analyzed.communities, resolve7(opts.htmlOut), {
|
|
5423
5918
|
communityLabels: analyzed.labels
|
|
5424
5919
|
});
|
|
5425
5920
|
}
|
|
5426
|
-
saveManifest(detection.files,
|
|
5921
|
+
saveManifest(detection.files, join5(dirname4(resolve7(opts.graphOut)), "manifest.json"));
|
|
5427
5922
|
const cost = updateCostFile(extraction, detection, opts.costOut);
|
|
5428
5923
|
console.log(`Graph: ${G.order} nodes, ${G.size} edges, ${analyzed.communities.size} communities`);
|
|
5429
5924
|
console.log(`This run: ${(extraction.input_tokens ?? 0).toLocaleString()} input tokens, ${(extraction.output_tokens ?? 0).toLocaleString()} output tokens`);
|
|
5430
5925
|
console.log(`All time: ${cost.total_input_tokens.toLocaleString()} input, ${cost.total_output_tokens.toLocaleString()} output (${cost.runs.length} runs)`);
|
|
5431
5926
|
});
|
|
5432
|
-
program.command("finalize-update").requiredOption("--detect <path>").requiredOption("--ast <path>").requiredOption("--existing-graph <path>").requiredOption("--root <path>").requiredOption("--graph-out <path>").requiredOption("--report-out <path>").requiredOption("--analysis-out <path>").requiredOption("--cost-out <path>").option("--cached <path>", "Optional cached semantic JSON").option("--semantic-new <path>", "Optional fresh semantic JSON").option("--html-out <path>", "Optional graph.html output path").action((opts) => {
|
|
5927
|
+
program.command("finalize-update").requiredOption("--detect <path>").requiredOption("--ast <path>").requiredOption("--existing-graph <path>").requiredOption("--root <path>").requiredOption("--graph-out <path>").requiredOption("--report-out <path>").requiredOption("--analysis-out <path>").requiredOption("--cost-out <path>").option("--directed", "Build a directed graph (preserves source->target)").option("--cached <path>", "Optional cached semantic JSON").option("--semantic-new <path>", "Optional fresh semantic JSON").option("--html-out <path>", "Optional graph.html output path").action((opts) => {
|
|
5433
5928
|
const detection = readJson(opts.detect);
|
|
5434
5929
|
const ast = ensureExtractionShape(readJson(opts.ast));
|
|
5435
|
-
const cached = opts.cached &&
|
|
5436
|
-
const semanticNew = opts.semanticNew &&
|
|
5930
|
+
const cached = opts.cached && existsSync8(resolve7(opts.cached)) ? readJson(opts.cached) : null;
|
|
5931
|
+
const semanticNew = opts.semanticNew && existsSync8(resolve7(opts.semanticNew)) ? readJson(opts.semanticNew) : null;
|
|
5437
5932
|
if (semanticNew) {
|
|
5438
5933
|
saveSemanticCache(
|
|
5439
5934
|
semanticNew.nodes,
|
|
@@ -5446,35 +5941,39 @@ async function main() {
|
|
|
5446
5941
|
const extraction = mergeAstAndSemantic(ast, semantic);
|
|
5447
5942
|
const oldGraph = loadGraph2(opts.existingGraph);
|
|
5448
5943
|
const mergedGraph = loadGraph2(opts.existingGraph);
|
|
5449
|
-
const newGraph = buildFromJson(extraction
|
|
5944
|
+
const newGraph = buildFromJson(extraction, {
|
|
5945
|
+
directed: shouldBuildDirected(opts, oldGraph)
|
|
5946
|
+
});
|
|
5450
5947
|
mergeGraphs(mergedGraph, newGraph);
|
|
5451
5948
|
const analyzed = analyzeGraph(
|
|
5452
5949
|
mergedGraph,
|
|
5453
5950
|
detection,
|
|
5454
|
-
|
|
5951
|
+
resolve7(opts.root),
|
|
5455
5952
|
{ input: extraction.input_tokens ?? 0, output: extraction.output_tokens ?? 0 }
|
|
5456
5953
|
);
|
|
5457
5954
|
analyzed.analysis.diff = graphDiff(oldGraph, mergedGraph);
|
|
5458
|
-
toJson(mergedGraph, analyzed.communities,
|
|
5459
|
-
|
|
5955
|
+
toJson(mergedGraph, analyzed.communities, resolve7(opts.graphOut), {
|
|
5956
|
+
communityLabels: analyzed.labels
|
|
5957
|
+
});
|
|
5958
|
+
writeFileSync6(resolve7(opts.reportOut), analyzed.report, "utf-8");
|
|
5460
5959
|
writeJson(opts.analysisOut, analyzed.analysis);
|
|
5461
5960
|
if (opts.htmlOut) {
|
|
5462
|
-
toHtml(mergedGraph, analyzed.communities,
|
|
5961
|
+
toHtml(mergedGraph, analyzed.communities, resolve7(opts.htmlOut), {
|
|
5463
5962
|
communityLabels: analyzed.labels
|
|
5464
5963
|
});
|
|
5465
5964
|
}
|
|
5466
|
-
saveManifest(detection.files,
|
|
5965
|
+
saveManifest(detection.files, join5(dirname4(resolve7(opts.graphOut)), "manifest.json"));
|
|
5467
5966
|
const cost = updateCostFile(extraction, detection, opts.costOut);
|
|
5468
5967
|
console.log(`Merged: ${mergedGraph.order} nodes, ${mergedGraph.size} edges`);
|
|
5469
5968
|
console.log(analyzed.analysis.diff.summary);
|
|
5470
5969
|
console.log(`This run: ${(extraction.input_tokens ?? 0).toLocaleString()} input tokens, ${(extraction.output_tokens ?? 0).toLocaleString()} output tokens`);
|
|
5471
5970
|
console.log(`All time: ${cost.total_input_tokens.toLocaleString()} input, ${cost.total_output_tokens.toLocaleString()} output (${cost.runs.length} runs)`);
|
|
5472
5971
|
});
|
|
5473
|
-
program.command("analyze-build").requiredOption("--extract <path>").requiredOption("--detect <path>").requiredOption("--root <path>").requiredOption("--graph-out <path>").requiredOption("--report-out <path>").requiredOption("--analysis-out <path>").action((opts) => {
|
|
5972
|
+
program.command("analyze-build").requiredOption("--extract <path>").requiredOption("--detect <path>").requiredOption("--root <path>").requiredOption("--graph-out <path>").requiredOption("--report-out <path>").requiredOption("--analysis-out <path>").option("--directed", "Build a directed graph (preserves source->target)").action((opts) => {
|
|
5474
5973
|
const extraction = ensureExtractionShape(readJson(opts.extract));
|
|
5475
5974
|
const detection = readJson(opts.detect);
|
|
5476
|
-
const root =
|
|
5477
|
-
const G = buildFromJson(extraction);
|
|
5975
|
+
const root = resolve7(opts.root);
|
|
5976
|
+
const G = buildFromJson(extraction, { directed: shouldBuildDirected(opts) });
|
|
5478
5977
|
if (G.order === 0) {
|
|
5479
5978
|
throw new Error("Graph is empty - extraction produced no nodes.");
|
|
5480
5979
|
}
|
|
@@ -5484,20 +5983,22 @@ async function main() {
|
|
|
5484
5983
|
root,
|
|
5485
5984
|
{ input: extraction.input_tokens ?? 0, output: extraction.output_tokens ?? 0 }
|
|
5486
5985
|
);
|
|
5487
|
-
|
|
5488
|
-
toJson(G, analyzed.communities,
|
|
5489
|
-
|
|
5986
|
+
mkdirSync5(dirname4(resolve7(opts.graphOut)), { recursive: true });
|
|
5987
|
+
toJson(G, analyzed.communities, resolve7(opts.graphOut), {
|
|
5988
|
+
communityLabels: analyzed.labels
|
|
5989
|
+
});
|
|
5990
|
+
writeFileSync6(resolve7(opts.reportOut), analyzed.report, "utf-8");
|
|
5490
5991
|
writeJson(opts.analysisOut, analyzed.analysis);
|
|
5491
|
-
saveManifest(detection.files,
|
|
5992
|
+
saveManifest(detection.files, join5(dirname4(resolve7(opts.graphOut)), "manifest.json"));
|
|
5492
5993
|
console.log(`Graph: ${G.order} nodes, ${G.size} edges, ${analyzed.communities.size} communities`);
|
|
5493
5994
|
});
|
|
5494
|
-
program.command("write-labeled-report").requiredOption("--extract <path>").requiredOption("--detect <path>").requiredOption("--analysis <path>").requiredOption("--labels <path>").requiredOption("--root <path>").requiredOption("--report-out <path>").action((opts) => {
|
|
5995
|
+
program.command("write-labeled-report").requiredOption("--extract <path>").requiredOption("--detect <path>").requiredOption("--analysis <path>").requiredOption("--labels <path>").requiredOption("--root <path>").requiredOption("--report-out <path>").option("--directed", "Build a directed graph (preserves source->target)").option("--graph-out <path>").option("--html-out <path>").action((opts) => {
|
|
5495
5996
|
const extraction = ensureExtractionShape(readJson(opts.extract));
|
|
5496
5997
|
const detection = readJson(opts.detect);
|
|
5497
5998
|
const analysis = readJson(opts.analysis);
|
|
5498
5999
|
const labelObject = readJson(opts.labels);
|
|
5499
6000
|
const labels = objectToStringMap(labelObject);
|
|
5500
|
-
const G = buildFromJson(extraction);
|
|
6001
|
+
const G = buildFromJson(extraction, { directed: shouldBuildDirected(opts) });
|
|
5501
6002
|
const communities = new Map(
|
|
5502
6003
|
Object.entries(analysis.communities).map(([key, value]) => [Number.parseInt(key, 10), value])
|
|
5503
6004
|
);
|
|
@@ -5514,57 +6015,63 @@ async function main() {
|
|
|
5514
6015
|
analysis.surprises,
|
|
5515
6016
|
detection,
|
|
5516
6017
|
{ input: extraction.input_tokens ?? 0, output: extraction.output_tokens ?? 0 },
|
|
5517
|
-
|
|
6018
|
+
resolve7(opts.root),
|
|
5518
6019
|
questions
|
|
5519
6020
|
);
|
|
5520
6021
|
analysis.questions = questions;
|
|
5521
6022
|
analysis.labels = mapToObject(labels);
|
|
5522
|
-
|
|
6023
|
+
writeFileSync6(resolve7(opts.reportOut), report, "utf-8");
|
|
6024
|
+
if (opts.graphOut) {
|
|
6025
|
+
toJson(G, communities, resolve7(opts.graphOut), { communityLabels: labels });
|
|
6026
|
+
}
|
|
6027
|
+
if (opts.htmlOut) {
|
|
6028
|
+
toHtml(G, communities, resolve7(opts.htmlOut), { communityLabels: labels });
|
|
6029
|
+
}
|
|
5523
6030
|
writeJson(opts.analysis, analysis);
|
|
5524
|
-
console.log("
|
|
6031
|
+
console.log("Labeled artifacts updated");
|
|
5525
6032
|
});
|
|
5526
|
-
program.command("export-html").requiredOption("--extract <path>").requiredOption("--analysis <path>").option("--labels <path>").requiredOption("--out <path>").action((opts) => {
|
|
6033
|
+
program.command("export-html").requiredOption("--extract <path>").requiredOption("--analysis <path>").option("--labels <path>").requiredOption("--out <path>").option("--directed", "Build a directed graph (preserves source->target)").action((opts) => {
|
|
5527
6034
|
const extraction = ensureExtractionShape(readJson(opts.extract));
|
|
5528
6035
|
const analysis = readJson(opts.analysis);
|
|
5529
6036
|
const labels = opts.labels ? objectToStringMap(readJson(opts.labels)) : objectToStringMap(analysis.labels);
|
|
5530
6037
|
const communities = new Map(
|
|
5531
6038
|
Object.entries(analysis.communities).map(([key, value]) => [Number.parseInt(key, 10), value])
|
|
5532
6039
|
);
|
|
5533
|
-
const G = buildFromJson(extraction);
|
|
5534
|
-
toHtml(G, communities,
|
|
6040
|
+
const G = buildFromJson(extraction, { directed: shouldBuildDirected(opts) });
|
|
6041
|
+
toHtml(G, communities, resolve7(opts.out), { communityLabels: labels });
|
|
5535
6042
|
console.log("graph.html written - open in any browser, no server needed");
|
|
5536
6043
|
});
|
|
5537
|
-
program.command("export-svg").requiredOption("--extract <path>").requiredOption("--analysis <path>").option("--labels <path>").requiredOption("--out <path>").action((opts) => {
|
|
6044
|
+
program.command("export-svg").requiredOption("--extract <path>").requiredOption("--analysis <path>").option("--labels <path>").requiredOption("--out <path>").option("--directed", "Build a directed graph (preserves source->target)").action((opts) => {
|
|
5538
6045
|
const extraction = ensureExtractionShape(readJson(opts.extract));
|
|
5539
6046
|
const analysis = readJson(opts.analysis);
|
|
5540
6047
|
const labels = opts.labels ? objectToStringMap(readJson(opts.labels)) : objectToStringMap(analysis.labels);
|
|
5541
6048
|
const communities = new Map(
|
|
5542
6049
|
Object.entries(analysis.communities).map(([key, value]) => [Number.parseInt(key, 10), value])
|
|
5543
6050
|
);
|
|
5544
|
-
const G = buildFromJson(extraction);
|
|
5545
|
-
toSvg(G, communities,
|
|
6051
|
+
const G = buildFromJson(extraction, { directed: shouldBuildDirected(opts) });
|
|
6052
|
+
toSvg(G, communities, resolve7(opts.out), labels);
|
|
5546
6053
|
console.log("graph.svg written - embeds in Obsidian, Notion, GitHub READMEs");
|
|
5547
6054
|
});
|
|
5548
|
-
program.command("export-graphml").requiredOption("--extract <path>").requiredOption("--analysis <path>").requiredOption("--out <path>").action((opts) => {
|
|
6055
|
+
program.command("export-graphml").requiredOption("--extract <path>").requiredOption("--analysis <path>").requiredOption("--out <path>").option("--directed", "Build a directed graph (preserves source->target)").action((opts) => {
|
|
5549
6056
|
const extraction = ensureExtractionShape(readJson(opts.extract));
|
|
5550
6057
|
const analysis = readJson(opts.analysis);
|
|
5551
6058
|
const communities = new Map(
|
|
5552
6059
|
Object.entries(analysis.communities).map(([key, value]) => [Number.parseInt(key, 10), value])
|
|
5553
6060
|
);
|
|
5554
|
-
const G = buildFromJson(extraction);
|
|
5555
|
-
toGraphml(G, communities,
|
|
6061
|
+
const G = buildFromJson(extraction, { directed: shouldBuildDirected(opts) });
|
|
6062
|
+
toGraphml(G, communities, resolve7(opts.out));
|
|
5556
6063
|
console.log("graph.graphml written - open in Gephi, yEd, or any GraphML tool");
|
|
5557
6064
|
});
|
|
5558
|
-
program.command("export-cypher").requiredOption("--extract <path>").requiredOption("--out <path>").action((opts) => {
|
|
6065
|
+
program.command("export-cypher").requiredOption("--extract <path>").requiredOption("--out <path>").option("--directed", "Build a directed graph (preserves source->target)").action((opts) => {
|
|
5559
6066
|
const extraction = ensureExtractionShape(readJson(opts.extract));
|
|
5560
|
-
const G = buildFromJson(extraction);
|
|
5561
|
-
toCypher(G,
|
|
6067
|
+
const G = buildFromJson(extraction, { directed: shouldBuildDirected(opts) });
|
|
6068
|
+
toCypher(G, resolve7(opts.out));
|
|
5562
6069
|
console.log("cypher.txt written - import with: cypher-shell < graphify-out/cypher.txt");
|
|
5563
6070
|
});
|
|
5564
|
-
program.command("push-neo4j").requiredOption("--extract <path>").requiredOption("--analysis <path>").requiredOption("--uri <uri>").requiredOption("--user <user>").requiredOption("--password <password>").action(async (opts) => {
|
|
6071
|
+
program.command("push-neo4j").requiredOption("--extract <path>").requiredOption("--analysis <path>").requiredOption("--uri <uri>").requiredOption("--user <user>").requiredOption("--password <password>").option("--directed", "Build a directed graph (preserves source->target)").action(async (opts) => {
|
|
5565
6072
|
const extraction = ensureExtractionShape(readJson(opts.extract));
|
|
5566
6073
|
const analysis = readJson(opts.analysis);
|
|
5567
|
-
const G = buildFromJson(extraction);
|
|
6074
|
+
const G = buildFromJson(extraction, { directed: shouldBuildDirected(opts) });
|
|
5568
6075
|
const communities = new Map(
|
|
5569
6076
|
Object.entries(analysis.communities).map(([key, value]) => [Number.parseInt(key, 10), value])
|
|
5570
6077
|
);
|
|
@@ -5578,19 +6085,19 @@ async function main() {
|
|
|
5578
6085
|
});
|
|
5579
6086
|
program.command("benchmark").requiredOption("--graph <path>").option("--corpus-words <n>").action((opts) => {
|
|
5580
6087
|
const corpusWords = opts.corpusWords ? Number.parseInt(opts.corpusWords, 10) : void 0;
|
|
5581
|
-
const result = runBenchmark(
|
|
6088
|
+
const result = runBenchmark(resolve7(opts.graph), corpusWords);
|
|
5582
6089
|
printBenchmark(result);
|
|
5583
6090
|
});
|
|
5584
6091
|
program.command("update-cost").requiredOption("--extract <path>").requiredOption("--detect <path>").requiredOption("--out <path>").action((opts) => {
|
|
5585
6092
|
const extraction = ensureExtractionShape(readJson(opts.extract));
|
|
5586
6093
|
const detection = readJson(opts.detect);
|
|
5587
|
-
const outPath =
|
|
6094
|
+
const outPath = resolve7(opts.out);
|
|
5588
6095
|
let cost = {
|
|
5589
6096
|
runs: [],
|
|
5590
6097
|
total_input_tokens: 0,
|
|
5591
6098
|
total_output_tokens: 0
|
|
5592
6099
|
};
|
|
5593
|
-
if (
|
|
6100
|
+
if (existsSync8(outPath)) {
|
|
5594
6101
|
cost = readJson(outPath);
|
|
5595
6102
|
}
|
|
5596
6103
|
const input = extraction.input_tokens ?? 0;
|
|
@@ -5609,24 +6116,28 @@ async function main() {
|
|
|
5609
6116
|
`All time: ${cost.total_input_tokens.toLocaleString()} input, ${cost.total_output_tokens.toLocaleString()} output (${cost.runs.length} runs)`
|
|
5610
6117
|
);
|
|
5611
6118
|
});
|
|
5612
|
-
program.command("merge-update").requiredOption("--existing-graph <path>").requiredOption("--extract <path>").requiredOption("--detect <path>").requiredOption("--root <path>").requiredOption("--graph-out <path>").requiredOption("--report-out <path>").requiredOption("--analysis-out <path>").action((opts) => {
|
|
6119
|
+
program.command("merge-update").requiredOption("--existing-graph <path>").requiredOption("--extract <path>").requiredOption("--detect <path>").requiredOption("--root <path>").requiredOption("--graph-out <path>").requiredOption("--report-out <path>").requiredOption("--analysis-out <path>").option("--directed", "Build a directed graph (preserves source->target)").action((opts) => {
|
|
5613
6120
|
const oldGraph = loadGraph2(opts.existingGraph);
|
|
5614
6121
|
const mergedGraph = loadGraph2(opts.existingGraph);
|
|
5615
6122
|
const extraction = ensureExtractionShape(readJson(opts.extract));
|
|
5616
6123
|
const detection = readJson(opts.detect);
|
|
5617
|
-
const newGraph = buildFromJson(extraction
|
|
6124
|
+
const newGraph = buildFromJson(extraction, {
|
|
6125
|
+
directed: shouldBuildDirected(opts, oldGraph)
|
|
6126
|
+
});
|
|
5618
6127
|
mergeGraphs(mergedGraph, newGraph);
|
|
5619
6128
|
const analyzed = analyzeGraph(
|
|
5620
6129
|
mergedGraph,
|
|
5621
6130
|
detection,
|
|
5622
|
-
|
|
6131
|
+
resolve7(opts.root),
|
|
5623
6132
|
{ input: extraction.input_tokens ?? 0, output: extraction.output_tokens ?? 0 }
|
|
5624
6133
|
);
|
|
5625
6134
|
analyzed.analysis.diff = graphDiff(oldGraph, mergedGraph);
|
|
5626
|
-
toJson(mergedGraph, analyzed.communities,
|
|
5627
|
-
|
|
6135
|
+
toJson(mergedGraph, analyzed.communities, resolve7(opts.graphOut), {
|
|
6136
|
+
communityLabels: analyzed.labels
|
|
6137
|
+
});
|
|
6138
|
+
writeFileSync6(resolve7(opts.reportOut), analyzed.report, "utf-8");
|
|
5628
6139
|
writeJson(opts.analysisOut, analyzed.analysis);
|
|
5629
|
-
saveManifest(detection.files,
|
|
6140
|
+
saveManifest(detection.files, join5(dirname4(resolve7(opts.graphOut)), "manifest.json"));
|
|
5630
6141
|
console.log(`Merged: ${mergedGraph.order} nodes, ${mergedGraph.size} edges`);
|
|
5631
6142
|
console.log(analyzed.analysis.diff.summary);
|
|
5632
6143
|
});
|
|
@@ -5635,11 +6146,13 @@ async function main() {
|
|
|
5635
6146
|
const analyzed = analyzeGraph(
|
|
5636
6147
|
G,
|
|
5637
6148
|
placeholderDetection(opts.root),
|
|
5638
|
-
|
|
6149
|
+
resolve7(opts.root),
|
|
5639
6150
|
{ input: 0, output: 0 }
|
|
5640
6151
|
);
|
|
5641
|
-
toJson(G, analyzed.communities,
|
|
5642
|
-
|
|
6152
|
+
toJson(G, analyzed.communities, resolve7(opts.graphOut), {
|
|
6153
|
+
communityLabels: analyzed.labels
|
|
6154
|
+
});
|
|
6155
|
+
writeFileSync6(resolve7(opts.reportOut), analyzed.report, "utf-8");
|
|
5643
6156
|
writeJson(opts.analysisOut, analyzed.analysis);
|
|
5644
6157
|
console.log(`Re-clustered: ${analyzed.communities.size} communities`);
|
|
5645
6158
|
});
|
|
@@ -5690,7 +6203,7 @@ async function main() {
|
|
|
5690
6203
|
console.log(` degree: ${G.degree(nodeId)}`);
|
|
5691
6204
|
console.log("");
|
|
5692
6205
|
console.log("CONNECTIONS:");
|
|
5693
|
-
G
|
|
6206
|
+
forEachTraversalNeighbor(G, nodeId, (neighbor) => {
|
|
5694
6207
|
const edgeId = G.edge(nodeId, neighbor);
|
|
5695
6208
|
const edge = edgeId ? G.getEdgeAttributes(edgeId) : {};
|
|
5696
6209
|
const label = G.getNodeAttribute(neighbor, "label") ?? neighbor;
|
|
@@ -5699,7 +6212,7 @@ async function main() {
|
|
|
5699
6212
|
});
|
|
5700
6213
|
});
|
|
5701
6214
|
program.command("ingest").argument("<url>").option("--target-dir <path>", "Directory to save fetched content", "./raw").option("--author <name>").option("--contributor <name>").action(async (url, opts) => {
|
|
5702
|
-
const outPath = await ingest(url,
|
|
6215
|
+
const outPath = await ingest(url, resolve7(opts.targetDir), {
|
|
5703
6216
|
author: opts.author ?? null,
|
|
5704
6217
|
contributor: opts.contributor ?? null
|
|
5705
6218
|
});
|
|
@@ -5709,7 +6222,7 @@ async function main() {
|
|
|
5709
6222
|
const outPath = saveQueryResult({
|
|
5710
6223
|
question: opts.question,
|
|
5711
6224
|
answer: opts.answer,
|
|
5712
|
-
memoryDir:
|
|
6225
|
+
memoryDir: resolve7(opts.memoryDir),
|
|
5713
6226
|
queryType: opts.queryType,
|
|
5714
6227
|
sourceNodes: JSON.parse(opts.sourceNodesJson)
|
|
5715
6228
|
});
|