graphifyy 0.3.17 → 0.3.28
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.ja-JP.md +60 -17
- package/README.md +41 -13
- package/README.zh-CN.md +54 -17
- package/dist/cli.js +862 -369
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +1070 -598
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +36 -6
- package/dist/index.d.ts +36 -6
- package/dist/index.js +1092 -614
- package/dist/index.js.map +1 -1
- package/dist/skill-runtime.js +1182 -669
- package/dist/skill-runtime.js.map +1 -1
- package/package.json +14 -4
- package/src/skills/skill-claw.md +1 -0
- package/src/skills/skill-codex.md +69 -11
- package/src/skills/skill-droid.md +73 -6
- package/src/skills/skill-gemini.toml +207 -0
- package/src/skills/skill-opencode.md +73 -6
- package/src/skills/skill-trae.md +1 -0
- package/src/skills/skill-windows.md +76 -5
- package/src/skills/skill.md +82 -8
package/dist/index.cjs
CHANGED
|
@@ -29,6 +29,21 @@ var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__ge
|
|
|
29
29
|
));
|
|
30
30
|
var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
|
|
31
31
|
|
|
32
|
+
// src/types.ts
|
|
33
|
+
var FileType;
|
|
34
|
+
var init_types = __esm({
|
|
35
|
+
"src/types.ts"() {
|
|
36
|
+
FileType = /* @__PURE__ */ ((FileType2) => {
|
|
37
|
+
FileType2["CODE"] = "code";
|
|
38
|
+
FileType2["DOCUMENT"] = "document";
|
|
39
|
+
FileType2["PAPER"] = "paper";
|
|
40
|
+
FileType2["IMAGE"] = "image";
|
|
41
|
+
FileType2["VIDEO"] = "video";
|
|
42
|
+
return FileType2;
|
|
43
|
+
})(FileType || {});
|
|
44
|
+
}
|
|
45
|
+
});
|
|
46
|
+
|
|
32
47
|
// src/validate.ts
|
|
33
48
|
function validateExtraction(data) {
|
|
34
49
|
if (typeof data !== "object" || data === null || Array.isArray(data)) {
|
|
@@ -118,13 +133,81 @@ var init_validate = __esm({
|
|
|
118
133
|
}
|
|
119
134
|
});
|
|
120
135
|
|
|
136
|
+
// src/graph.ts
|
|
137
|
+
function createGraph(directed = false) {
|
|
138
|
+
return new import_graphology.default({ type: directed ? "directed" : "undirected", multi: false });
|
|
139
|
+
}
|
|
140
|
+
function isDirectedGraph(G) {
|
|
141
|
+
return G.type === "directed";
|
|
142
|
+
}
|
|
143
|
+
function loadGraphFromData(raw) {
|
|
144
|
+
const G = createGraph(raw.directed === true);
|
|
145
|
+
for (const [key, value] of Object.entries(raw.graph ?? {})) {
|
|
146
|
+
G.setAttribute(key, value);
|
|
147
|
+
}
|
|
148
|
+
for (const node of raw.nodes ?? []) {
|
|
149
|
+
const { id, ...attrs } = node;
|
|
150
|
+
G.mergeNode(id, attrs);
|
|
151
|
+
}
|
|
152
|
+
for (const link of raw.links ?? raw.edges ?? []) {
|
|
153
|
+
const { source, target, ...attrs } = link;
|
|
154
|
+
if (!G.hasNode(source) || !G.hasNode(target)) continue;
|
|
155
|
+
try {
|
|
156
|
+
G.mergeEdge(source, target, attrs);
|
|
157
|
+
} catch {
|
|
158
|
+
}
|
|
159
|
+
}
|
|
160
|
+
if (raw.hyperedges && raw.hyperedges.length > 0) {
|
|
161
|
+
G.setAttribute("hyperedges", raw.hyperedges);
|
|
162
|
+
}
|
|
163
|
+
return G;
|
|
164
|
+
}
|
|
165
|
+
function toUndirectedGraph(G) {
|
|
166
|
+
if (!isDirectedGraph(G)) return G.copy();
|
|
167
|
+
const copy = createGraph(false);
|
|
168
|
+
for (const [key, value] of Object.entries(G.getAttributes())) {
|
|
169
|
+
copy.setAttribute(key, value);
|
|
170
|
+
}
|
|
171
|
+
G.forEachNode((nodeId, attrs) => {
|
|
172
|
+
copy.mergeNode(nodeId, attrs);
|
|
173
|
+
});
|
|
174
|
+
G.forEachEdge((_edge, attrs, source, target) => {
|
|
175
|
+
if (!copy.hasNode(source) || !copy.hasNode(target)) return;
|
|
176
|
+
try {
|
|
177
|
+
copy.mergeEdge(source, target, attrs);
|
|
178
|
+
} catch {
|
|
179
|
+
}
|
|
180
|
+
});
|
|
181
|
+
return copy;
|
|
182
|
+
}
|
|
183
|
+
function forEachTraversalNeighbor(G, node, callback) {
|
|
184
|
+
if (isDirectedGraph(G)) {
|
|
185
|
+
G.forEachOutboundNeighbor(node, callback);
|
|
186
|
+
return;
|
|
187
|
+
}
|
|
188
|
+
G.forEachNeighbor(node, callback);
|
|
189
|
+
}
|
|
190
|
+
function traversalNeighbors(G, node) {
|
|
191
|
+
const neighbors = [];
|
|
192
|
+
forEachTraversalNeighbor(G, node, (neighbor) => {
|
|
193
|
+
neighbors.push(neighbor);
|
|
194
|
+
});
|
|
195
|
+
return neighbors;
|
|
196
|
+
}
|
|
197
|
+
var import_graphology;
|
|
198
|
+
var init_graph = __esm({
|
|
199
|
+
"src/graph.ts"() {
|
|
200
|
+
import_graphology = __toESM(require("graphology"), 1);
|
|
201
|
+
}
|
|
202
|
+
});
|
|
203
|
+
|
|
121
204
|
// src/build.ts
|
|
122
205
|
var build_exports = {};
|
|
123
206
|
__export(build_exports, {
|
|
124
207
|
build: () => build,
|
|
125
208
|
buildFromJson: () => buildFromJson
|
|
126
209
|
});
|
|
127
|
-
function buildFromJson(extraction) {
|
|
210
|
+
function buildFromJson(extraction, options) {
|
|
128
211
|
const errors = validateExtraction(extraction);
|
|
129
212
|
const realErrors = errors.filter((e) => !e.includes("does not match any node id"));
|
|
130
213
|
if (realErrors.length > 0) {
|
|
@@ -132,7 +215,7 @@ function buildFromJson(extraction) {
|
|
|
132
215
|
`[graphify] Extraction warning (${realErrors.length} issues): ${realErrors[0]}`
|
|
133
216
|
);
|
|
134
217
|
}
|
|
135
|
-
const G =
|
|
218
|
+
const G = createGraph(options?.directed === true);
|
|
136
219
|
for (const node of extraction.nodes ?? []) {
|
|
137
220
|
const { id, ...attrs } = node;
|
|
138
221
|
G.mergeNode(id, attrs);
|
|
@@ -154,7 +237,7 @@ function buildFromJson(extraction) {
|
|
|
154
237
|
}
|
|
155
238
|
return G;
|
|
156
239
|
}
|
|
157
|
-
function build(extractions) {
|
|
240
|
+
function build(extractions, options) {
|
|
158
241
|
const combined = {
|
|
159
242
|
nodes: [],
|
|
160
243
|
edges: [],
|
|
@@ -169,12 +252,11 @@ function build(extractions) {
|
|
|
169
252
|
combined.input_tokens += ext.input_tokens ?? 0;
|
|
170
253
|
combined.output_tokens += ext.output_tokens ?? 0;
|
|
171
254
|
}
|
|
172
|
-
return buildFromJson(combined);
|
|
255
|
+
return buildFromJson(combined, options);
|
|
173
256
|
}
|
|
174
|
-
var import_graphology;
|
|
175
257
|
var init_build = __esm({
|
|
176
258
|
"src/build.ts"() {
|
|
177
|
-
|
|
259
|
+
init_graph();
|
|
178
260
|
init_validate();
|
|
179
261
|
}
|
|
180
262
|
});
|
|
@@ -214,7 +296,7 @@ __export(cluster_exports, {
|
|
|
214
296
|
scoreAll: () => scoreAll
|
|
215
297
|
});
|
|
216
298
|
function partition(G) {
|
|
217
|
-
const result = (0, import_graphology_communities_louvain.default)(G);
|
|
299
|
+
const result = (0, import_graphology_communities_louvain.default)(G.type === "directed" ? toUndirectedGraph(G) : G);
|
|
218
300
|
const map = /* @__PURE__ */ new Map();
|
|
219
301
|
for (const [node, cid] of Object.entries(result)) {
|
|
220
302
|
map.set(node, cid);
|
|
@@ -321,11 +403,370 @@ var init_cluster = __esm({
|
|
|
321
403
|
"src/cluster.ts"() {
|
|
322
404
|
import_graphology_communities_louvain = __toESM(require("graphology-communities-louvain"), 1);
|
|
323
405
|
init_collections();
|
|
406
|
+
init_graph();
|
|
324
407
|
MAX_COMMUNITY_FRACTION = 0.25;
|
|
325
408
|
MIN_SPLIT_SIZE = 10;
|
|
326
409
|
}
|
|
327
410
|
});
|
|
328
411
|
|
|
412
|
+
// src/detect.ts
|
|
413
|
+
function isSensitive(filePath) {
|
|
414
|
+
const name = (0, import_node_path.basename)(filePath);
|
|
415
|
+
return SENSITIVE_PATTERNS.some((p) => p.test(name) || p.test(filePath));
|
|
416
|
+
}
|
|
417
|
+
function looksLikePaper(filePath) {
|
|
418
|
+
try {
|
|
419
|
+
const text = (0, import_node_fs.readFileSync)(filePath, "utf-8").slice(0, 3e3);
|
|
420
|
+
const hits = PAPER_SIGNALS.filter((p) => p.test(text)).length;
|
|
421
|
+
return hits >= PAPER_SIGNAL_THRESHOLD;
|
|
422
|
+
} catch {
|
|
423
|
+
return false;
|
|
424
|
+
}
|
|
425
|
+
}
|
|
426
|
+
function classifyFile(filePath) {
|
|
427
|
+
const ext = (0, import_node_path.extname)(filePath).toLowerCase();
|
|
428
|
+
if (CODE_EXTENSIONS.has(ext)) return "code" /* CODE */;
|
|
429
|
+
if (PAPER_EXTENSIONS.has(ext)) {
|
|
430
|
+
const parts = filePath.split(import_node_path.sep);
|
|
431
|
+
if (parts.some((p) => [...ASSET_DIR_MARKERS].some((m) => p.endsWith(m)))) return null;
|
|
432
|
+
return "paper" /* PAPER */;
|
|
433
|
+
}
|
|
434
|
+
if (IMAGE_EXTENSIONS.has(ext)) return "image" /* IMAGE */;
|
|
435
|
+
if (VIDEO_EXTENSIONS.has(ext)) return "video" /* VIDEO */;
|
|
436
|
+
if (DOC_EXTENSIONS.has(ext)) {
|
|
437
|
+
if (looksLikePaper(filePath)) return "paper" /* PAPER */;
|
|
438
|
+
return "document" /* DOCUMENT */;
|
|
439
|
+
}
|
|
440
|
+
if (OFFICE_EXTENSIONS.has(ext)) return "document" /* DOCUMENT */;
|
|
441
|
+
return null;
|
|
442
|
+
}
|
|
443
|
+
function countWords(filePath) {
|
|
444
|
+
try {
|
|
445
|
+
const text = (0, import_node_fs.readFileSync)(filePath, "utf-8");
|
|
446
|
+
return text.split(/\s+/).filter(Boolean).length;
|
|
447
|
+
} catch {
|
|
448
|
+
return 0;
|
|
449
|
+
}
|
|
450
|
+
}
|
|
451
|
+
function isNoiseDir(part) {
|
|
452
|
+
if (SKIP_DIRS.has(part)) return true;
|
|
453
|
+
if (part.endsWith("_venv") || part.endsWith("_env")) return true;
|
|
454
|
+
if (part.endsWith(".egg-info")) return true;
|
|
455
|
+
return false;
|
|
456
|
+
}
|
|
457
|
+
function loadGraphifyignore(root) {
|
|
458
|
+
const patterns = [];
|
|
459
|
+
let current = (0, import_node_path.resolve)(root);
|
|
460
|
+
while (true) {
|
|
461
|
+
const ignoreFile = (0, import_node_path.join)(current, ".graphifyignore");
|
|
462
|
+
if ((0, import_node_fs.existsSync)(ignoreFile)) {
|
|
463
|
+
for (let line of (0, import_node_fs.readFileSync)(ignoreFile, "utf-8").split("\n")) {
|
|
464
|
+
line = line.trim();
|
|
465
|
+
if (line && !line.startsWith("#")) {
|
|
466
|
+
patterns.push(line);
|
|
467
|
+
}
|
|
468
|
+
}
|
|
469
|
+
}
|
|
470
|
+
if ((0, import_node_fs.existsSync)((0, import_node_path.join)(current, ".git"))) {
|
|
471
|
+
break;
|
|
472
|
+
}
|
|
473
|
+
const parent = (0, import_node_path.dirname)(current);
|
|
474
|
+
if (parent === current) {
|
|
475
|
+
break;
|
|
476
|
+
}
|
|
477
|
+
current = parent;
|
|
478
|
+
}
|
|
479
|
+
return patterns;
|
|
480
|
+
}
|
|
481
|
+
function matchGlob(text, pattern) {
|
|
482
|
+
const regex = pattern.replace(/[.+^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*").replace(/\?/g, ".");
|
|
483
|
+
return new RegExp(`^${regex}$`).test(text);
|
|
484
|
+
}
|
|
485
|
+
function isIgnored(filePath, root, patterns) {
|
|
486
|
+
if (patterns.length === 0) return false;
|
|
487
|
+
let rel;
|
|
488
|
+
try {
|
|
489
|
+
rel = (0, import_node_path.relative)(root, filePath).replace(/\\/g, "/");
|
|
490
|
+
} catch {
|
|
491
|
+
return false;
|
|
492
|
+
}
|
|
493
|
+
const parts = rel.split("/");
|
|
494
|
+
for (const pattern of patterns) {
|
|
495
|
+
const p = pattern.replace(/^\/+|\/+$/g, "");
|
|
496
|
+
if (!p) continue;
|
|
497
|
+
if (matchGlob(rel, p)) return true;
|
|
498
|
+
if (matchGlob((0, import_node_path.basename)(filePath), p)) return true;
|
|
499
|
+
for (let i = 0; i < parts.length; i++) {
|
|
500
|
+
if (matchGlob(parts[i], p)) return true;
|
|
501
|
+
if (matchGlob(parts.slice(0, i + 1).join("/"), p)) return true;
|
|
502
|
+
}
|
|
503
|
+
}
|
|
504
|
+
return false;
|
|
505
|
+
}
|
|
506
|
+
function walkDir(dir, root, ignorePatterns, followSymlinks, skipPrune) {
|
|
507
|
+
const result = [];
|
|
508
|
+
let entries;
|
|
509
|
+
try {
|
|
510
|
+
entries = (0, import_node_fs.readdirSync)(dir);
|
|
511
|
+
} catch {
|
|
512
|
+
return result;
|
|
513
|
+
}
|
|
514
|
+
for (const entry of entries) {
|
|
515
|
+
const full = (0, import_node_path.join)(dir, entry);
|
|
516
|
+
let stat;
|
|
517
|
+
try {
|
|
518
|
+
stat = followSymlinks ? (0, import_node_fs.statSync)(full) : (0, import_node_fs.lstatSync)(full);
|
|
519
|
+
} catch {
|
|
520
|
+
continue;
|
|
521
|
+
}
|
|
522
|
+
if (stat.isDirectory()) {
|
|
523
|
+
if (!skipPrune) {
|
|
524
|
+
if (entry.startsWith(".")) continue;
|
|
525
|
+
if (isNoiseDir(entry)) continue;
|
|
526
|
+
if (isIgnored(full, root, ignorePatterns)) continue;
|
|
527
|
+
}
|
|
528
|
+
result.push(...walkDir(full, root, ignorePatterns, followSymlinks, skipPrune));
|
|
529
|
+
} else if (stat.isFile()) {
|
|
530
|
+
result.push(full);
|
|
531
|
+
}
|
|
532
|
+
}
|
|
533
|
+
return result;
|
|
534
|
+
}
|
|
535
|
+
function detect(root, options) {
|
|
536
|
+
const followSymlinks = options?.followSymlinks ?? false;
|
|
537
|
+
const rootResolved = (0, import_node_path.resolve)(root);
|
|
538
|
+
const ignorePatterns = loadGraphifyignore(rootResolved);
|
|
539
|
+
const convertedDir = (0, import_node_path.join)(rootResolved, "graphify-out", "converted");
|
|
540
|
+
const memoryDir = (0, import_node_path.join)(rootResolved, "graphify-out", "memory");
|
|
541
|
+
const files = {
|
|
542
|
+
code: [],
|
|
543
|
+
document: [],
|
|
544
|
+
paper: [],
|
|
545
|
+
image: [],
|
|
546
|
+
video: []
|
|
547
|
+
};
|
|
548
|
+
let totalWords = 0;
|
|
549
|
+
const skippedSensitive = [];
|
|
550
|
+
const allFiles = walkDir(rootResolved, rootResolved, ignorePatterns, followSymlinks, false);
|
|
551
|
+
if ((0, import_node_fs.existsSync)(memoryDir)) {
|
|
552
|
+
allFiles.push(...walkDir(memoryDir, rootResolved, ignorePatterns, followSymlinks, true));
|
|
553
|
+
}
|
|
554
|
+
const seen = /* @__PURE__ */ new Set();
|
|
555
|
+
for (const p of allFiles) {
|
|
556
|
+
if (seen.has(p)) continue;
|
|
557
|
+
seen.add(p);
|
|
558
|
+
const inMemory = (0, import_node_fs.existsSync)(memoryDir) && p.startsWith(memoryDir);
|
|
559
|
+
if (!inMemory) {
|
|
560
|
+
if ((0, import_node_path.basename)(p).startsWith(".")) continue;
|
|
561
|
+
if (p.startsWith(convertedDir)) continue;
|
|
562
|
+
}
|
|
563
|
+
if (isIgnored(p, rootResolved, ignorePatterns)) continue;
|
|
564
|
+
if (isSensitive(p)) {
|
|
565
|
+
skippedSensitive.push(p);
|
|
566
|
+
continue;
|
|
567
|
+
}
|
|
568
|
+
const ftype = classifyFile(p);
|
|
569
|
+
if (!ftype) continue;
|
|
570
|
+
if (OFFICE_EXTENSIONS.has((0, import_node_path.extname)(p).toLowerCase())) {
|
|
571
|
+
skippedSensitive.push(p + " [office conversion requires async - use pipeline]");
|
|
572
|
+
continue;
|
|
573
|
+
}
|
|
574
|
+
files[ftype].push(p);
|
|
575
|
+
if (ftype !== "video" /* VIDEO */) {
|
|
576
|
+
totalWords += countWords(p);
|
|
577
|
+
}
|
|
578
|
+
}
|
|
579
|
+
const totalFiles = Object.values(files).reduce((s, v) => s + v.length, 0);
|
|
580
|
+
const needsGraph = totalWords >= CORPUS_WARN_THRESHOLD;
|
|
581
|
+
let warning = null;
|
|
582
|
+
if (!needsGraph) {
|
|
583
|
+
warning = `Corpus is ~${totalWords.toLocaleString()} words - fits in a single context window. You may not need a graph.`;
|
|
584
|
+
} else if (totalWords >= CORPUS_UPPER_THRESHOLD || totalFiles >= FILE_COUNT_UPPER) {
|
|
585
|
+
warning = `Large corpus: ${totalFiles} files \xB7 ~${totalWords.toLocaleString()} words. Semantic extraction will be expensive (many Claude tokens). Consider running on a subfolder, or use --no-semantic to run AST-only.`;
|
|
586
|
+
}
|
|
587
|
+
return {
|
|
588
|
+
files,
|
|
589
|
+
total_files: totalFiles,
|
|
590
|
+
total_words: totalWords,
|
|
591
|
+
needs_graph: needsGraph,
|
|
592
|
+
warning,
|
|
593
|
+
skipped_sensitive: skippedSensitive,
|
|
594
|
+
graphifyignore_patterns: ignorePatterns.length
|
|
595
|
+
};
|
|
596
|
+
}
|
|
597
|
+
function loadManifest(manifestPath = MANIFEST_PATH) {
|
|
598
|
+
try {
|
|
599
|
+
return JSON.parse((0, import_node_fs.readFileSync)(manifestPath, "utf-8"));
|
|
600
|
+
} catch {
|
|
601
|
+
return {};
|
|
602
|
+
}
|
|
603
|
+
}
|
|
604
|
+
function saveManifest(files, manifestPath = MANIFEST_PATH) {
|
|
605
|
+
const manifest = {};
|
|
606
|
+
for (const fileList of Object.values(files)) {
|
|
607
|
+
for (const f of fileList) {
|
|
608
|
+
try {
|
|
609
|
+
manifest[f] = (0, import_node_fs.statSync)(f).mtimeMs;
|
|
610
|
+
} catch {
|
|
611
|
+
}
|
|
612
|
+
}
|
|
613
|
+
}
|
|
614
|
+
const dir = (0, import_node_path.join)(manifestPath, "..");
|
|
615
|
+
(0, import_node_fs.mkdirSync)(dir, { recursive: true });
|
|
616
|
+
(0, import_node_fs.writeFileSync)(manifestPath, JSON.stringify(manifest, null, 2));
|
|
617
|
+
}
|
|
618
|
+
function detectIncremental(root, manifestPath = MANIFEST_PATH) {
|
|
619
|
+
const full = detect(root);
|
|
620
|
+
const manifest = loadManifest(manifestPath);
|
|
621
|
+
if (Object.keys(manifest).length === 0) {
|
|
622
|
+
return {
|
|
623
|
+
...full,
|
|
624
|
+
incremental: true,
|
|
625
|
+
new_files: full.files,
|
|
626
|
+
unchanged_files: Object.fromEntries(Object.keys(full.files).map((k) => [k, []])),
|
|
627
|
+
new_total: full.total_files
|
|
628
|
+
};
|
|
629
|
+
}
|
|
630
|
+
const newFiles = {};
|
|
631
|
+
const unchangedFiles = {};
|
|
632
|
+
for (const k of Object.keys(full.files)) {
|
|
633
|
+
newFiles[k] = [];
|
|
634
|
+
unchangedFiles[k] = [];
|
|
635
|
+
}
|
|
636
|
+
for (const [ftype, fileList] of Object.entries(full.files)) {
|
|
637
|
+
for (const f of fileList) {
|
|
638
|
+
const storedMtime = manifest[f];
|
|
639
|
+
let currentMtime = 0;
|
|
640
|
+
try {
|
|
641
|
+
currentMtime = (0, import_node_fs.statSync)(f).mtimeMs;
|
|
642
|
+
} catch {
|
|
643
|
+
}
|
|
644
|
+
if (storedMtime === void 0 || currentMtime > storedMtime) {
|
|
645
|
+
newFiles[ftype].push(f);
|
|
646
|
+
} else {
|
|
647
|
+
unchangedFiles[ftype].push(f);
|
|
648
|
+
}
|
|
649
|
+
}
|
|
650
|
+
}
|
|
651
|
+
const currentFiles = new Set(Object.values(full.files).flat());
|
|
652
|
+
const deletedFiles = Object.keys(manifest).filter((f) => !currentFiles.has(f));
|
|
653
|
+
const newTotal = Object.values(newFiles).reduce((s, v) => s + v.length, 0);
|
|
654
|
+
return {
|
|
655
|
+
...full,
|
|
656
|
+
incremental: true,
|
|
657
|
+
new_files: newFiles,
|
|
658
|
+
unchanged_files: unchangedFiles,
|
|
659
|
+
new_total: newTotal,
|
|
660
|
+
deleted_files: deletedFiles
|
|
661
|
+
};
|
|
662
|
+
}
|
|
663
|
+
var import_node_fs, import_node_path, import_node_crypto, MANIFEST_PATH, CODE_EXTENSIONS, DOC_EXTENSIONS, PAPER_EXTENSIONS, IMAGE_EXTENSIONS, OFFICE_EXTENSIONS, VIDEO_EXTENSIONS, CORPUS_WARN_THRESHOLD, CORPUS_UPPER_THRESHOLD, FILE_COUNT_UPPER, SENSITIVE_PATTERNS, PAPER_SIGNALS, PAPER_SIGNAL_THRESHOLD, ASSET_DIR_MARKERS, SKIP_DIRS;
|
|
664
|
+
var init_detect = __esm({
|
|
665
|
+
"src/detect.ts"() {
|
|
666
|
+
import_node_fs = require("fs");
|
|
667
|
+
import_node_path = require("path");
|
|
668
|
+
import_node_crypto = require("crypto");
|
|
669
|
+
init_types();
|
|
670
|
+
MANIFEST_PATH = "graphify-out/manifest.json";
|
|
671
|
+
CODE_EXTENSIONS = /* @__PURE__ */ new Set([
|
|
672
|
+
".py",
|
|
673
|
+
".ts",
|
|
674
|
+
".js",
|
|
675
|
+
".jsx",
|
|
676
|
+
".tsx",
|
|
677
|
+
".go",
|
|
678
|
+
".rs",
|
|
679
|
+
".java",
|
|
680
|
+
".cpp",
|
|
681
|
+
".cc",
|
|
682
|
+
".cxx",
|
|
683
|
+
".c",
|
|
684
|
+
".h",
|
|
685
|
+
".hpp",
|
|
686
|
+
".rb",
|
|
687
|
+
".swift",
|
|
688
|
+
".kt",
|
|
689
|
+
".kts",
|
|
690
|
+
".cs",
|
|
691
|
+
".scala",
|
|
692
|
+
".php",
|
|
693
|
+
".lua",
|
|
694
|
+
".toc",
|
|
695
|
+
".zig",
|
|
696
|
+
".ps1",
|
|
697
|
+
".ex",
|
|
698
|
+
".exs",
|
|
699
|
+
".m",
|
|
700
|
+
".mm",
|
|
701
|
+
".jl"
|
|
702
|
+
]);
|
|
703
|
+
DOC_EXTENSIONS = /* @__PURE__ */ new Set([".md", ".txt", ".rst"]);
|
|
704
|
+
PAPER_EXTENSIONS = /* @__PURE__ */ new Set([".pdf"]);
|
|
705
|
+
IMAGE_EXTENSIONS = /* @__PURE__ */ new Set([".png", ".jpg", ".jpeg", ".gif", ".webp", ".svg"]);
|
|
706
|
+
OFFICE_EXTENSIONS = /* @__PURE__ */ new Set([".docx", ".xlsx"]);
|
|
707
|
+
VIDEO_EXTENSIONS = /* @__PURE__ */ new Set([
|
|
708
|
+
".mp4",
|
|
709
|
+
".mov",
|
|
710
|
+
".webm",
|
|
711
|
+
".mkv",
|
|
712
|
+
".avi",
|
|
713
|
+
".m4v",
|
|
714
|
+
".mp3",
|
|
715
|
+
".wav",
|
|
716
|
+
".m4a",
|
|
717
|
+
".ogg"
|
|
718
|
+
]);
|
|
719
|
+
CORPUS_WARN_THRESHOLD = 5e4;
|
|
720
|
+
CORPUS_UPPER_THRESHOLD = 5e5;
|
|
721
|
+
FILE_COUNT_UPPER = 200;
|
|
722
|
+
SENSITIVE_PATTERNS = [
|
|
723
|
+
/(^|[\\/])\.(env|envrc)(\.|$)/i,
|
|
724
|
+
/\.(pem|key|p12|pfx|cert|crt|der|p8)$/i,
|
|
725
|
+
/(credential|secret|passwd|password|token|private_key)/i,
|
|
726
|
+
/(id_rsa|id_dsa|id_ecdsa|id_ed25519)(\.pub)?$/,
|
|
727
|
+
/(\.netrc|\.pgpass|\.htpasswd)$/i,
|
|
728
|
+
/(aws_credentials|gcloud_credentials|service.account)/i
|
|
729
|
+
];
|
|
730
|
+
PAPER_SIGNALS = [
|
|
731
|
+
/\barxiv\b/i,
|
|
732
|
+
/\bdoi\s*:/i,
|
|
733
|
+
/\babstract\b/i,
|
|
734
|
+
/\bproceedings\b/i,
|
|
735
|
+
/\bjournal\b/i,
|
|
736
|
+
/\bpreprint\b/i,
|
|
737
|
+
/\\cite\{/,
|
|
738
|
+
/\[\d+\]/,
|
|
739
|
+
/\[\n\d+\n\]/,
|
|
740
|
+
/eq\.\s*\d+|equation\s+\d+/i,
|
|
741
|
+
/\d{4}\.\d{4,5}/,
|
|
742
|
+
/\bwe propose\b/i,
|
|
743
|
+
/\bliterature\b/i
|
|
744
|
+
];
|
|
745
|
+
PAPER_SIGNAL_THRESHOLD = 3;
|
|
746
|
+
ASSET_DIR_MARKERS = /* @__PURE__ */ new Set([".imageset", ".xcassets", ".appiconset", ".colorset", ".launchimage"]);
|
|
747
|
+
SKIP_DIRS = /* @__PURE__ */ new Set([
|
|
748
|
+
"venv",
|
|
749
|
+
".venv",
|
|
750
|
+
"env",
|
|
751
|
+
".env",
|
|
752
|
+
"node_modules",
|
|
753
|
+
"__pycache__",
|
|
754
|
+
".git",
|
|
755
|
+
"dist",
|
|
756
|
+
"build",
|
|
757
|
+
"target",
|
|
758
|
+
"out",
|
|
759
|
+
"site-packages",
|
|
760
|
+
"lib64",
|
|
761
|
+
".pytest_cache",
|
|
762
|
+
".mypy_cache",
|
|
763
|
+
".ruff_cache",
|
|
764
|
+
".tox",
|
|
765
|
+
".eggs"
|
|
766
|
+
]);
|
|
767
|
+
}
|
|
768
|
+
});
|
|
769
|
+
|
|
329
770
|
// src/analyze.ts
|
|
330
771
|
var analyze_exports = {};
|
|
331
772
|
__export(analyze_exports, {
|
|
@@ -366,10 +807,11 @@ function isConceptNode(G, nodeId) {
|
|
|
366
807
|
return false;
|
|
367
808
|
}
|
|
368
809
|
function fileCategory(path) {
|
|
369
|
-
const ext = path.includes(".") ? path.split(".").pop()?.toLowerCase() ?? "" : "";
|
|
810
|
+
const ext = path.includes(".") ? `.${path.split(".").pop()?.toLowerCase() ?? ""}` : "";
|
|
370
811
|
if (CODE_EXTENSIONS.has(ext)) return "code";
|
|
371
812
|
if (PAPER_EXTENSIONS.has(ext)) return "paper";
|
|
372
813
|
if (IMAGE_EXTENSIONS.has(ext)) return "image";
|
|
814
|
+
if (DOC_EXTENSIONS.has(ext)) return "doc";
|
|
373
815
|
return "doc";
|
|
374
816
|
}
|
|
375
817
|
function topLevelDir(path) {
|
|
@@ -563,10 +1005,10 @@ function suggestQuestions(G, communities, communityLabels, topN = 7) {
|
|
|
563
1005
|
const cid = nodeCommunity.get(nodeId);
|
|
564
1006
|
const commLabel = cid !== void 0 ? labelMap.get(cid) ?? `Community ${cid}` : "unknown";
|
|
565
1007
|
const neighborComms = /* @__PURE__ */ new Set();
|
|
566
|
-
G
|
|
1008
|
+
for (const n of traversalNeighbors(G, nodeId)) {
|
|
567
1009
|
const nc = nodeCommunity.get(n);
|
|
568
1010
|
if (nc !== void 0 && nc !== cid) neighborComms.add(nc);
|
|
569
|
-
}
|
|
1011
|
+
}
|
|
570
1012
|
if (neighborComms.size > 0) {
|
|
571
1013
|
const otherLabels = [...neighborComms].map((c) => labelMap.get(c) ?? `Community ${c}`);
|
|
572
1014
|
questions.push({
|
|
@@ -695,31 +1137,14 @@ function graphDiff(GOld, GNew) {
|
|
|
695
1137
|
summary: parts.length > 0 ? parts.join(", ") : "no changes"
|
|
696
1138
|
};
|
|
697
1139
|
}
|
|
698
|
-
var import_betweenness
|
|
1140
|
+
var import_betweenness;
|
|
699
1141
|
var init_analyze = __esm({
|
|
700
1142
|
"src/analyze.ts"() {
|
|
701
1143
|
import_betweenness = __toESM(require("graphology-metrics/centrality/betweenness.js"), 1);
|
|
702
1144
|
init_collections();
|
|
703
1145
|
init_cluster();
|
|
704
|
-
|
|
705
|
-
|
|
706
|
-
"ts",
|
|
707
|
-
"tsx",
|
|
708
|
-
"js",
|
|
709
|
-
"go",
|
|
710
|
-
"rs",
|
|
711
|
-
"java",
|
|
712
|
-
"rb",
|
|
713
|
-
"cpp",
|
|
714
|
-
"c",
|
|
715
|
-
"h",
|
|
716
|
-
"cs",
|
|
717
|
-
"kt",
|
|
718
|
-
"scala",
|
|
719
|
-
"php"
|
|
720
|
-
]);
|
|
721
|
-
PAPER_EXTENSIONS = /* @__PURE__ */ new Set(["pdf"]);
|
|
722
|
-
IMAGE_EXTENSIONS = /* @__PURE__ */ new Set(["png", "jpg", "jpeg", "webp", "gif", "svg"]);
|
|
1146
|
+
init_graph();
|
|
1147
|
+
init_detect();
|
|
723
1148
|
}
|
|
724
1149
|
});
|
|
725
1150
|
|
|
@@ -983,19 +1408,19 @@ async function safeFetchText(url, maxBytes = MAX_TEXT_BYTES, timeout = 15e3) {
|
|
|
983
1408
|
return raw.toString("utf-8");
|
|
984
1409
|
}
|
|
985
1410
|
function validateGraphPath(filePath, base) {
|
|
986
|
-
const resolvedBase = (0,
|
|
987
|
-
if (!(0,
|
|
1411
|
+
const resolvedBase = (0, import_node_path2.resolve)(base ?? "graphify-out");
|
|
1412
|
+
if (!(0, import_node_fs2.existsSync)(resolvedBase)) {
|
|
988
1413
|
throw new Error(
|
|
989
1414
|
`Graph base directory does not exist: ${resolvedBase}. Run the graphify skill first to build the graph (for Codex: $graphify .).`
|
|
990
1415
|
);
|
|
991
1416
|
}
|
|
992
|
-
const resolved = (0,
|
|
1417
|
+
const resolved = (0, import_node_path2.resolve)(filePath);
|
|
993
1418
|
if (!resolved.startsWith(resolvedBase + "/") && resolved !== resolvedBase) {
|
|
994
1419
|
throw new Error(
|
|
995
1420
|
`Path '${filePath}' escapes the allowed directory ${resolvedBase}. Only paths inside graphify-out/ are permitted.`
|
|
996
1421
|
);
|
|
997
1422
|
}
|
|
998
|
-
if (!(0,
|
|
1423
|
+
if (!(0, import_node_fs2.existsSync)(resolved)) {
|
|
999
1424
|
throw new Error(`Graph file not found: ${resolved}`);
|
|
1000
1425
|
}
|
|
1001
1426
|
return resolved;
|
|
@@ -1010,11 +1435,11 @@ function sanitizeLabel(text) {
|
|
|
1010
1435
|
function escapeHtml(text) {
|
|
1011
1436
|
return text.replace(/&/g, "&").replace(/</g, "<").replace(/>/g, ">").replace(/"/g, """).replace(/'/g, "'");
|
|
1012
1437
|
}
|
|
1013
|
-
var
|
|
1438
|
+
var import_node_path2, import_node_fs2, import_node_url, dns, net, ALLOWED_SCHEMES, MAX_FETCH_BYTES, MAX_TEXT_BYTES, BLOCKED_HOSTS, CONTROL_CHAR_RE, MAX_LABEL_LEN;
|
|
1014
1439
|
var init_security = __esm({
|
|
1015
1440
|
"src/security.ts"() {
|
|
1016
|
-
|
|
1017
|
-
|
|
1441
|
+
import_node_path2 = require("path");
|
|
1442
|
+
import_node_fs2 = require("fs");
|
|
1018
1443
|
import_node_url = require("url");
|
|
1019
1444
|
dns = __toESM(require("dns/promises"), 1);
|
|
1020
1445
|
net = __toESM(require("net"), 1);
|
|
@@ -1065,14 +1490,17 @@ function normalizeCommunityLabels(labelsOrOptions) {
|
|
|
1065
1490
|
}
|
|
1066
1491
|
return toNumericMap(labelsOrOptions.communityLabels);
|
|
1067
1492
|
}
|
|
1068
|
-
function toJson(G, communities, outputPath) {
|
|
1493
|
+
function toJson(G, communities, outputPath, communityLabelsOrOptions) {
|
|
1069
1494
|
const nodeComm = nodeCommunityMap2(communities);
|
|
1495
|
+
const communityLabels = normalizeCommunityLabels(communityLabelsOrOptions);
|
|
1070
1496
|
const nodes = [];
|
|
1071
1497
|
G.forEachNode((nodeId, attrs) => {
|
|
1498
|
+
const communityId = nodeComm.get(nodeId) ?? null;
|
|
1072
1499
|
nodes.push({
|
|
1073
1500
|
id: nodeId,
|
|
1074
1501
|
...attrs,
|
|
1075
|
-
community:
|
|
1502
|
+
community: communityId,
|
|
1503
|
+
community_name: communityId !== null ? sanitizeLabel(communityLabels?.get(communityId) ?? `Community ${communityId}`) : null
|
|
1076
1504
|
});
|
|
1077
1505
|
});
|
|
1078
1506
|
const links = [];
|
|
@@ -1089,15 +1517,20 @@ function toJson(G, communities, outputPath) {
|
|
|
1089
1517
|
links.push(link);
|
|
1090
1518
|
});
|
|
1091
1519
|
const hyperedges = G.getAttribute("hyperedges") ?? [];
|
|
1520
|
+
const communityLabelsObject = communityLabels ? Object.fromEntries(
|
|
1521
|
+
[...communityLabels.entries()].sort((a, b) => a[0] - b[0]).map(([cid, label]) => [String(cid), sanitizeLabel(label)])
|
|
1522
|
+
) : {};
|
|
1092
1523
|
const output = {
|
|
1093
|
-
directed:
|
|
1524
|
+
directed: isDirectedGraph(G),
|
|
1094
1525
|
multigraph: false,
|
|
1095
|
-
graph: {
|
|
1526
|
+
graph: {
|
|
1527
|
+
community_labels: communityLabelsObject
|
|
1528
|
+
},
|
|
1096
1529
|
nodes,
|
|
1097
1530
|
links,
|
|
1098
1531
|
hyperedges
|
|
1099
1532
|
};
|
|
1100
|
-
(0,
|
|
1533
|
+
(0, import_node_fs3.writeFileSync)(outputPath, JSON.stringify(output, null, 2), "utf-8");
|
|
1101
1534
|
}
|
|
1102
1535
|
function toCypher(G, outputPath) {
|
|
1103
1536
|
const lines = ["// Neo4j Cypher import - generated by the graphify skill", ""];
|
|
@@ -1119,7 +1552,7 @@ function toCypher(G, outputPath) {
|
|
|
1119
1552
|
`MATCH (a {id: '${uEsc}'}), (b {id: '${vEsc}'}) MERGE (a)-[:${rel} {confidence: '${conf}'}]->(b);`
|
|
1120
1553
|
);
|
|
1121
1554
|
});
|
|
1122
|
-
(0,
|
|
1555
|
+
(0, import_node_fs3.writeFileSync)(outputPath, lines.join("\n"), "utf-8");
|
|
1123
1556
|
}
|
|
1124
1557
|
function neo4jLabel(label) {
|
|
1125
1558
|
const sanitized = label.replace(/[^A-Za-z0-9_]/g, "");
|
|
@@ -1358,9 +1791,24 @@ function focusNode(nodeId) {
|
|
|
1358
1791
|
showInfo(nodeId);
|
|
1359
1792
|
}
|
|
1360
1793
|
|
|
1794
|
+
let hoveredNodeId = null;
|
|
1795
|
+
network.on('hoverNode', params => {
|
|
1796
|
+
hoveredNodeId = params.node;
|
|
1797
|
+
container.style.cursor = 'pointer';
|
|
1798
|
+
});
|
|
1799
|
+
network.on('blurNode', () => {
|
|
1800
|
+
hoveredNodeId = null;
|
|
1801
|
+
container.style.cursor = 'default';
|
|
1802
|
+
});
|
|
1803
|
+
container.addEventListener('click', () => {
|
|
1804
|
+
if (hoveredNodeId !== null) {
|
|
1805
|
+
showInfo(hoveredNodeId);
|
|
1806
|
+
network.selectNodes([hoveredNodeId]);
|
|
1807
|
+
}
|
|
1808
|
+
});
|
|
1361
1809
|
network.on('click', params => {
|
|
1362
1810
|
if (params.nodes.length > 0) showInfo(params.nodes[0]);
|
|
1363
|
-
else document.getElementById('info-content').innerHTML = '<span class="empty">Click a node to inspect it</span>';
|
|
1811
|
+
else if (hoveredNodeId === null) document.getElementById('info-content').innerHTML = '<span class="empty">Click a node to inspect it</span>';
|
|
1364
1812
|
});
|
|
1365
1813
|
|
|
1366
1814
|
const searchInput = document.getElementById('search');
|
|
@@ -1515,7 +1963,7 @@ ${htmlScript(nodesJson, edgesJson, legendJson)}
|
|
|
1515
1963
|
${hyperedgeScript(hyperedgesJson)}
|
|
1516
1964
|
</body>
|
|
1517
1965
|
</html>`;
|
|
1518
|
-
(0,
|
|
1966
|
+
(0, import_node_fs3.writeFileSync)(outputPath, html, "utf-8");
|
|
1519
1967
|
}
|
|
1520
1968
|
function toGraphml(G, communities, outputPath) {
|
|
1521
1969
|
const nodeComm = nodeCommunityMap2(communities);
|
|
@@ -1531,7 +1979,7 @@ function toGraphml(G, communities, outputPath) {
|
|
|
1531
1979
|
lines.push(' <key id="community" for="node" attr.name="community" attr.type="int"/>');
|
|
1532
1980
|
lines.push(' <key id="relation" for="edge" attr.name="relation" attr.type="string"/>');
|
|
1533
1981
|
lines.push(' <key id="confidence" for="edge" attr.name="confidence" attr.type="string"/>');
|
|
1534
|
-
lines.push(
|
|
1982
|
+
lines.push(` <graph id="G" edgedefault="${isDirectedGraph(G) ? "directed" : "undirected"}">`);
|
|
1535
1983
|
G.forEachNode((nodeId, data) => {
|
|
1536
1984
|
lines.push(` <node id="${xmlEsc(nodeId)}">`);
|
|
1537
1985
|
lines.push(` <data key="label">${xmlEsc(data.label ?? nodeId)}</data>`);
|
|
@@ -1548,7 +1996,7 @@ function toGraphml(G, communities, outputPath) {
|
|
|
1548
1996
|
});
|
|
1549
1997
|
lines.push(" </graph>");
|
|
1550
1998
|
lines.push("</graphml>");
|
|
1551
|
-
(0,
|
|
1999
|
+
(0, import_node_fs3.writeFileSync)(outputPath, lines.join("\n"), "utf-8");
|
|
1552
2000
|
}
|
|
1553
2001
|
function toSvg(G, communities, outputPath, communityLabelsOrOptions, figsize = [20, 14]) {
|
|
1554
2002
|
const communityMap = toNumericMap(communities);
|
|
@@ -1621,7 +2069,7 @@ function toSvg(G, communities, outputPath, communityLabelsOrOptions, figsize = [
|
|
|
1621
2069
|
}
|
|
1622
2070
|
}
|
|
1623
2071
|
svgParts.push("</svg>");
|
|
1624
|
-
(0,
|
|
2072
|
+
(0, import_node_fs3.writeFileSync)(outputPath, svgParts.join("\n"), "utf-8");
|
|
1625
2073
|
}
|
|
1626
2074
|
function toCanvas(G, communities, outputPath, communityLabelsOrOptions, nodeFilenames) {
|
|
1627
2075
|
const communityMap = toNumericMap(communities);
|
|
@@ -1630,7 +2078,7 @@ function toCanvas(G, communities, outputPath, communityLabelsOrOptions, nodeFile
|
|
|
1630
2078
|
const providedNodeFilenames = options?.nodeFilenames ?? nodeFilenames;
|
|
1631
2079
|
const CANVAS_COLORS = ["1", "2", "3", "4", "5", "6"];
|
|
1632
2080
|
function safeName(label) {
|
|
1633
|
-
return label.replace(/[\\/*?:"<>|#^[\]]/g, "").trim() || "unnamed";
|
|
2081
|
+
return label.replace(/\r\n/g, " ").replace(/\r/g, " ").replace(/\n/g, " ").replace(/[\\/*?:"<>|#^[\]]/g, "").trim() || "unnamed";
|
|
1634
2082
|
}
|
|
1635
2083
|
let filenameMap;
|
|
1636
2084
|
if (!providedNodeFilenames) {
|
|
@@ -1709,13 +2157,13 @@ function toCanvas(G, communities, outputPath, communityLabelsOrOptions, nodeFile
|
|
|
1709
2157
|
for (let idx = 0; idx < sortedCids.length; idx++) {
|
|
1710
2158
|
const cid = sortedCids[idx];
|
|
1711
2159
|
const members = communityMap.get(cid) ?? [];
|
|
1712
|
-
const
|
|
2160
|
+
const communityName2 = communityLabels?.get(cid) ?? `Community ${cid}`;
|
|
1713
2161
|
const [gx, gy, gw, gh] = groupLayout.get(cid) ?? [0, 0, 600, 400];
|
|
1714
2162
|
const canvasColor = CANVAS_COLORS[idx % CANVAS_COLORS.length];
|
|
1715
2163
|
canvasNodes.push({
|
|
1716
2164
|
id: `g${cid}`,
|
|
1717
2165
|
type: "group",
|
|
1718
|
-
label:
|
|
2166
|
+
label: communityName2,
|
|
1719
2167
|
x: gx,
|
|
1720
2168
|
y: gy,
|
|
1721
2169
|
width: gw,
|
|
@@ -1765,13 +2213,14 @@ function toCanvas(G, communities, outputPath, communityLabelsOrOptions, nodeFile
|
|
|
1765
2213
|
});
|
|
1766
2214
|
}
|
|
1767
2215
|
const canvasData = { nodes: canvasNodes, edges: canvasEdges };
|
|
1768
|
-
(0,
|
|
2216
|
+
(0, import_node_fs3.writeFileSync)(outputPath, JSON.stringify(canvasData, null, 2), "utf-8");
|
|
1769
2217
|
}
|
|
1770
|
-
var
|
|
2218
|
+
var import_node_fs3, COMMUNITY_COLORS, MAX_NODES_FOR_VIZ, CONFIDENCE_SCORE_DEFAULTS;
|
|
1771
2219
|
var init_export = __esm({
|
|
1772
2220
|
"src/export.ts"() {
|
|
1773
|
-
|
|
2221
|
+
import_node_fs3 = require("fs");
|
|
1774
2222
|
init_security();
|
|
2223
|
+
init_graph();
|
|
1775
2224
|
init_collections();
|
|
1776
2225
|
COMMUNITY_COLORS = [
|
|
1777
2226
|
"#4E79A7",
|
|
@@ -1795,8 +2244,20 @@ var init_export = __esm({
|
|
|
1795
2244
|
});
|
|
1796
2245
|
|
|
1797
2246
|
// src/cache.ts
|
|
2247
|
+
function bodyContent(content) {
|
|
2248
|
+
const text = content.toString("utf-8");
|
|
2249
|
+
if (!text.startsWith("---")) {
|
|
2250
|
+
return content;
|
|
2251
|
+
}
|
|
2252
|
+
const end = text.indexOf("\n---", 3);
|
|
2253
|
+
if (end === -1) {
|
|
2254
|
+
return content;
|
|
2255
|
+
}
|
|
2256
|
+
return Buffer.from(text.slice(end + 4), "utf-8");
|
|
2257
|
+
}
|
|
1798
2258
|
function fileHash(filePath) {
|
|
1799
|
-
const
|
|
2259
|
+
const raw = (0, import_node_fs5.readFileSync)(filePath);
|
|
2260
|
+
const content = (0, import_node_path4.extname)(filePath).toLowerCase() === ".md" ? bodyContent(raw) : raw;
|
|
1800
2261
|
const resolved = (0, import_node_path4.resolve)(filePath);
|
|
1801
2262
|
const h = (0, import_node_crypto2.createHash)("sha256");
|
|
1802
2263
|
h.update(content);
|
|
@@ -2744,10 +3205,10 @@ async function _extractGeneric(filePath, config) {
|
|
|
2744
3205
|
source: callerNid,
|
|
2745
3206
|
target: tgtNid,
|
|
2746
3207
|
relation: "calls",
|
|
2747
|
-
confidence: "
|
|
3208
|
+
confidence: "EXTRACTED",
|
|
2748
3209
|
source_file: strPath,
|
|
2749
3210
|
source_location: `L${line}`,
|
|
2750
|
-
weight:
|
|
3211
|
+
weight: 1
|
|
2751
3212
|
});
|
|
2752
3213
|
}
|
|
2753
3214
|
}
|
|
@@ -3270,10 +3731,10 @@ async function extractGo(filePath) {
|
|
|
3270
3731
|
source: callerNid,
|
|
3271
3732
|
target: tgtNid,
|
|
3272
3733
|
relation: "calls",
|
|
3273
|
-
confidence: "
|
|
3734
|
+
confidence: "EXTRACTED",
|
|
3274
3735
|
source_file: strPath,
|
|
3275
3736
|
source_location: `L${line}`,
|
|
3276
|
-
weight:
|
|
3737
|
+
weight: 1
|
|
3277
3738
|
});
|
|
3278
3739
|
}
|
|
3279
3740
|
}
|
|
@@ -3425,10 +3886,10 @@ async function extractRust(filePath) {
|
|
|
3425
3886
|
source: callerNid,
|
|
3426
3887
|
target: tgtNid,
|
|
3427
3888
|
relation: "calls",
|
|
3428
|
-
confidence: "
|
|
3889
|
+
confidence: "EXTRACTED",
|
|
3429
3890
|
source_file: strPath,
|
|
3430
3891
|
source_location: `L${line}`,
|
|
3431
|
-
weight:
|
|
3892
|
+
weight: 1
|
|
3432
3893
|
});
|
|
3433
3894
|
}
|
|
3434
3895
|
}
|
|
@@ -3587,7 +4048,7 @@ async function extractZig(filePath) {
|
|
|
3587
4048
|
const pair = `${callerNid}|${tgtNid}`;
|
|
3588
4049
|
if (!seenCallPairs.has(pair)) {
|
|
3589
4050
|
seenCallPairs.add(pair);
|
|
3590
|
-
addEdge(callerNid, tgtNid, "calls", node.startPosition.row + 1, "
|
|
4051
|
+
addEdge(callerNid, tgtNid, "calls", node.startPosition.row + 1, "EXTRACTED", 1);
|
|
3591
4052
|
}
|
|
3592
4053
|
}
|
|
3593
4054
|
}
|
|
@@ -3770,7 +4231,7 @@ async function extractPowershell(filePath) {
|
|
|
3770
4231
|
const pair = `${callerNid}|${tgtNid}`;
|
|
3771
4232
|
if (!seenCallPairs.has(pair)) {
|
|
3772
4233
|
seenCallPairs.add(pair);
|
|
3773
|
-
addEdge(callerNid, tgtNid, "calls", node.startPosition.row + 1, "
|
|
4234
|
+
addEdge(callerNid, tgtNid, "calls", node.startPosition.row + 1, "EXTRACTED", 1);
|
|
3774
4235
|
}
|
|
3775
4236
|
}
|
|
3776
4237
|
}
|
|
@@ -3976,7 +4437,7 @@ async function extractObjc(filePath) {
|
|
|
3976
4437
|
const pair = `${callerNid}|${candidate}`;
|
|
3977
4438
|
if (!seenCalls.has(pair) && callerNid !== candidate) {
|
|
3978
4439
|
seenCalls.add(pair);
|
|
3979
|
-
addEdge(callerNid, candidate, "calls", bodyNode.startPosition.row + 1, "
|
|
4440
|
+
addEdge(callerNid, candidate, "calls", bodyNode.startPosition.row + 1, "EXTRACTED", 1);
|
|
3980
4441
|
}
|
|
3981
4442
|
}
|
|
3982
4443
|
}
|
|
@@ -4167,7 +4628,7 @@ async function extractElixir(filePath) {
|
|
|
4167
4628
|
const pair = `${callerNid}|${tgtNid}`;
|
|
4168
4629
|
if (!seenCallPairs.has(pair)) {
|
|
4169
4630
|
seenCallPairs.add(pair);
|
|
4170
|
-
addEdge(callerNid, tgtNid, "calls", node.startPosition.row + 1, "
|
|
4631
|
+
addEdge(callerNid, tgtNid, "calls", node.startPosition.row + 1, "EXTRACTED", 1);
|
|
4171
4632
|
}
|
|
4172
4633
|
}
|
|
4173
4634
|
}
|
|
@@ -4696,8 +5157,10 @@ var src_exports = {};
|
|
|
4696
5157
|
__export(src_exports, {
|
|
4697
5158
|
FileType: () => FileType,
|
|
4698
5159
|
assertValid: () => assertValid,
|
|
5160
|
+
augmentDetectionWithTranscripts: () => augmentDetectionWithTranscripts,
|
|
4699
5161
|
build: () => build,
|
|
4700
5162
|
buildFromJson: () => buildFromJson,
|
|
5163
|
+
buildWhisperPrompt: () => buildWhisperPrompt,
|
|
4701
5164
|
checkSemanticCache: () => checkSemanticCache,
|
|
4702
5165
|
classifyFile: () => classifyFile,
|
|
4703
5166
|
cluster: () => cluster,
|
|
@@ -4705,6 +5168,7 @@ __export(src_exports, {
|
|
|
4705
5168
|
collectFiles: () => collectFiles,
|
|
4706
5169
|
detect: () => detect,
|
|
4707
5170
|
detectIncremental: () => detectIncremental,
|
|
5171
|
+
downloadAudio: () => downloadAudio,
|
|
4708
5172
|
extract: () => extract,
|
|
4709
5173
|
fileHash: () => fileHash,
|
|
4710
5174
|
generateReport: () => generate,
|
|
@@ -4734,23 +5198,15 @@ __export(src_exports, {
|
|
|
4734
5198
|
toJson: () => toJson,
|
|
4735
5199
|
toSvg: () => toSvg,
|
|
4736
5200
|
toWiki: () => toWiki,
|
|
5201
|
+
transcribe: () => transcribe,
|
|
5202
|
+
transcribeAll: () => transcribeAll,
|
|
4737
5203
|
validateExtraction: () => validateExtraction,
|
|
4738
5204
|
validateGraphPath: () => validateGraphPath,
|
|
4739
5205
|
validateUrl: () => validateUrl,
|
|
4740
5206
|
watch: () => watch
|
|
4741
5207
|
});
|
|
4742
5208
|
module.exports = __toCommonJS(src_exports);
|
|
4743
|
-
|
|
4744
|
-
// src/types.ts
|
|
4745
|
-
var FileType = /* @__PURE__ */ ((FileType2) => {
|
|
4746
|
-
FileType2["CODE"] = "code";
|
|
4747
|
-
FileType2["DOCUMENT"] = "document";
|
|
4748
|
-
FileType2["PAPER"] = "paper";
|
|
4749
|
-
FileType2["IMAGE"] = "image";
|
|
4750
|
-
return FileType2;
|
|
4751
|
-
})(FileType || {});
|
|
4752
|
-
|
|
4753
|
-
// src/index.ts
|
|
5209
|
+
init_types();
|
|
4754
5210
|
init_validate();
|
|
4755
5211
|
init_build();
|
|
4756
5212
|
init_cluster();
|
|
@@ -4759,23 +5215,24 @@ init_report();
|
|
|
4759
5215
|
init_export();
|
|
4760
5216
|
|
|
4761
5217
|
// src/wiki.ts
|
|
4762
|
-
var
|
|
4763
|
-
var
|
|
5218
|
+
var import_node_fs4 = require("fs");
|
|
5219
|
+
var import_node_path3 = require("path");
|
|
4764
5220
|
init_collections();
|
|
5221
|
+
init_graph();
|
|
4765
5222
|
function safeFilename(name) {
|
|
4766
|
-
return name.replace(/\//g, "-").replace(/ /g, "_").replace(/:/g, "-");
|
|
5223
|
+
return name.replace(/\r\n/g, " ").replace(/\r/g, " ").replace(/\n/g, " ").replace(/\//g, "-").replace(/ /g, "_").replace(/:/g, "-");
|
|
4767
5224
|
}
|
|
4768
5225
|
function crossCommunityLinks(G, nodes, ownCid, labels) {
|
|
4769
5226
|
const labelMap = toNumericMap(labels);
|
|
4770
5227
|
const counts = /* @__PURE__ */ new Map();
|
|
4771
5228
|
for (const nid of nodes) {
|
|
4772
|
-
G
|
|
5229
|
+
for (const neighbor of traversalNeighbors(G, nid)) {
|
|
4773
5230
|
const ncid = G.getNodeAttribute(neighbor, "community");
|
|
4774
5231
|
if (ncid !== void 0 && ncid !== ownCid) {
|
|
4775
5232
|
const label = labelMap.get(ncid) ?? `Community ${ncid}`;
|
|
4776
5233
|
counts.set(label, (counts.get(label) ?? 0) + 1);
|
|
4777
5234
|
}
|
|
4778
|
-
}
|
|
5235
|
+
}
|
|
4779
5236
|
}
|
|
4780
5237
|
return [...counts.entries()].sort((a, b) => b[1] - a[1]);
|
|
4781
5238
|
}
|
|
@@ -4839,15 +5296,15 @@ function godNodeArticle(G, nid, labels) {
|
|
|
4839
5296
|
const nodeLabel = d.label ?? nid;
|
|
4840
5297
|
const src = d.source_file ?? "";
|
|
4841
5298
|
const cid = d.community;
|
|
4842
|
-
const
|
|
5299
|
+
const communityName2 = cid !== void 0 ? labels.get(cid) ?? `Community ${cid}` : void 0;
|
|
4843
5300
|
const lines = [];
|
|
4844
5301
|
lines.push(`# ${nodeLabel}`, "");
|
|
4845
5302
|
lines.push(`> God node \xB7 ${G.degree(nid)} connections \xB7 \`${src}\``, "");
|
|
4846
|
-
if (
|
|
4847
|
-
lines.push(`**Community:** [[${
|
|
5303
|
+
if (communityName2) {
|
|
5304
|
+
lines.push(`**Community:** [[${communityName2}]]`, "");
|
|
4848
5305
|
}
|
|
4849
5306
|
const byRelation = /* @__PURE__ */ new Map();
|
|
4850
|
-
const neighbors =
|
|
5307
|
+
const neighbors = traversalNeighbors(G, nid).sort((a, b) => G.degree(b) - G.degree(a));
|
|
4851
5308
|
for (const neighbor of neighbors) {
|
|
4852
5309
|
const ed = G.getEdgeAttributes(G.edge(nid, neighbor));
|
|
4853
5310
|
const rel = ed.relation ?? "related";
|
|
@@ -4860,408 +5317,85 @@ function godNodeArticle(G, nid, labels) {
|
|
|
4860
5317
|
lines.push("## Connections by Relation", "");
|
|
4861
5318
|
for (const [rel, targets] of [...byRelation.entries()].sort()) {
|
|
4862
5319
|
lines.push(`### ${rel}`);
|
|
4863
|
-
for (const t of targets.slice(0, 20)) {
|
|
4864
|
-
lines.push(`- ${t}`);
|
|
4865
|
-
}
|
|
4866
|
-
lines.push("");
|
|
4867
|
-
}
|
|
4868
|
-
lines.push("---", "", "*Part of the graphify knowledge wiki. See [[index]] to navigate.*");
|
|
4869
|
-
return lines.join("\n");
|
|
4870
|
-
}
|
|
4871
|
-
function indexMd(communities, labels, godNodesData, totalNodes, totalEdges) {
|
|
4872
|
-
const lines = [
|
|
4873
|
-
"# Knowledge Graph Index",
|
|
4874
|
-
"",
|
|
4875
|
-
"> Auto-generated by graphify. Start here \u2014 read community articles for context, then drill into god nodes for detail.",
|
|
4876
|
-
"",
|
|
4877
|
-
`**${totalNodes} nodes \xB7 ${totalEdges} edges \xB7 ${communities.size} communities**`,
|
|
4878
|
-
"",
|
|
4879
|
-
"---",
|
|
4880
|
-
"",
|
|
4881
|
-
"## Communities",
|
|
4882
|
-
"(sorted by size, largest first)",
|
|
4883
|
-
""
|
|
4884
|
-
];
|
|
4885
|
-
const sorted = [...communities.entries()].sort((a, b) => b[1].length - a[1].length);
|
|
4886
|
-
for (const [cid, nodes] of sorted) {
|
|
4887
|
-
const label = labels.get(cid) ?? `Community ${cid}`;
|
|
4888
|
-
lines.push(`- [[${label}]] \u2014 ${nodes.length} nodes`);
|
|
4889
|
-
}
|
|
4890
|
-
lines.push("");
|
|
4891
|
-
if (godNodesData.length > 0) {
|
|
4892
|
-
lines.push("## God Nodes", "(most connected concepts \u2014 the load-bearing abstractions)", "");
|
|
4893
|
-
for (const node of godNodesData) {
|
|
4894
|
-
lines.push(`- [[${node.label}]] \u2014 ${node.edges} connections`);
|
|
4895
|
-
}
|
|
4896
|
-
lines.push("");
|
|
4897
|
-
}
|
|
4898
|
-
lines.push(
|
|
4899
|
-
"---",
|
|
4900
|
-
"",
|
|
4901
|
-
"*Generated by [graphify](https://github.com/safishamsi/graphify)*"
|
|
4902
|
-
);
|
|
4903
|
-
return lines.join("\n");
|
|
4904
|
-
}
|
|
4905
|
-
function toWiki(G, communities, outputDir, options) {
|
|
4906
|
-
const communityMap = toNumericMap(communities);
|
|
4907
|
-
(0, import_node_fs3.mkdirSync)(outputDir, { recursive: true });
|
|
4908
|
-
const labels = options?.communityLabels ? toNumericMap(options.communityLabels) : new Map([...communityMap.keys()].map((cid) => [cid, `Community ${cid}`]));
|
|
4909
|
-
const cohesion = toNumericMap(options?.cohesion);
|
|
4910
|
-
const godNodesData = options?.godNodesData ?? [];
|
|
4911
|
-
let count = 0;
|
|
4912
|
-
for (const [cid, nodes] of communityMap) {
|
|
4913
|
-
const label = labels.get(cid) ?? `Community ${cid}`;
|
|
4914
|
-
const article = communityArticle(G, cid, nodes, label, labels, cohesion.get(cid));
|
|
4915
|
-
(0, import_node_fs3.writeFileSync)((0, import_node_path2.join)(outputDir, `${safeFilename(label)}.md`), article);
|
|
4916
|
-
count++;
|
|
4917
|
-
}
|
|
4918
|
-
for (const nodeData of godNodesData) {
|
|
4919
|
-
const nid = nodeData.id;
|
|
4920
|
-
if (nid && G.hasNode(nid)) {
|
|
4921
|
-
const article = godNodeArticle(G, nid, labels);
|
|
4922
|
-
(0, import_node_fs3.writeFileSync)((0, import_node_path2.join)(outputDir, `${safeFilename(nodeData.label)}.md`), article);
|
|
4923
|
-
count++;
|
|
4924
|
-
}
|
|
4925
|
-
}
|
|
4926
|
-
(0, import_node_fs3.writeFileSync)(
|
|
4927
|
-
(0, import_node_path2.join)(outputDir, "index.md"),
|
|
4928
|
-
indexMd(communityMap, labels, godNodesData, G.order, G.size)
|
|
4929
|
-
);
|
|
4930
|
-
return count;
|
|
4931
|
-
}
|
|
4932
|
-
|
|
4933
|
-
// src/detect.ts
|
|
4934
|
-
var import_node_fs4 = require("fs");
|
|
4935
|
-
var import_node_path3 = require("path");
|
|
4936
|
-
var import_node_crypto = require("crypto");
|
|
4937
|
-
var MANIFEST_PATH = "graphify-out/manifest.json";
|
|
4938
|
-
var CODE_EXTENSIONS2 = /* @__PURE__ */ new Set([
|
|
4939
|
-
".py",
|
|
4940
|
-
".ts",
|
|
4941
|
-
".js",
|
|
4942
|
-
".jsx",
|
|
4943
|
-
".tsx",
|
|
4944
|
-
".go",
|
|
4945
|
-
".rs",
|
|
4946
|
-
".java",
|
|
4947
|
-
".cpp",
|
|
4948
|
-
".cc",
|
|
4949
|
-
".cxx",
|
|
4950
|
-
".c",
|
|
4951
|
-
".h",
|
|
4952
|
-
".hpp",
|
|
4953
|
-
".rb",
|
|
4954
|
-
".swift",
|
|
4955
|
-
".kt",
|
|
4956
|
-
".kts",
|
|
4957
|
-
".cs",
|
|
4958
|
-
".scala",
|
|
4959
|
-
".php",
|
|
4960
|
-
".lua",
|
|
4961
|
-
".toc",
|
|
4962
|
-
".zig",
|
|
4963
|
-
".ps1",
|
|
4964
|
-
".ex",
|
|
4965
|
-
".exs",
|
|
4966
|
-
".m",
|
|
4967
|
-
".mm",
|
|
4968
|
-
".jl"
|
|
4969
|
-
]);
|
|
4970
|
-
var DOC_EXTENSIONS = /* @__PURE__ */ new Set([".md", ".txt", ".rst"]);
|
|
4971
|
-
var PAPER_EXTENSIONS2 = /* @__PURE__ */ new Set([".pdf"]);
|
|
4972
|
-
var IMAGE_EXTENSIONS2 = /* @__PURE__ */ new Set([".png", ".jpg", ".jpeg", ".gif", ".webp", ".svg"]);
|
|
4973
|
-
var OFFICE_EXTENSIONS = /* @__PURE__ */ new Set([".docx", ".xlsx"]);
|
|
4974
|
-
var CORPUS_WARN_THRESHOLD = 5e4;
|
|
4975
|
-
var CORPUS_UPPER_THRESHOLD = 5e5;
|
|
4976
|
-
var FILE_COUNT_UPPER = 200;
|
|
4977
|
-
var SENSITIVE_PATTERNS = [
|
|
4978
|
-
/(^|[\\/])\.(env|envrc)(\.|$)/i,
|
|
4979
|
-
/\.(pem|key|p12|pfx|cert|crt|der|p8)$/i,
|
|
4980
|
-
/(credential|secret|passwd|password|token|private_key)/i,
|
|
4981
|
-
/(id_rsa|id_dsa|id_ecdsa|id_ed25519)(\.pub)?$/,
|
|
4982
|
-
/(\.netrc|\.pgpass|\.htpasswd)$/i,
|
|
4983
|
-
/(aws_credentials|gcloud_credentials|service.account)/i
|
|
4984
|
-
];
|
|
4985
|
-
var PAPER_SIGNALS = [
|
|
4986
|
-
/\barxiv\b/i,
|
|
4987
|
-
/\bdoi\s*:/i,
|
|
4988
|
-
/\babstract\b/i,
|
|
4989
|
-
/\bproceedings\b/i,
|
|
4990
|
-
/\bjournal\b/i,
|
|
4991
|
-
/\bpreprint\b/i,
|
|
4992
|
-
/\\cite\{/,
|
|
4993
|
-
/\[\d+\]/,
|
|
4994
|
-
/\[\n\d+\n\]/,
|
|
4995
|
-
/eq\.\s*\d+|equation\s+\d+/i,
|
|
4996
|
-
/\d{4}\.\d{4,5}/,
|
|
4997
|
-
/\bwe propose\b/i,
|
|
4998
|
-
/\bliterature\b/i
|
|
4999
|
-
];
|
|
5000
|
-
var PAPER_SIGNAL_THRESHOLD = 3;
|
|
5001
|
-
function isSensitive(filePath) {
|
|
5002
|
-
const name = (0, import_node_path3.basename)(filePath);
|
|
5003
|
-
return SENSITIVE_PATTERNS.some((p) => p.test(name) || p.test(filePath));
|
|
5004
|
-
}
|
|
5005
|
-
function looksLikePaper(filePath) {
|
|
5006
|
-
try {
|
|
5007
|
-
const text = (0, import_node_fs4.readFileSync)(filePath, "utf-8").slice(0, 3e3);
|
|
5008
|
-
const hits = PAPER_SIGNALS.filter((p) => p.test(text)).length;
|
|
5009
|
-
return hits >= PAPER_SIGNAL_THRESHOLD;
|
|
5010
|
-
} catch {
|
|
5011
|
-
return false;
|
|
5012
|
-
}
|
|
5013
|
-
}
|
|
5014
|
-
var ASSET_DIR_MARKERS = /* @__PURE__ */ new Set([".imageset", ".xcassets", ".appiconset", ".colorset", ".launchimage"]);
|
|
5015
|
-
function classifyFile(filePath) {
|
|
5016
|
-
const ext = (0, import_node_path3.extname)(filePath).toLowerCase();
|
|
5017
|
-
if (CODE_EXTENSIONS2.has(ext)) return "code" /* CODE */;
|
|
5018
|
-
if (PAPER_EXTENSIONS2.has(ext)) {
|
|
5019
|
-
const parts = filePath.split(import_node_path3.sep);
|
|
5020
|
-
if (parts.some((p) => [...ASSET_DIR_MARKERS].some((m) => p.endsWith(m)))) return null;
|
|
5021
|
-
return "paper" /* PAPER */;
|
|
5022
|
-
}
|
|
5023
|
-
if (IMAGE_EXTENSIONS2.has(ext)) return "image" /* IMAGE */;
|
|
5024
|
-
if (DOC_EXTENSIONS.has(ext)) {
|
|
5025
|
-
if (looksLikePaper(filePath)) return "paper" /* PAPER */;
|
|
5026
|
-
return "document" /* DOCUMENT */;
|
|
5027
|
-
}
|
|
5028
|
-
if (OFFICE_EXTENSIONS.has(ext)) return "document" /* DOCUMENT */;
|
|
5029
|
-
return null;
|
|
5030
|
-
}
|
|
5031
|
-
function countWords(filePath) {
|
|
5032
|
-
try {
|
|
5033
|
-
const text = (0, import_node_fs4.readFileSync)(filePath, "utf-8");
|
|
5034
|
-
return text.split(/\s+/).filter(Boolean).length;
|
|
5035
|
-
} catch {
|
|
5036
|
-
return 0;
|
|
5037
|
-
}
|
|
5038
|
-
}
|
|
5039
|
-
var SKIP_DIRS = /* @__PURE__ */ new Set([
|
|
5040
|
-
"venv",
|
|
5041
|
-
".venv",
|
|
5042
|
-
"env",
|
|
5043
|
-
".env",
|
|
5044
|
-
"node_modules",
|
|
5045
|
-
"__pycache__",
|
|
5046
|
-
".git",
|
|
5047
|
-
"dist",
|
|
5048
|
-
"build",
|
|
5049
|
-
"target",
|
|
5050
|
-
"out",
|
|
5051
|
-
"site-packages",
|
|
5052
|
-
"lib64",
|
|
5053
|
-
".pytest_cache",
|
|
5054
|
-
".mypy_cache",
|
|
5055
|
-
".ruff_cache",
|
|
5056
|
-
".tox",
|
|
5057
|
-
".eggs"
|
|
5058
|
-
]);
|
|
5059
|
-
function isNoiseDir(part) {
|
|
5060
|
-
if (SKIP_DIRS.has(part)) return true;
|
|
5061
|
-
if (part.endsWith("_venv") || part.endsWith("_env")) return true;
|
|
5062
|
-
if (part.endsWith(".egg-info")) return true;
|
|
5063
|
-
return false;
|
|
5064
|
-
}
|
|
5065
|
-
function loadGraphifyignore(root) {
|
|
5066
|
-
const ignoreFile = (0, import_node_path3.join)(root, ".graphifyignore");
|
|
5067
|
-
if (!(0, import_node_fs4.existsSync)(ignoreFile)) return [];
|
|
5068
|
-
const patterns = [];
|
|
5069
|
-
for (let line of (0, import_node_fs4.readFileSync)(ignoreFile, "utf-8").split("\n")) {
|
|
5070
|
-
line = line.trim();
|
|
5071
|
-
if (line && !line.startsWith("#")) {
|
|
5072
|
-
patterns.push(line);
|
|
5073
|
-
}
|
|
5074
|
-
}
|
|
5075
|
-
return patterns;
|
|
5076
|
-
}
|
|
5077
|
-
function matchGlob(text, pattern) {
|
|
5078
|
-
const regex = pattern.replace(/[.+^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*").replace(/\?/g, ".");
|
|
5079
|
-
return new RegExp(`^${regex}$`).test(text);
|
|
5080
|
-
}
|
|
5081
|
-
function isIgnored(filePath, root, patterns) {
|
|
5082
|
-
if (patterns.length === 0) return false;
|
|
5083
|
-
let rel;
|
|
5084
|
-
try {
|
|
5085
|
-
rel = (0, import_node_path3.relative)(root, filePath).replace(/\\/g, "/");
|
|
5086
|
-
} catch {
|
|
5087
|
-
return false;
|
|
5088
|
-
}
|
|
5089
|
-
const parts = rel.split("/");
|
|
5090
|
-
for (const pattern of patterns) {
|
|
5091
|
-
const p = pattern.replace(/^\/+|\/+$/g, "");
|
|
5092
|
-
if (!p) continue;
|
|
5093
|
-
if (matchGlob(rel, p)) return true;
|
|
5094
|
-
if (matchGlob((0, import_node_path3.basename)(filePath), p)) return true;
|
|
5095
|
-
for (let i = 0; i < parts.length; i++) {
|
|
5096
|
-
if (matchGlob(parts[i], p)) return true;
|
|
5097
|
-
if (matchGlob(parts.slice(0, i + 1).join("/"), p)) return true;
|
|
5098
|
-
}
|
|
5099
|
-
}
|
|
5100
|
-
return false;
|
|
5101
|
-
}
|
|
5102
|
-
function walkDir(dir, root, ignorePatterns, followSymlinks, skipPrune) {
|
|
5103
|
-
const result = [];
|
|
5104
|
-
let entries;
|
|
5105
|
-
try {
|
|
5106
|
-
entries = (0, import_node_fs4.readdirSync)(dir);
|
|
5107
|
-
} catch {
|
|
5108
|
-
return result;
|
|
5109
|
-
}
|
|
5110
|
-
for (const entry of entries) {
|
|
5111
|
-
const full = (0, import_node_path3.join)(dir, entry);
|
|
5112
|
-
let stat;
|
|
5113
|
-
try {
|
|
5114
|
-
stat = followSymlinks ? (0, import_node_fs4.statSync)(full) : (0, import_node_fs4.lstatSync)(full);
|
|
5115
|
-
} catch {
|
|
5116
|
-
continue;
|
|
5117
|
-
}
|
|
5118
|
-
if (stat.isDirectory()) {
|
|
5119
|
-
if (!skipPrune) {
|
|
5120
|
-
if (entry.startsWith(".")) continue;
|
|
5121
|
-
if (isNoiseDir(entry)) continue;
|
|
5122
|
-
if (isIgnored(full, root, ignorePatterns)) continue;
|
|
5123
|
-
}
|
|
5124
|
-
result.push(...walkDir(full, root, ignorePatterns, followSymlinks, skipPrune));
|
|
5125
|
-
} else if (stat.isFile()) {
|
|
5126
|
-
result.push(full);
|
|
5127
|
-
}
|
|
5128
|
-
}
|
|
5129
|
-
return result;
|
|
5130
|
-
}
|
|
5131
|
-
function detect(root, options) {
|
|
5132
|
-
const followSymlinks = options?.followSymlinks ?? false;
|
|
5133
|
-
const rootResolved = (0, import_node_path3.resolve)(root);
|
|
5134
|
-
const ignorePatterns = loadGraphifyignore(rootResolved);
|
|
5135
|
-
const convertedDir = (0, import_node_path3.join)(rootResolved, "graphify-out", "converted");
|
|
5136
|
-
const memoryDir = (0, import_node_path3.join)(rootResolved, "graphify-out", "memory");
|
|
5137
|
-
const files = {
|
|
5138
|
-
code: [],
|
|
5139
|
-
document: [],
|
|
5140
|
-
paper: [],
|
|
5141
|
-
image: []
|
|
5142
|
-
};
|
|
5143
|
-
let totalWords = 0;
|
|
5144
|
-
const skippedSensitive = [];
|
|
5145
|
-
const allFiles = walkDir(rootResolved, rootResolved, ignorePatterns, followSymlinks, false);
|
|
5146
|
-
if ((0, import_node_fs4.existsSync)(memoryDir)) {
|
|
5147
|
-
allFiles.push(...walkDir(memoryDir, rootResolved, ignorePatterns, followSymlinks, true));
|
|
5148
|
-
}
|
|
5149
|
-
const seen = /* @__PURE__ */ new Set();
|
|
5150
|
-
for (const p of allFiles) {
|
|
5151
|
-
if (seen.has(p)) continue;
|
|
5152
|
-
seen.add(p);
|
|
5153
|
-
const inMemory = (0, import_node_fs4.existsSync)(memoryDir) && p.startsWith(memoryDir);
|
|
5154
|
-
if (!inMemory) {
|
|
5155
|
-
if ((0, import_node_path3.basename)(p).startsWith(".")) continue;
|
|
5156
|
-
if (p.startsWith(convertedDir)) continue;
|
|
5157
|
-
}
|
|
5158
|
-
if (isIgnored(p, rootResolved, ignorePatterns)) continue;
|
|
5159
|
-
if (isSensitive(p)) {
|
|
5160
|
-
skippedSensitive.push(p);
|
|
5161
|
-
continue;
|
|
5162
|
-
}
|
|
5163
|
-
const ftype = classifyFile(p);
|
|
5164
|
-
if (!ftype) continue;
|
|
5165
|
-
if (OFFICE_EXTENSIONS.has((0, import_node_path3.extname)(p).toLowerCase())) {
|
|
5166
|
-
skippedSensitive.push(p + " [office conversion requires async - use pipeline]");
|
|
5167
|
-
continue;
|
|
5168
|
-
}
|
|
5169
|
-
files[ftype].push(p);
|
|
5170
|
-
totalWords += countWords(p);
|
|
5171
|
-
}
|
|
5172
|
-
const totalFiles = Object.values(files).reduce((s, v) => s + v.length, 0);
|
|
5173
|
-
const needsGraph = totalWords >= CORPUS_WARN_THRESHOLD;
|
|
5174
|
-
let warning = null;
|
|
5175
|
-
if (!needsGraph) {
|
|
5176
|
-
warning = `Corpus is ~${totalWords.toLocaleString()} words - fits in a single context window. You may not need a graph.`;
|
|
5177
|
-
} else if (totalWords >= CORPUS_UPPER_THRESHOLD || totalFiles >= FILE_COUNT_UPPER) {
|
|
5178
|
-
warning = `Large corpus: ${totalFiles} files \xB7 ~${totalWords.toLocaleString()} words. Semantic extraction will be expensive (many Claude tokens). Consider running on a subfolder, or use --no-semantic to run AST-only.`;
|
|
5320
|
+
for (const t of targets.slice(0, 20)) {
|
|
5321
|
+
lines.push(`- ${t}`);
|
|
5322
|
+
}
|
|
5323
|
+
lines.push("");
|
|
5179
5324
|
}
|
|
5180
|
-
|
|
5181
|
-
|
|
5182
|
-
total_files: totalFiles,
|
|
5183
|
-
total_words: totalWords,
|
|
5184
|
-
needs_graph: needsGraph,
|
|
5185
|
-
warning,
|
|
5186
|
-
skipped_sensitive: skippedSensitive,
|
|
5187
|
-
graphifyignore_patterns: ignorePatterns.length
|
|
5188
|
-
};
|
|
5325
|
+
lines.push("---", "", "*Part of the graphify knowledge wiki. See [[index]] to navigate.*");
|
|
5326
|
+
return lines.join("\n");
|
|
5189
5327
|
}
|
|
5190
|
-
function
|
|
5191
|
-
|
|
5192
|
-
|
|
5193
|
-
|
|
5194
|
-
|
|
5328
|
+
function indexMd(communities, labels, godNodesData, totalNodes, totalEdges) {
|
|
5329
|
+
const lines = [
|
|
5330
|
+
"# Knowledge Graph Index",
|
|
5331
|
+
"",
|
|
5332
|
+
"> Auto-generated by graphify. Start here \u2014 read community articles for context, then drill into god nodes for detail.",
|
|
5333
|
+
"",
|
|
5334
|
+
`**${totalNodes} nodes \xB7 ${totalEdges} edges \xB7 ${communities.size} communities**`,
|
|
5335
|
+
"",
|
|
5336
|
+
"---",
|
|
5337
|
+
"",
|
|
5338
|
+
"## Communities",
|
|
5339
|
+
"(sorted by size, largest first)",
|
|
5340
|
+
""
|
|
5341
|
+
];
|
|
5342
|
+
const sorted = [...communities.entries()].sort((a, b) => b[1].length - a[1].length);
|
|
5343
|
+
for (const [cid, nodes] of sorted) {
|
|
5344
|
+
const label = labels.get(cid) ?? `Community ${cid}`;
|
|
5345
|
+
lines.push(`- [[${label}]] \u2014 ${nodes.length} nodes`);
|
|
5195
5346
|
}
|
|
5196
|
-
|
|
5197
|
-
|
|
5198
|
-
|
|
5199
|
-
|
|
5200
|
-
|
|
5201
|
-
try {
|
|
5202
|
-
manifest[f] = (0, import_node_fs4.statSync)(f).mtimeMs;
|
|
5203
|
-
} catch {
|
|
5204
|
-
}
|
|
5347
|
+
lines.push("");
|
|
5348
|
+
if (godNodesData.length > 0) {
|
|
5349
|
+
lines.push("## God Nodes", "(most connected concepts \u2014 the load-bearing abstractions)", "");
|
|
5350
|
+
for (const node of godNodesData) {
|
|
5351
|
+
lines.push(`- [[${node.label}]] \u2014 ${node.edges} connections`);
|
|
5205
5352
|
}
|
|
5353
|
+
lines.push("");
|
|
5206
5354
|
}
|
|
5207
|
-
|
|
5208
|
-
|
|
5209
|
-
|
|
5355
|
+
lines.push(
|
|
5356
|
+
"---",
|
|
5357
|
+
"",
|
|
5358
|
+
"*Generated by [graphify](https://github.com/safishamsi/graphify)*"
|
|
5359
|
+
);
|
|
5360
|
+
return lines.join("\n");
|
|
5210
5361
|
}
|
|
5211
|
-
function
|
|
5212
|
-
const
|
|
5213
|
-
|
|
5214
|
-
|
|
5215
|
-
|
|
5216
|
-
|
|
5217
|
-
|
|
5218
|
-
|
|
5219
|
-
|
|
5220
|
-
|
|
5221
|
-
};
|
|
5222
|
-
|
|
5223
|
-
const newFiles = {};
|
|
5224
|
-
const unchangedFiles = {};
|
|
5225
|
-
for (const k of Object.keys(full.files)) {
|
|
5226
|
-
newFiles[k] = [];
|
|
5227
|
-
unchangedFiles[k] = [];
|
|
5362
|
+
function toWiki(G, communities, outputDir, options) {
|
|
5363
|
+
const communityMap = toNumericMap(communities);
|
|
5364
|
+
(0, import_node_fs4.mkdirSync)(outputDir, { recursive: true });
|
|
5365
|
+
const labels = options?.communityLabels ? toNumericMap(options.communityLabels) : new Map([...communityMap.keys()].map((cid) => [cid, `Community ${cid}`]));
|
|
5366
|
+
const cohesion = toNumericMap(options?.cohesion);
|
|
5367
|
+
const godNodesData = options?.godNodesData ?? [];
|
|
5368
|
+
let count = 0;
|
|
5369
|
+
for (const [cid, nodes] of communityMap) {
|
|
5370
|
+
const label = labels.get(cid) ?? `Community ${cid}`;
|
|
5371
|
+
const article = communityArticle(G, cid, nodes, label, labels, cohesion.get(cid));
|
|
5372
|
+
(0, import_node_fs4.writeFileSync)((0, import_node_path3.join)(outputDir, `${safeFilename(label)}.md`), article);
|
|
5373
|
+
count++;
|
|
5228
5374
|
}
|
|
5229
|
-
for (const
|
|
5230
|
-
|
|
5231
|
-
|
|
5232
|
-
|
|
5233
|
-
|
|
5234
|
-
|
|
5235
|
-
} catch {
|
|
5236
|
-
}
|
|
5237
|
-
if (storedMtime === void 0 || currentMtime > storedMtime) {
|
|
5238
|
-
newFiles[ftype].push(f);
|
|
5239
|
-
} else {
|
|
5240
|
-
unchangedFiles[ftype].push(f);
|
|
5241
|
-
}
|
|
5375
|
+
for (const nodeData of godNodesData) {
|
|
5376
|
+
const nid = nodeData.id;
|
|
5377
|
+
if (nid && G.hasNode(nid)) {
|
|
5378
|
+
const article = godNodeArticle(G, nid, labels);
|
|
5379
|
+
(0, import_node_fs4.writeFileSync)((0, import_node_path3.join)(outputDir, `${safeFilename(nodeData.label)}.md`), article);
|
|
5380
|
+
count++;
|
|
5242
5381
|
}
|
|
5243
5382
|
}
|
|
5244
|
-
|
|
5245
|
-
|
|
5246
|
-
|
|
5247
|
-
|
|
5248
|
-
|
|
5249
|
-
incremental: true,
|
|
5250
|
-
new_files: newFiles,
|
|
5251
|
-
unchanged_files: unchangedFiles,
|
|
5252
|
-
new_total: newTotal,
|
|
5253
|
-
deleted_files: deletedFiles
|
|
5254
|
-
};
|
|
5383
|
+
(0, import_node_fs4.writeFileSync)(
|
|
5384
|
+
(0, import_node_path3.join)(outputDir, "index.md"),
|
|
5385
|
+
indexMd(communityMap, labels, godNodesData, G.order, G.size)
|
|
5386
|
+
);
|
|
5387
|
+
return count;
|
|
5255
5388
|
}
|
|
5256
5389
|
|
|
5257
5390
|
// src/index.ts
|
|
5391
|
+
init_detect();
|
|
5258
5392
|
init_extract();
|
|
5259
5393
|
init_cache();
|
|
5260
5394
|
init_security();
|
|
5261
5395
|
|
|
5262
5396
|
// src/benchmark.ts
|
|
5263
5397
|
var import_node_fs7 = require("fs");
|
|
5264
|
-
|
|
5398
|
+
init_graph();
|
|
5265
5399
|
var CHARS_PER_TOKEN = 4;
|
|
5266
5400
|
function estimateTokens(text) {
|
|
5267
5401
|
return Math.max(1, Math.floor(text.length / CHARS_PER_TOKEN));
|
|
@@ -5283,7 +5417,7 @@ function querySubgraphTokens(G, question, depth = 3) {
|
|
|
5283
5417
|
for (let d = 0; d < depth; d++) {
|
|
5284
5418
|
const nextFrontier = /* @__PURE__ */ new Set();
|
|
5285
5419
|
for (const n of frontier) {
|
|
5286
|
-
G
|
|
5420
|
+
forEachTraversalNeighbor(G, n, (neighbor) => {
|
|
5287
5421
|
if (!visited.has(neighbor)) {
|
|
5288
5422
|
nextFrontier.add(neighbor);
|
|
5289
5423
|
edgesSeen.push([n, neighbor]);
|
|
@@ -5318,21 +5452,7 @@ var SAMPLE_QUESTIONS = [
|
|
|
5318
5452
|
];
|
|
5319
5453
|
function loadGraph(graphPath) {
|
|
5320
5454
|
const raw = JSON.parse((0, import_node_fs7.readFileSync)(graphPath, "utf-8"));
|
|
5321
|
-
|
|
5322
|
-
for (const node of raw.nodes ?? []) {
|
|
5323
|
-
const { id, ...attrs } = node;
|
|
5324
|
-
G.mergeNode(id, attrs);
|
|
5325
|
-
}
|
|
5326
|
-
for (const link of raw.links ?? []) {
|
|
5327
|
-
const { source, target, ...attrs } = link;
|
|
5328
|
-
if (G.hasNode(source) && G.hasNode(target)) {
|
|
5329
|
-
try {
|
|
5330
|
-
G.mergeEdge(source, target, attrs);
|
|
5331
|
-
} catch {
|
|
5332
|
-
}
|
|
5333
|
-
}
|
|
5334
|
-
}
|
|
5335
|
-
return G;
|
|
5455
|
+
return loadGraphFromData(raw);
|
|
5336
5456
|
}
|
|
5337
5457
|
function runBenchmark(graphPath = "graphify-out/graph.json", corpusWordsOrOptions, questions) {
|
|
5338
5458
|
const options = typeof corpusWordsOrOptions === "number" ? { corpusWords: corpusWordsOrOptions, questions } : corpusWordsOrOptions ?? {};
|
|
@@ -5395,9 +5515,395 @@ graphify token reduction benchmark`);
|
|
|
5395
5515
|
}
|
|
5396
5516
|
|
|
5397
5517
|
// src/ingest.ts
|
|
5518
|
+
var import_node_fs9 = require("fs");
|
|
5519
|
+
var import_node_path7 = require("path");
|
|
5520
|
+
init_security();
|
|
5521
|
+
|
|
5522
|
+
// src/transcribe.ts
|
|
5523
|
+
var childProcess = __toESM(require("child_process"), 1);
|
|
5524
|
+
var import_node_crypto3 = require("crypto");
|
|
5398
5525
|
var import_node_fs8 = require("fs");
|
|
5526
|
+
var import_node_os = require("os");
|
|
5399
5527
|
var import_node_path6 = require("path");
|
|
5400
|
-
|
|
5528
|
+
var import_node_stream = require("stream");
|
|
5529
|
+
var import_promises = require("stream/promises");
|
|
5530
|
+
// Prefixes that mark a transcription input as a remote URL rather than a local path.
var URL_PREFIXES = ["http://", "https://", "www."];
// Extensions probed, in order, when looking for previously downloaded audio.
var CACHED_AUDIO_EXTENSIONS = [".m4a", ".opus", ".mp3", ".ogg", ".wav", ".webm"];
// Whisper model used when neither the caller nor GRAPHIFY_WHISPER_MODEL names one.
var DEFAULT_MODEL = "base";
// Default output directory for generated transcripts.
var TRANSCRIPTS_DIR = "graphify-out/transcripts";
// Prompt used when no override or graph-derived prompt is available.
var FALLBACK_PROMPT = "Use proper punctuation and paragraph breaks.";
// Base URL of the sherpa-onnx release that hosts the Whisper model archives.
var SHERPA_RELEASE_BASE = "https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models";
// Sample rate (Hz) audio is converted to before recognition.
var AUDIO_SAMPLE_RATE = 16e3;
// Model names accepted after alias resolution.
var SUPPORTED_MODELS = /* @__PURE__ */ new Set([
  "tiny", "tiny.en",
  "base", "base.en",
  "small", "small.en",
  "medium", "medium.en",
  "large-v1", "large-v2", "large-v3",
  "turbo",
  "distil-small.en", "distil-medium.en",
  "distil-large-v2", "distil-large-v3", "distil-large-v3.5"
]);
// Short names mapped onto a concrete supported model.
var MODEL_ALIASES = { large: "large-v3" };
// Model directory -> promise of a recognizer built for that model.
var recognizerCache = /* @__PURE__ */ new Map();
// Memoized import of the optional sherpa-onnx-node module (null until first use).
var sherpaModulePromise = null;
|
|
5561
|
+
/**
 * Run an external command synchronously and return the `spawnSync` result.
 *
 * Output is decoded as UTF-8 unless `options` overrides `encoding`.
 *
 * @param {string} command - Executable to run.
 * @param {string[]} args - Arguments passed verbatim (no shell involved).
 * @param {object} [options] - Extra `child_process.spawnSync` options.
 * @returns {object} The `spawnSync` result for a process that exited with status 0.
 * @throws {Error} The spawn error itself (e.g. `code === "ENOENT"`) when the
 *   process could not be started, or an Error carrying stderr/stdout when the
 *   process exited unsuccessfully.
 */
function runCommand(command, args, options) {
  const result = childProcess.spawnSync(command, args, {
    encoding: "utf-8",
    ...options
  });
  if (result.error) {
    throw result.error;
  }
  if (result.status !== 0) {
    // status is null when the child was killed by a signal; report whichever applies
    // so a silent failure is still diagnosable.
    const outcome = result.signal ? `terminated by signal ${result.signal}` : `exit code ${result.status}`;
    throw new Error(result.stderr?.trim() || result.stdout?.trim() || `${command} failed (${outcome})`);
  }
  return result;
}
|
|
5574
|
+
/**
 * Resolve the directory where downloaded Whisper models are cached.
 *
 * Precedence: `GRAPHIFY_WHISPER_CACHE_DIR` (resolved to an absolute path), then
 * the platform cache root (`LOCALAPPDATA` on win32, `XDG_CACHE_HOME` elsewhere)
 * with `graphify/whisper` appended, then a default under the home directory.
 *
 * Empty environment values are treated as unset: joining an empty string
 * would otherwise produce a path relative to the current working directory.
 *
 * @returns {string} Cache directory path (not created here).
 */
function defaultWhisperCacheDir() {
  const explicit = process.env.GRAPHIFY_WHISPER_CACHE_DIR;
  if (explicit) {
    return (0, import_node_path6.resolve)(explicit);
  }
  if ((0, import_node_os.platform)() === "win32") {
    const localAppData = process.env.LOCALAPPDATA || (0, import_node_path6.join)((0, import_node_os.homedir)(), "AppData", "Local");
    return (0, import_node_path6.join)(localAppData, "graphify", "whisper");
  }
  const cacheHome = process.env.XDG_CACHE_HOME || (0, import_node_path6.join)((0, import_node_os.homedir)(), ".cache");
  return (0, import_node_path6.join)(cacheHome, "graphify", "whisper");
}
|
|
5587
|
+
/**
 * Name or path of the ffmpeg executable.
 *
 * `GRAPHIFY_FFMPEG_BIN` overrides the default; an empty value is treated as
 * unset so it cannot turn into an attempt to spawn the empty command "".
 *
 * @returns {string} Executable to pass to the process spawner.
 */
function ffmpegBinary() {
  return process.env.GRAPHIFY_FFMPEG_BIN || "ffmpeg";
}
|
|
5590
|
+
/**
 * Name or path of the tar executable used to unpack model archives.
 *
 * `GRAPHIFY_TAR_BIN` overrides the default; an empty value is treated as
 * unset so it cannot turn into an attempt to spawn the empty command "".
 *
 * @returns {string} Executable to pass to the process spawner.
 */
function tarBinary() {
  return process.env.GRAPHIFY_TAR_BIN || "tar";
}
|
|
5593
|
+
/**
 * Decide which Whisper model to use and validate it.
 *
 * The name comes from `modelName`, then `GRAPHIFY_WHISPER_MODEL`, then
 * `DEFAULT_MODEL`. Empty strings are treated as "not provided" so that an
 * empty environment value falls back to the default instead of being
 * rejected as an unsupported model called "".
 *
 * @param {string} [modelName] - Explicit model name or alias.
 * @returns {{requested: string, resolved: string}} The name as requested and
 *   the concrete model after alias expansion.
 * @throws {Error} When the resolved name is not in `SUPPORTED_MODELS`. The
 *   message keeps its "Unsupported GRAPHIFY_WHISPER_MODEL" prefix because
 *   `transcribe` matches on it.
 */
function resolveRequestedModel(modelName) {
  const requested = modelName || process.env.GRAPHIFY_WHISPER_MODEL || DEFAULT_MODEL;
  // Own-property check: a name such as "constructor" must not resolve through
  // Object.prototype.
  const resolved = Object.hasOwn(MODEL_ALIASES, requested) ? MODEL_ALIASES[requested] : requested;
  if (!SUPPORTED_MODELS.has(resolved)) {
    throw new Error(
      `Unsupported GRAPHIFY_WHISPER_MODEL "${requested}". Supported local TS models: ${[...SUPPORTED_MODELS].sort().join(", ")}`
    );
  }
  return { requested, resolved };
}
|
|
5603
|
+
/**
 * Recursively list every non-directory entry beneath `dir`.
 *
 * @param {string} dir - Directory to walk.
 * @returns {string[]} Paths (each built by joining onto `dir`) in directory
 *   listing order, depth first; empty when `dir` does not exist.
 */
function walkFiles(dir) {
  if (!import_node_fs8.existsSync(dir)) {
    return [];
  }
  const entries = import_node_fs8.readdirSync(dir, { withFileTypes: true });
  return entries.flatMap((entry) => {
    const entryPath = import_node_path6.join(dir, entry.name);
    return entry.isDirectory() ? walkFiles(entryPath) : [entryPath];
  });
}
|
|
5616
|
+
/**
 * Locate the Whisper model files (encoder, decoder, tokens) under `dir`.
 *
 * For encoder and decoder the `.int8.onnx` variant wins over the plain
 * `.onnx` one when both are present.
 *
 * @param {string} dir - Directory searched recursively.
 * @returns {{modelDir: string, encoderPath: string, decoderPath: string, tokensPath: string} | null}
 *   The artifact paths, or null when any of the three is missing.
 */
function findArtifactsIn(dir) {
  const candidates = walkFiles(dir);
  // First file matching the earliest suffix that has any match at all.
  const firstWithSuffix = (...suffixes) => {
    for (const suffix of suffixes) {
      const match = candidates.find((file) => file.endsWith(suffix));
      if (match !== void 0) {
        return match;
      }
    }
    return void 0;
  };
  const encoderPath = firstWithSuffix("-encoder.int8.onnx", "-encoder.onnx");
  const decoderPath = firstWithSuffix("-decoder.int8.onnx", "-decoder.onnx");
  const tokensPath = firstWithSuffix("-tokens.txt");
  const complete = Boolean(encoderPath && decoderPath && tokensPath);
  return complete ? { modelDir: dir, encoderPath, decoderPath, tokensPath } : null;
}
|
|
5631
|
+
/**
 * Add a hint to model-download errors caused by a missing release asset.
 *
 * Only an explicit "HTTP 404" status is treated as "asset not found". A bare
 * substring check for "404" would also fire on unrelated text, for example a
 * temporary directory name that happens to contain those digits.
 *
 * @param {string} detail - Original error message.
 * @returns {string} `detail`, extended with the hint when it reports HTTP 404.
 */
function normalizeModelError(detail) {
  if (/\bHTTP 404\b/.test(detail)) {
    return `${detail}. The local sherpa-onnx release asset was not found for this Whisper model name.`;
  }
  return detail;
}
|
|
5637
|
+
/**
 * Stream a fetch `Response` body into a file.
 *
 * @param {Response} response - Response whose body is written out.
 * @param {string} destination - File path to create or overwrite.
 * @returns {Promise<void>} Settles once the stream pipeline has finished.
 * @throws {Error} When the response is not OK or has no body.
 */
async function writeResponseToFile(response, destination) {
  const { ok, body } = response;
  if (ok && body) {
    const source = import_node_stream.Readable.fromWeb(body);
    const sink = import_node_fs8.createWriteStream(destination);
    await import_promises.pipeline(source, sink);
    return;
  }
  throw new Error(`HTTP ${response.status} while downloading ${response.url}`);
}
|
|
5643
|
+
/**
 * Make sure the model files for a Whisper model exist in the local cache,
 * downloading and unpacking the sherpa-onnx release archive when they do not.
 *
 * @param {string} [modelName] - Model name or alias; see `resolveRequestedModel`.
 * @returns {Promise<{requestedModel: string, resolvedModel: string, modelDir: string, encoderPath: string, decoderPath: string, tokensPath: string}>}
 * @throws {Error} For an unsupported model name (thrown unwrapped), or when the
 *   download, extraction, or cache install fails. Wrapped errors keep the
 *   original as `cause`.
 */
async function ensureWhisperArtifacts(modelName) {
  const { requested, resolved } = resolveRequestedModel(modelName);
  const cacheRoot = defaultWhisperCacheDir();
  (0, import_node_fs8.mkdirSync)(cacheRoot, { recursive: true });
  const modelDir = (0, import_node_path6.join)(cacheRoot, `sherpa-onnx-whisper-${resolved}`);
  const cached = findArtifactsIn(modelDir);
  if (cached) {
    return { requestedModel: requested, resolvedModel: resolved, ...cached };
  }
  const tempDir = (0, import_node_fs8.mkdtempSync)((0, import_node_path6.join)((0, import_node_os.tmpdir)(), "graphify-whisper-model-"));
  const extractDir = (0, import_node_path6.join)(tempDir, "extract");
  const archiveName = `sherpa-onnx-whisper-${resolved}.tar.bz2`;
  const archivePath = (0, import_node_path6.join)(tempDir, archiveName);
  // Sibling of modelDir (same filesystem) used only when a direct rename is impossible.
  const stagingDir = `${modelDir}.partial-${process.pid}`;
  try {
    // Created inside the try so tempDir is removed even if this fails.
    (0, import_node_fs8.mkdirSync)(extractDir, { recursive: true });
    const url = `${SHERPA_RELEASE_BASE}/${archiveName}`;
    console.log(` downloading whisper model: ${resolved}`);
    const response = await fetch(url);
    await writeResponseToFile(response, archivePath);
    runCommand(tarBinary(), ["-xjf", archivePath, "-C", extractDir]);
    // Prefer the innermost directory that holds the artifacts; otherwise fall
    // back to any top-level extracted directory that contains them somewhere.
    const extractedRoot = walkFiles(extractDir).map((path) => (0, import_node_path6.dirname)(path)).find((path) => findArtifactsIn(path) !== null);
    const sourceDir = extractedRoot ?? (0, import_node_fs8.readdirSync)(extractDir, { withFileTypes: true }).filter((entry) => entry.isDirectory()).map((entry) => (0, import_node_path6.join)(extractDir, entry.name)).find((path) => findArtifactsIn(path) !== null);
    if (!sourceDir) {
      throw new Error(`Downloaded archive for ${resolved} but could not locate Whisper model files`);
    }
    if ((0, import_node_fs8.existsSync)(modelDir)) {
      (0, import_node_fs8.rmSync)(modelDir, { recursive: true, force: true });
    }
    try {
      (0, import_node_fs8.renameSync)(sourceDir, modelDir);
    } catch {
      // Rename fails across filesystems (tmp vs cache). Copy into a staging
      // directory first and rename that into place, so an interrupted copy can
      // never leave a half-written modelDir that later passes the cache check.
      (0, import_node_fs8.rmSync)(stagingDir, { recursive: true, force: true });
      (0, import_node_fs8.cpSync)(sourceDir, stagingDir, { recursive: true });
      (0, import_node_fs8.renameSync)(stagingDir, modelDir);
    }
    const artifacts = findArtifactsIn(modelDir);
    if (!artifacts) {
      throw new Error(`Model cache for ${resolved} is incomplete after extraction`);
    }
    return { requestedModel: requested, resolvedModel: resolved, ...artifacts };
  } catch (error) {
    const detail = error instanceof Error ? error.message : String(error);
    throw new Error(normalizeModelError(detail), { cause: error });
  } finally {
    (0, import_node_fs8.rmSync)(tempDir, { recursive: true, force: true });
    // No-op after a successful install; removes leftovers after a failed copy.
    (0, import_node_fs8.rmSync)(stagingDir, { recursive: true, force: true });
  }
}
|
|
5688
|
+
/**
 * Import the optional `sherpa-onnx-node` module once and reuse the result.
 *
 * The in-flight promise is stored in `sherpaModulePromise`; a failed import
 * clears it so a later call can retry.
 *
 * @returns {Promise<object>} The module's default export when it has one,
 *   otherwise the module namespace.
 * @throws {Error} When the module cannot be imported.
 */
async function loadSherpaModule() {
  if (sherpaModulePromise) {
    return sherpaModulePromise;
  }
  const pending = (async () => {
    try {
      const imported = await import("sherpa-onnx-node");
      return Reflect.has(imported, "default") ? Reflect.get(imported, "default") : imported;
    } catch (error) {
      sherpaModulePromise = null;
      const detail = error instanceof Error ? error.message : String(error);
      throw new Error(
        `Video transcription requires the optional dependency sherpa-onnx-node. Install it locally, then retry. ${detail}`
      );
    }
  })();
  sherpaModulePromise = pending;
  return pending;
}
|
|
5700
|
+
/**
 * Return an offline recognizer for the given model, building it on first use.
 *
 * Recognizer promises are cached per model directory in `recognizerCache`; a
 * failed build removes its entry so the next call starts over.
 *
 * @param {string} [modelName] - Model name or alias.
 * @param {object} [sherpa] - Already loaded sherpa module; loaded on demand when omitted.
 * @returns {Promise<{recognizer: object, artifacts: object}>} The recognizer and
 *   the model artifacts it was built from.
 */
async function getRecognizer(modelName, sherpa) {
  const artifacts = await ensureWhisperArtifacts(modelName);
  const { modelDir: cacheKey, encoderPath, decoderPath, tokensPath } = artifacts;
  let pending = recognizerCache.get(cacheKey);
  if (!pending) {
    const build = async () => {
      const runtime = sherpa ?? await loadSherpaModule();
      const config = {
        featConfig: {
          sampleRate: AUDIO_SAMPLE_RATE,
          featureDim: 80
        },
        modelConfig: {
          whisper: {
            encoder: encoderPath,
            decoder: decoderPath,
            task: "transcribe"
          },
          tokens: tokensPath,
          numThreads: 1,
          provider: "cpu",
          debug: 0
        }
      };
      return runtime.OfflineRecognizer.createAsync(config);
    };
    pending = build().catch((error) => {
      recognizerCache.delete(cacheKey);
      throw error;
    });
    recognizerCache.set(cacheKey, pending);
  }
  return { recognizer: await pending, artifacts };
}
|
|
5736
|
+
/**
 * Convert an audio/video file to a mono 16-bit PCM WAV at `AUDIO_SAMPLE_RATE`
 * using ffmpeg.
 *
 * @param {string} audioPath - Input media file.
 * @param {string} workingDir - Directory that receives the WAV file.
 * @returns {string} Path of the WAV file, named after the input's basename.
 * @throws {Error} With an install hint when the ffmpeg executable cannot be
 *   found, or with ffmpeg's own output when the conversion fails. The
 *   original error is attached as `cause`.
 */
function normalizeToWave(audioPath, workingDir) {
  const stem = (0, import_node_path6.basename)(audioPath, (0, import_node_path6.extname)(audioPath));
  const wavPath = (0, import_node_path6.join)(workingDir, `${stem}.wav`);
  try {
    runCommand(ffmpegBinary(), [
      "-y",
      "-i",
      audioPath,
      "-vn",
      "-ac",
      "1",
      "-ar",
      String(AUDIO_SAMPLE_RATE),
      "-c:a",
      "pcm_s16le",
      wavPath
    ]);
  } catch (error) {
    const detail = error instanceof Error ? error.message : String(error);
    // Only a spawn failure with ENOENT means the binary is missing; anything
    // else (corrupt input, unsupported codec, ...) is a conversion failure and
    // must not be reported as an installation problem.
    if (error?.code === "ENOENT") {
      throw new Error(
        `Video transcription requires ffmpeg in PATH. Install ffmpeg locally, then retry. ${detail}`,
        { cause: error }
      );
    }
    throw new Error(
      `ffmpeg could not convert ${(0, import_node_path6.basename)(audioPath)} to WAV. ${detail}`,
      { cause: error }
    );
  }
  return wavPath;
}
|
|
5760
|
+
/**
 * Pull the transcript text out of a recognizer result.
 *
 * @param {{text?: unknown}} result - Decoded recognizer result.
 * @returns {string} `result.text` coerced to a string and trimmed; empty when
 *   the field is null or undefined.
 */
function extractTranscriptText(result) {
  const rawText = result.text ?? "";
  return String(rawText).trim();
}
|
|
5763
|
+
/**
 * Tell whether a transcription input is a remote URL rather than a local path.
 *
 * Matching ignores letter case, because URI schemes and host names are
 * case-insensitive ("HTTPS://Example.com/..." is a valid URL).
 *
 * @param {string} pathLike - File path or URL.
 * @returns {boolean} True when `pathLike` starts with one of `URL_PREFIXES`.
 */
function isUrl(pathLike) {
  const candidate = pathLike.toLowerCase();
  return URL_PREFIXES.some((prefix) => candidate.startsWith(prefix));
}
|
|
5766
|
+
/**
 * Download the audio track of a URL with yt-dlp, reusing an earlier download
 * of the same URL when one is present.
 *
 * Files are named `yt_<first 12 hex chars of sha1(url)>.<ext>`.
 *
 * @param {string} url - Media URL handed to yt-dlp.
 * @param {string} outputDir - Directory for downloads (created if missing).
 * @returns {string} Path of the cached or freshly downloaded audio file.
 * @throws {Error} With an install hint when yt-dlp cannot be found, with
 *   yt-dlp's own output when the download fails, or when yt-dlp exits
 *   successfully without leaving an audio file behind.
 */
function downloadAudio(url, outputDir) {
  (0, import_node_fs8.mkdirSync)(outputDir, { recursive: true });
  const urlHash = (0, import_node_crypto3.createHash)("sha1").update(url).digest("hex").slice(0, 12);
  const filePrefix = `yt_${urlHash}`;
  for (const ext of CACHED_AUDIO_EXTENSIONS) {
    const candidate = (0, import_node_path6.join)(outputDir, `${filePrefix}${ext}`);
    if ((0, import_node_fs8.existsSync)(candidate)) {
      console.log(` cached audio: ${(0, import_node_path6.basename)(candidate)}`);
      return candidate;
    }
  }
  const outTemplate = (0, import_node_path6.join)(outputDir, `${filePrefix}.%(ext)s`);
  try {
    console.log(` downloading audio: ${url.slice(0, 80)} ...`);
    runCommand("yt-dlp", [
      "-f",
      "bestaudio[ext=m4a]/bestaudio/best",
      "-o",
      outTemplate,
      "--quiet",
      "--no-warnings",
      "--no-playlist",
      // End of options: the URL is external input and must never be parsed as a flag.
      "--",
      url
    ]);
  } catch (error) {
    const detail = error instanceof Error ? error.message : String(error);
    // ENOENT from the spawn means the executable is missing; every other
    // failure (network, unavailable video, ...) is a download problem.
    if (error?.code === "ENOENT") {
      throw new Error(
        `YouTube/URL download requires yt-dlp. Install yt-dlp to enable video ingestion. ${detail}`,
        { cause: error }
      );
    }
    throw new Error(`yt-dlp could not download audio from ${url}. ${detail}`, { cause: error });
  }
  // Skip yt-dlp's in-progress/bookkeeping files that an earlier interrupted
  // run may have left behind under the same prefix.
  const leftover = /\.(?:part|ytdl)$|\.part-Frag\d+$/;
  const produced = (0, import_node_fs8.readdirSync)(outputDir).find(
    (entry) => entry.startsWith(`${filePrefix}.`) && !leftover.test(entry)
  );
  if (produced !== void 0) {
    return (0, import_node_path6.join)(outputDir, produced);
  }
  throw new Error(`yt-dlp finished without producing an audio file for ${url}`);
}
|
|
5802
|
+
/**
 * Build the initial prompt handed to the recognizer.
 *
 * A non-empty `GRAPHIFY_WHISPER_PROMPT` wins outright. Otherwise the first
 * five non-empty node labels are woven into a topical sentence; with no
 * labels the fallback prompt is returned on its own.
 *
 * @param {Array<{label?: string}>} godNodes2 - Nodes whose labels seed the prompt.
 * @returns {string} Prompt text.
 */
function buildWhisperPrompt(godNodes2) {
  const override = process.env.GRAPHIFY_WHISPER_PROMPT;
  if (override) {
    return override;
  }
  const usableLabels = [];
  for (const node of godNodes2) {
    const label = node.label ?? "";
    if (label) {
      usableLabels.push(label);
    }
  }
  const topics = usableLabels.slice(0, 5);
  return topics.length === 0 ? FALLBACK_PROMPT : `Technical discussion about ${topics.join(", ")}. ${FALLBACK_PROMPT}`;
}
|
|
5811
|
+
/**
 * Transcribe one video/audio file (or URL) to a text file.
 *
 * URLs are downloaded first into `<outputDir>/downloads`. The transcript is
 * written to `<outputDir>/<input basename>.txt`; an existing transcript is
 * reused unless `force` is set.
 *
 * @param {string} videoPath - Local path or URL of the media.
 * @param {string} [outputDir=TRANSCRIPTS_DIR] - Directory for transcripts.
 * @param {string} [initialPrompt] - Prompt for the recognizer; defaults to
 *   `GRAPHIFY_WHISPER_PROMPT`, then the fallback prompt.
 * @param {boolean} [force=false] - Re-transcribe even if a transcript exists.
 * @returns {Promise<string>} Path of the transcript file.
 * @throws {Error} Unsupported-model errors and errors that already describe a
 *   missing transcription requirement are rethrown unchanged; anything else is
 *   wrapped with a toolchain hint and keeps the original as `cause`.
 */
async function transcribe(videoPath, outputDir = TRANSCRIPTS_DIR, initialPrompt, force = false) {
  const outDir = (0, import_node_path6.resolve)(outputDir);
  (0, import_node_fs8.mkdirSync)(outDir, { recursive: true });
  const audioPath = isUrl(videoPath) ? downloadAudio(videoPath, (0, import_node_path6.join)(outDir, "downloads")) : (0, import_node_path6.resolve)(videoPath);
  const transcriptPath = (0, import_node_path6.join)(outDir, `${(0, import_node_path6.basename)(audioPath, (0, import_node_path6.extname)(audioPath))}.txt`);
  if ((0, import_node_fs8.existsSync)(transcriptPath) && !force) {
    return transcriptPath;
  }
  const prompt = initialPrompt ?? process.env.GRAPHIFY_WHISPER_PROMPT ?? FALLBACK_PROMPT;
  const requestedModel = process.env.GRAPHIFY_WHISPER_MODEL ?? DEFAULT_MODEL;
  const tempDir = (0, import_node_fs8.mkdtempSync)((0, import_node_path6.join)((0, import_node_os.tmpdir)(), "graphify-transcribe-"));
  try {
    console.log(` transcribing ${(0, import_node_path6.basename)(audioPath)} (model=${requestedModel}) ...`);
    const wavPath = normalizeToWave(audioPath, tempDir);
    const sherpa = await loadSherpaModule();
    const { recognizer, artifacts } = await getRecognizer(requestedModel, sherpa);
    const wave = sherpa.readWave(wavPath);
    const stream = recognizer.createStream();
    if (prompt && typeof stream.setOption === "function") {
      try {
        stream.setOption("prompt", prompt);
      } catch {
        // Best effort: the prompt is only a hint, so a runtime that rejects the
        // option must not abort the transcription.
      }
    }
    stream.acceptWaveform({ samples: wave.samples, sampleRate: wave.sampleRate });
    const result = await recognizer.decodeAsync(stream);
    const transcript = extractTranscriptText(result);
    (0, import_node_fs8.writeFileSync)(transcriptPath, transcript, "utf-8");
    if (artifacts.requestedModel !== artifacts.resolvedModel) {
      console.log(` model alias: ${artifacts.requestedModel} -> ${artifacts.resolvedModel}`);
    }
  } catch (error) {
    if (error instanceof Error) {
      // These messages already name the exact problem; wrapping them again would
      // bury it under a second, more generic "requires ..." sentence.
      const alreadySpecific = error.message.startsWith("Unsupported GRAPHIFY_WHISPER_MODEL") || error.message.startsWith("Video transcription requires");
      if (alreadySpecific) {
        throw error;
      }
    }
    const detail = error instanceof Error ? error.message : String(error);
    throw new Error(
      `Video transcription requires the local TypeScript toolchain: sherpa-onnx-node + ffmpeg. Retry after installing them. ${detail}`,
      { cause: error }
    );
  } finally {
    (0, import_node_fs8.rmSync)(tempDir, { recursive: true, force: true });
  }
  return transcriptPath;
}
|
|
5855
|
+
/**
 * Transcribe several media files one after another.
 *
 * A file that fails is reported on the console and left out of the result;
 * the remaining files are still processed.
 *
 * @param {string[]} videoFiles - Paths or URLs to transcribe.
 * @param {string} [outputDir] - Directory for transcripts.
 * @param {string} [initialPrompt] - Prompt forwarded to each transcription.
 * @param {boolean} [force=false] - Re-transcribe even if a transcript exists.
 * @returns {Promise<string[]>} Transcript paths of the files that succeeded.
 */
async function transcribeAll(videoFiles, outputDir, initialPrompt, force = false) {
  const completed = [];
  if (videoFiles.length === 0) {
    return completed;
  }
  for (const videoFile of videoFiles) {
    let transcriptPath;
    try {
      transcriptPath = await transcribe(videoFile, outputDir, initialPrompt, force);
    } catch (error) {
      const detail = error instanceof Error ? error.message : String(error);
      console.log(` warning: could not transcribe ${videoFile}: ${detail}`);
      continue;
    }
    completed.push(transcriptPath);
  }
  return completed;
}
|
|
5870
|
+
/**
 * Deep-copy a detection result through a JSON round trip.
 *
 * @param {object} detection - JSON-serializable detection result.
 * @returns {object} Independent copy; values JSON cannot represent are dropped
 *   or converted by the round trip.
 */
function cloneDetection(detection) {
  const serialized = JSON.stringify(detection);
  return JSON.parse(serialized);
}
|
|
5873
|
+
/**
 * Transcribe the video files of a detection result and register the
 * transcripts as documents on a copy of that result.
 *
 * In incremental mode (and when the detection has `new_files`) the videos are
 * taken from, and the transcripts added to, `new_files`; otherwise `files` is
 * used. The input detection is never modified.
 *
 * @param {object} detection - Detection result (JSON-serializable).
 * @param {object} [options]
 * @param {boolean} [options.incremental] - Work on `new_files` and force re-transcription.
 * @param {string} [options.initialPrompt] - Prompt override.
 * @param {Array<{label?: string}>} [options.godNodes] - Nodes used to derive a prompt.
 * @param {string} [options.outputDir] - Transcript directory.
 * @param {string} [options.whisperModel] - Model to use for this call.
 * @returns {Promise<{detection: object, transcriptPaths: string[], prompt: string}>}
 */
async function augmentDetectionWithTranscripts(detection, options) {
  const nextDetection = cloneDetection(detection);
  const source = options?.incremental && nextDetection.new_files ? nextDetection.new_files : nextDetection.files;
  const videoFiles = [...source.video ?? []];
  const prompt = options?.initialPrompt ?? buildWhisperPrompt(options?.godNodes ?? []);
  if (videoFiles.length === 0) {
    return { detection: nextDetection, transcriptPaths: [], prompt };
  }
  // The model is handed to the transcription code through the environment, so
  // the previous value is remembered and restored afterwards.
  const previousModel = process.env.GRAPHIFY_WHISPER_MODEL;
  if (options?.whisperModel) {
    process.env.GRAPHIFY_WHISPER_MODEL = options.whisperModel;
  }
  try {
    const transcriptPaths = await transcribeAll(
      videoFiles,
      options?.outputDir,
      prompt,
      options?.incremental === true
    );
    const existingDocuments = source.document ?? [];
    // Append each transcript once: videos sharing a basename map to the same
    // transcript path, and a transcript may already be listed as a document.
    const listed = new Set(existingDocuments);
    const additions = [];
    for (const transcriptPath of transcriptPaths) {
      if (!listed.has(transcriptPath)) {
        listed.add(transcriptPath);
        additions.push(transcriptPath);
      }
    }
    source.document = [...existingDocuments, ...additions];
    return { detection: nextDetection, transcriptPaths, prompt };
  } finally {
    if (options?.whisperModel) {
      if (previousModel === void 0) {
        delete process.env.GRAPHIFY_WHISPER_MODEL;
      } else {
        process.env.GRAPHIFY_WHISPER_MODEL = previousModel;
      }
    }
  }
}
|
|
5905
|
+
|
|
5906
|
+
// src/ingest.ts
|
|
5401
5907
|
function yamlStr(s) {
|
|
5402
5908
|
return s.replace(/\\/g, "\\\\").replace(/"/g, '\\"').replace(/\n/g, " ").replace(/\r/g, " ");
|
|
5403
5909
|
}
|
|
@@ -5563,9 +6069,9 @@ Source: ${url}
|
|
|
5563
6069
|
}
|
|
5564
6070
|
async function downloadBinary(url, suffix, targetDir) {
|
|
5565
6071
|
const filename = safeFilename2(url, suffix);
|
|
5566
|
-
const outPath = (0,
|
|
6072
|
+
const outPath = (0, import_node_path7.resolve)(targetDir, filename);
|
|
5567
6073
|
const data = await safeFetch(url);
|
|
5568
|
-
(0,
|
|
6074
|
+
(0, import_node_fs9.writeFileSync)(outPath, data);
|
|
5569
6075
|
return outPath;
|
|
5570
6076
|
}
|
|
5571
6077
|
function normalizeIngestOptions(authorOrOptions, contributor) {
|
|
@@ -5581,7 +6087,7 @@ function normalizeIngestOptions(authorOrOptions, contributor) {
|
|
|
5581
6087
|
};
|
|
5582
6088
|
}
|
|
5583
6089
|
async function ingest(url, targetDir, authorOrOptions = null, contributor = null) {
|
|
5584
|
-
(0,
|
|
6090
|
+
(0, import_node_fs9.mkdirSync)(targetDir, { recursive: true });
|
|
5585
6091
|
const urlType = detectUrlType(url);
|
|
5586
6092
|
const { author, contributor: normalizedContributor } = normalizeIngestOptions(
|
|
5587
6093
|
authorOrOptions,
|
|
@@ -5592,7 +6098,7 @@ async function ingest(url, targetDir, authorOrOptions = null, contributor = null
|
|
|
5592
6098
|
let filename;
|
|
5593
6099
|
if (urlType === "pdf") {
|
|
5594
6100
|
const out = await downloadBinary(url, ".pdf", targetDir);
|
|
5595
|
-
console.log(`Downloaded PDF: ${(0,
|
|
6101
|
+
console.log(`Downloaded PDF: ${(0, import_node_path7.basename)(out)}`);
|
|
5596
6102
|
return out;
|
|
5597
6103
|
}
|
|
5598
6104
|
if (urlType === "image") {
|
|
@@ -5602,9 +6108,14 @@ async function ingest(url, targetDir, authorOrOptions = null, contributor = null
|
|
|
5602
6108
|
} catch {
|
|
5603
6109
|
throw new Error(`Invalid URL: ${url}`);
|
|
5604
6110
|
}
|
|
5605
|
-
const suffix = (0,
|
|
6111
|
+
const suffix = (0, import_node_path7.extname)(parsed.pathname) || ".jpg";
|
|
5606
6112
|
const out = await downloadBinary(url, suffix, targetDir);
|
|
5607
|
-
console.log(`Downloaded image: ${(0,
|
|
6113
|
+
console.log(`Downloaded image: ${(0, import_node_path7.basename)(out)}`);
|
|
6114
|
+
return out;
|
|
6115
|
+
}
|
|
6116
|
+
if (urlType === "youtube") {
|
|
6117
|
+
const out = downloadAudio(url, targetDir);
|
|
6118
|
+
console.log(`Downloaded audio: ${(0, import_node_path7.basename)(out)}`);
|
|
5608
6119
|
return out;
|
|
5609
6120
|
}
|
|
5610
6121
|
if (urlType === "tweet") {
|
|
@@ -5614,15 +6125,15 @@ async function ingest(url, targetDir, authorOrOptions = null, contributor = null
|
|
|
5614
6125
|
} else {
|
|
5615
6126
|
[content, filename] = await fetchWebpage(url, author, normalizedContributor);
|
|
5616
6127
|
}
|
|
5617
|
-
let outPath = (0,
|
|
6128
|
+
let outPath = (0, import_node_path7.resolve)(targetDir, filename);
|
|
5618
6129
|
let counter = 1;
|
|
5619
|
-
while ((0,
|
|
6130
|
+
while ((0, import_node_fs9.existsSync)(outPath)) {
|
|
5620
6131
|
const stem = filename.replace(/\.md$/, "");
|
|
5621
|
-
outPath = (0,
|
|
6132
|
+
outPath = (0, import_node_path7.resolve)(targetDir, `${stem}_${counter}.md`);
|
|
5622
6133
|
counter++;
|
|
5623
6134
|
}
|
|
5624
|
-
(0,
|
|
5625
|
-
console.log(`Saved ${urlType}: ${(0,
|
|
6135
|
+
(0, import_node_fs9.writeFileSync)(outPath, content, "utf-8");
|
|
6136
|
+
console.log(`Saved ${urlType}: ${(0, import_node_path7.basename)(outPath)}`);
|
|
5626
6137
|
return outPath;
|
|
5627
6138
|
}
|
|
5628
6139
|
function saveQueryResult(questionOrOptions, answer, memoryDir, queryType = "query", sourceNodes = null) {
|
|
@@ -5642,7 +6153,7 @@ function saveQueryResult(questionOrOptions, answer, memoryDir, queryType = "quer
|
|
|
5642
6153
|
if (!payload.question) throw new Error("saveQueryResult requires a question");
|
|
5643
6154
|
if (!payload.memoryDir) throw new Error("saveQueryResult requires a memoryDir");
|
|
5644
6155
|
const effectiveAnswer = payload.answer ?? "";
|
|
5645
|
-
(0,
|
|
6156
|
+
(0, import_node_fs9.mkdirSync)(payload.memoryDir, { recursive: true });
|
|
5646
6157
|
const now = /* @__PURE__ */ new Date();
|
|
5647
6158
|
const slug = payload.question.toLowerCase().replace(/[^\w]/g, "_").slice(0, 50).replace(/_+$/, "");
|
|
5648
6159
|
const ts = now.toISOString().replace(/[-:]/g, "").replace("T", "_").slice(0, 15);
|
|
@@ -5674,11 +6185,11 @@ function saveQueryResult(questionOrOptions, answer, memoryDir, queryType = "quer
|
|
|
5674
6185
|
}
|
|
5675
6186
|
}
|
|
5676
6187
|
const content = [...frontmatterLines, ...bodyLines].join("\n");
|
|
5677
|
-
const outPath = (0,
|
|
5678
|
-
(0,
|
|
6188
|
+
const outPath = (0, import_node_path7.resolve)(payload.memoryDir, filename);
|
|
6189
|
+
(0, import_node_fs9.writeFileSync)(outPath, content, "utf-8");
|
|
5679
6190
|
return outPath;
|
|
5680
6191
|
}
|
|
5681
|
-
var isDirectExecution = typeof process !== "undefined" && typeof process.argv[1] === "string" && /^ingest\.(?:js|mjs|cjs|ts)$/.test((0,
|
|
6192
|
+
var isDirectExecution = typeof process !== "undefined" && typeof process.argv[1] === "string" && /^ingest\.(?:js|mjs|cjs|ts)$/.test((0, import_node_path7.basename)(process.argv[1]));
|
|
5682
6193
|
if (isDirectExecution) {
|
|
5683
6194
|
const url = process.argv[2];
|
|
5684
6195
|
const targetDir = process.argv[3] ?? "./raw";
|
|
@@ -5694,44 +6205,30 @@ if (isDirectExecution) {
|
|
|
5694
6205
|
}
|
|
5695
6206
|
|
|
5696
6207
|
// src/serve.ts
|
|
5697
|
-
var
|
|
5698
|
-
var import_graphology3 = __toESM(require("graphology"), 1);
|
|
6208
|
+
var import_node_fs10 = require("fs");
|
|
5699
6209
|
var import_unweighted = require("graphology-shortest-path/unweighted.js");
|
|
5700
|
-
var
|
|
6210
|
+
var import_node_path8 = require("path");
|
|
6211
|
+
init_graph();
|
|
5701
6212
|
init_security();
|
|
5702
6213
|
init_analyze();
|
|
5703
6214
|
function loadGraph2(graphPath) {
|
|
5704
6215
|
let safePath;
|
|
5705
6216
|
try {
|
|
5706
|
-
safePath = validateGraphPath(graphPath);
|
|
6217
|
+
safePath = validateGraphPath(graphPath, (0, import_node_path8.dirname)((0, import_node_path8.resolve)(graphPath)));
|
|
5707
6218
|
} catch (err) {
|
|
5708
6219
|
console.error(`error: ${err instanceof Error ? err.message : err}`);
|
|
5709
6220
|
process.exit(1);
|
|
5710
6221
|
}
|
|
5711
6222
|
let data;
|
|
5712
6223
|
try {
|
|
5713
|
-
data = JSON.parse((0,
|
|
6224
|
+
data = JSON.parse((0, import_node_fs10.readFileSync)(safePath, "utf-8"));
|
|
5714
6225
|
} catch (err) {
|
|
5715
6226
|
console.error(
|
|
5716
6227
|
`error: graph.json is corrupted (${err instanceof Error ? err.message : err}). Re-run the graphify skill to rebuild it (for Codex: $graphify .).`
|
|
5717
6228
|
);
|
|
5718
6229
|
process.exit(1);
|
|
5719
6230
|
}
|
|
5720
|
-
|
|
5721
|
-
const nodes = data.nodes ?? [];
|
|
5722
|
-
for (const node of nodes) {
|
|
5723
|
-
const { id, ...attrs } = node;
|
|
5724
|
-
G.mergeNode(id, attrs);
|
|
5725
|
-
}
|
|
5726
|
-
const links = data.links ?? data.edges ?? [];
|
|
5727
|
-
for (const link of links) {
|
|
5728
|
-
const { source, target, ...attrs } = link;
|
|
5729
|
-
try {
|
|
5730
|
-
G.mergeEdge(source, target, attrs);
|
|
5731
|
-
} catch {
|
|
5732
|
-
}
|
|
5733
|
-
}
|
|
5734
|
-
return G;
|
|
6231
|
+
return loadGraphFromData(data);
|
|
5735
6232
|
}
|
|
5736
6233
|
function communitiesFromGraph(G) {
|
|
5737
6234
|
const communities = /* @__PURE__ */ new Map();
|
|
@@ -5744,6 +6241,15 @@ function communitiesFromGraph(G) {
|
|
|
5744
6241
|
});
|
|
5745
6242
|
return communities;
|
|
5746
6243
|
}
|
|
6244
|
+
/**
 * Look up the display name of a community from the graph-level
 * `community_labels` attribute.
 *
 * @param {object} G - Graph exposing `getAttribute`.
 * @param {number|string|null|undefined} cid - Community id.
 * @returns {string|null} The sanitized label, or null when `cid` is missing or
 *   no non-empty string label is stored for it.
 */
function communityName(G, cid) {
  if (cid == null) {
    return null;
  }
  const label = G.getAttribute("community_labels")?.[String(cid)];
  const usable = typeof label === "string" && label.length > 0;
  return usable ? sanitizeLabel(label) : null;
}
|
|
5747
6253
|
function scoreNodes(G, terms) {
|
|
5748
6254
|
const scored = [];
|
|
5749
6255
|
G.forEachNode((nid, data) => {
|
|
@@ -5762,7 +6268,7 @@ function bfs(G, startNodes, depth) {
|
|
|
5762
6268
|
for (let i = 0; i < depth; i++) {
|
|
5763
6269
|
const nextFrontier = /* @__PURE__ */ new Set();
|
|
5764
6270
|
for (const n of frontier) {
|
|
5765
|
-
G
|
|
6271
|
+
forEachTraversalNeighbor(G, n, (neighbor) => {
|
|
5766
6272
|
if (!visited.has(neighbor)) {
|
|
5767
6273
|
nextFrontier.add(neighbor);
|
|
5768
6274
|
edges.push([n, neighbor]);
|
|
@@ -5782,7 +6288,7 @@ function dfs(G, startNodes, depth) {
|
|
|
5782
6288
|
const [node, d] = stack.pop();
|
|
5783
6289
|
if (visited.has(node) || d > depth) continue;
|
|
5784
6290
|
visited.add(node);
|
|
5785
|
-
G
|
|
6291
|
+
forEachTraversalNeighbor(G, node, (neighbor) => {
|
|
5786
6292
|
if (!visited.has(neighbor)) {
|
|
5787
6293
|
stack.push([neighbor, d + 1]);
|
|
5788
6294
|
edges.push([node, neighbor]);
|
|
@@ -5861,7 +6367,7 @@ function toolGetNode(G, args) {
|
|
|
5861
6367
|
` ID: ${nid}`,
|
|
5862
6368
|
` Source: ${d.source_file ?? ""} ${d.source_location ?? ""}`,
|
|
5863
6369
|
` Type: ${d.file_type ?? ""}`,
|
|
5864
|
-
` Community: ${d.community ?? ""}`,
|
|
6370
|
+
` Community: ${d.community_name ? `${d.community ?? ""} (${d.community_name})` : communityName(G, d.community) ?? String(d.community ?? "")}`,
|
|
5865
6371
|
` Degree: ${G.degree(nid)}`
|
|
5866
6372
|
].join("\n");
|
|
5867
6373
|
}
|
|
@@ -5872,7 +6378,7 @@ function toolGetNeighbors(G, args) {
|
|
|
5872
6378
|
if (matches.length === 0) return `No node matching '${label}' found.`;
|
|
5873
6379
|
const nid = matches[0];
|
|
5874
6380
|
const lines = [`Neighbors of ${G.getNodeAttribute(nid, "label") ?? nid}:`];
|
|
5875
|
-
G
|
|
6381
|
+
forEachTraversalNeighbor(G, nid, (neighbor) => {
|
|
5876
6382
|
const edgeKey = G.edge(nid, neighbor);
|
|
5877
6383
|
if (!edgeKey) return;
|
|
5878
6384
|
const d = G.getEdgeAttributes(edgeKey);
|
|
@@ -5888,7 +6394,8 @@ function toolGetCommunity(communities, G, args) {
|
|
|
5888
6394
|
const cid = Number(args.community_id);
|
|
5889
6395
|
const nodes = communities.get(cid);
|
|
5890
6396
|
if (!nodes || nodes.length === 0) return `Community ${cid} not found.`;
|
|
5891
|
-
const
|
|
6397
|
+
const label = communityName(G, cid);
|
|
6398
|
+
const lines = [label ? `Community ${cid} - ${label} (${nodes.length} nodes):` : `Community ${cid} (${nodes.length} nodes):`];
|
|
5892
6399
|
for (const n of nodes) {
|
|
5893
6400
|
const d = G.getNodeAttributes(n);
|
|
5894
6401
|
lines.push(` ${d.label ?? n} [${d.source_file ?? ""}]`);
|
|
@@ -6110,8 +6617,13 @@ async function serve(graphPath = "graphify-out/graph.json", transport) {
|
|
|
6110
6617
|
if (!handler) {
|
|
6111
6618
|
return { content: [{ type: "text", text: `Unknown tool: ${name}` }] };
|
|
6112
6619
|
}
|
|
6113
|
-
|
|
6114
|
-
|
|
6620
|
+
try {
|
|
6621
|
+
const text = handler(args ?? {});
|
|
6622
|
+
return { content: [{ type: "text", text }] };
|
|
6623
|
+
} catch (err) {
|
|
6624
|
+
const message = err instanceof Error ? err.message : String(err);
|
|
6625
|
+
return { content: [{ type: "text", text: `Error executing ${name}: ${message}` }] };
|
|
6626
|
+
}
|
|
6115
6627
|
});
|
|
6116
6628
|
const serverTransport = transport ?? new StdioServerTransport();
|
|
6117
6629
|
let keepAlive;
|
|
@@ -6119,14 +6631,14 @@ async function serve(graphPath = "graphify-out/graph.json", transport) {
|
|
|
6119
6631
|
keepAlive = setInterval(() => void 0, 6e4);
|
|
6120
6632
|
process.stdin?.resume();
|
|
6121
6633
|
}
|
|
6122
|
-
const closed = new Promise((
|
|
6634
|
+
const closed = new Promise((resolve8) => {
|
|
6123
6635
|
const previousOnClose = server.onclose;
|
|
6124
6636
|
server.onclose = () => {
|
|
6125
6637
|
if (keepAlive) {
|
|
6126
6638
|
clearInterval(keepAlive);
|
|
6127
6639
|
}
|
|
6128
6640
|
previousOnClose?.();
|
|
6129
|
-
|
|
6641
|
+
resolve8();
|
|
6130
6642
|
};
|
|
6131
6643
|
});
|
|
6132
6644
|
await server.connect(serverTransport);
|
|
@@ -6134,7 +6646,7 @@ async function serve(graphPath = "graphify-out/graph.json", transport) {
|
|
|
6134
6646
|
await closed;
|
|
6135
6647
|
}
|
|
6136
6648
|
}
|
|
6137
|
-
var isDirectExecution2 = typeof process !== "undefined" && typeof process.argv[1] === "string" && /^serve\.(?:js|mjs|cjs|ts)$/.test((0,
|
|
6649
|
+
var isDirectExecution2 = typeof process !== "undefined" && typeof process.argv[1] === "string" && /^serve\.(?:js|mjs|cjs|ts)$/.test((0, import_node_path8.basename)(process.argv[1]));
|
|
6138
6650
|
if (isDirectExecution2) {
|
|
6139
6651
|
const graphPath = process.argv[2] ?? "graphify-out/graph.json";
|
|
6140
6652
|
serve(graphPath).catch((err) => {
|
|
@@ -6144,59 +6656,14 @@ if (isDirectExecution2) {
|
|
|
6144
6656
|
}
|
|
6145
6657
|
|
|
6146
6658
|
// src/watch.ts
|
|
6147
|
-
var
|
|
6148
|
-
var
|
|
6659
|
+
var import_node_fs11 = require("fs");
|
|
6660
|
+
var import_node_path9 = require("path");
|
|
6661
|
+
init_detect();
|
|
6149
6662
|
var WATCHED_EXTENSIONS = /* @__PURE__ */ new Set([
|
|
6150
|
-
|
|
6151
|
-
|
|
6152
|
-
|
|
6153
|
-
|
|
6154
|
-
".rs",
|
|
6155
|
-
".java",
|
|
6156
|
-
".cpp",
|
|
6157
|
-
".c",
|
|
6158
|
-
".rb",
|
|
6159
|
-
".swift",
|
|
6160
|
-
".kt",
|
|
6161
|
-
".cs",
|
|
6162
|
-
".scala",
|
|
6163
|
-
".php",
|
|
6164
|
-
".cc",
|
|
6165
|
-
".cxx",
|
|
6166
|
-
".hpp",
|
|
6167
|
-
".h",
|
|
6168
|
-
".kts",
|
|
6169
|
-
".md",
|
|
6170
|
-
".txt",
|
|
6171
|
-
".rst",
|
|
6172
|
-
".pdf",
|
|
6173
|
-
".png",
|
|
6174
|
-
".jpg",
|
|
6175
|
-
".jpeg",
|
|
6176
|
-
".webp",
|
|
6177
|
-
".gif",
|
|
6178
|
-
".svg"
|
|
6179
|
-
]);
|
|
6180
|
-
var CODE_EXTENSIONS3 = /* @__PURE__ */ new Set([
|
|
6181
|
-
".py",
|
|
6182
|
-
".ts",
|
|
6183
|
-
".js",
|
|
6184
|
-
".go",
|
|
6185
|
-
".rs",
|
|
6186
|
-
".java",
|
|
6187
|
-
".cpp",
|
|
6188
|
-
".c",
|
|
6189
|
-
".rb",
|
|
6190
|
-
".swift",
|
|
6191
|
-
".kt",
|
|
6192
|
-
".cs",
|
|
6193
|
-
".scala",
|
|
6194
|
-
".php",
|
|
6195
|
-
".cc",
|
|
6196
|
-
".cxx",
|
|
6197
|
-
".hpp",
|
|
6198
|
-
".h",
|
|
6199
|
-
".kts"
|
|
6663
|
+
...CODE_EXTENSIONS,
|
|
6664
|
+
...DOC_EXTENSIONS,
|
|
6665
|
+
...PAPER_EXTENSIONS,
|
|
6666
|
+
...IMAGE_EXTENSIONS
|
|
6200
6667
|
]);
|
|
6201
6668
|
async function rebuildCode(watchPath, followSymlinks = false) {
|
|
6202
6669
|
try {
|
|
@@ -6248,8 +6715,8 @@ async function rebuildCode(watchPath, followSymlinks = false) {
|
|
|
6248
6715
|
labels.set(cid, `Community ${cid}`);
|
|
6249
6716
|
}
|
|
6250
6717
|
const questions = suggestQuestions2(G, communities, labels);
|
|
6251
|
-
const outDir = (0,
|
|
6252
|
-
(0,
|
|
6718
|
+
const outDir = (0, import_node_path9.resolve)(watchPath, "graphify-out");
|
|
6719
|
+
(0, import_node_fs11.mkdirSync)(outDir, { recursive: true });
|
|
6253
6720
|
const report = generate2(
|
|
6254
6721
|
G,
|
|
6255
6722
|
communities,
|
|
@@ -6262,11 +6729,11 @@ async function rebuildCode(watchPath, followSymlinks = false) {
|
|
|
6262
6729
|
watchPath,
|
|
6263
6730
|
questions
|
|
6264
6731
|
);
|
|
6265
|
-
(0,
|
|
6266
|
-
toJson2(G, communities, (0,
|
|
6267
|
-
const flagPath = (0,
|
|
6268
|
-
if ((0,
|
|
6269
|
-
(0,
|
|
6732
|
+
(0, import_node_fs11.writeFileSync)((0, import_node_path9.resolve)(outDir, "GRAPH_REPORT.md"), report, "utf-8");
|
|
6733
|
+
toJson2(G, communities, (0, import_node_path9.resolve)(outDir, "graph.json"), { communityLabels: labels });
|
|
6734
|
+
const flagPath = (0, import_node_path9.resolve)(outDir, "needs_update");
|
|
6735
|
+
if ((0, import_node_fs11.existsSync)(flagPath)) {
|
|
6736
|
+
(0, import_node_fs11.unlinkSync)(flagPath);
|
|
6270
6737
|
}
|
|
6271
6738
|
console.log(
|
|
6272
6739
|
`[graphify watch] Rebuilt: ${G.order} nodes, ${G.size} edges, ${communities.size} communities`
|
|
@@ -6283,10 +6750,10 @@ async function rebuildCode(watchPath, followSymlinks = false) {
|
|
|
6283
6750
|
}
|
|
6284
6751
|
}
|
|
6285
6752
|
function notifyOnly(watchPath) {
|
|
6286
|
-
const outDir = (0,
|
|
6287
|
-
(0,
|
|
6288
|
-
const flagPath = (0,
|
|
6289
|
-
(0,
|
|
6753
|
+
const outDir = (0, import_node_path9.resolve)(watchPath, "graphify-out");
|
|
6754
|
+
(0, import_node_fs11.mkdirSync)(outDir, { recursive: true });
|
|
6755
|
+
const flagPath = (0, import_node_path9.resolve)(outDir, "needs_update");
|
|
6756
|
+
(0, import_node_fs11.writeFileSync)(flagPath, "1", "utf-8");
|
|
6290
6757
|
console.log(`
|
|
6291
6758
|
[graphify watch] New or changed files detected in ${watchPath}`);
|
|
6292
6759
|
console.log(
|
|
@@ -6298,7 +6765,7 @@ function notifyOnly(watchPath) {
|
|
|
6298
6765
|
console.log(`[graphify watch] Flag written to ${flagPath}`);
|
|
6299
6766
|
}
|
|
6300
6767
|
function hasNonCode(changedPaths) {
|
|
6301
|
-
return changedPaths.some((p) => !
|
|
6768
|
+
return changedPaths.some((p) => !CODE_EXTENSIONS.has((0, import_node_path9.extname)(p).toLowerCase()));
|
|
6302
6769
|
}
|
|
6303
6770
|
async function watch(watchPath, debounce = 3) {
|
|
6304
6771
|
let chokidar;
|
|
@@ -6307,7 +6774,7 @@ async function watch(watchPath, debounce = 3) {
|
|
|
6307
6774
|
} catch {
|
|
6308
6775
|
throw new Error("chokidar not installed. Run: npm install chokidar");
|
|
6309
6776
|
}
|
|
6310
|
-
const resolvedPath = (0,
|
|
6777
|
+
const resolvedPath = (0, import_node_path9.resolve)(watchPath);
|
|
6311
6778
|
let lastTrigger = 0;
|
|
6312
6779
|
let pending = false;
|
|
6313
6780
|
const changed = /* @__PURE__ */ new Set();
|
|
@@ -6322,7 +6789,7 @@ async function watch(watchPath, debounce = 3) {
|
|
|
6322
6789
|
]
|
|
6323
6790
|
});
|
|
6324
6791
|
watcher.on("all", (_event, filePath) => {
|
|
6325
|
-
const ext = (0,
|
|
6792
|
+
const ext = (0, import_node_path9.extname)(filePath).toLowerCase();
|
|
6326
6793
|
if (!WATCHED_EXTENSIONS.has(ext)) return;
|
|
6327
6794
|
const parts = filePath.split("/");
|
|
6328
6795
|
if (parts.some((part) => part.startsWith(".") && part !== ".")) return;
|
|
@@ -6361,7 +6828,7 @@ async function watch(watchPath, debounce = 3) {
|
|
|
6361
6828
|
process.on("SIGINT", cleanup);
|
|
6362
6829
|
process.on("SIGTERM", cleanup);
|
|
6363
6830
|
}
|
|
6364
|
-
var isDirectExecution3 = typeof process !== "undefined" && typeof process.argv[1] === "string" && /^watch\.(?:js|mjs|cjs|ts)$/.test((0,
|
|
6831
|
+
var isDirectExecution3 = typeof process !== "undefined" && typeof process.argv[1] === "string" && /^watch\.(?:js|mjs|cjs|ts)$/.test((0, import_node_path9.basename)(process.argv[1]));
|
|
6365
6832
|
if (isDirectExecution3) {
|
|
6366
6833
|
const watchPath = process.argv[2] ?? ".";
|
|
6367
6834
|
const debounce = process.argv[3] ? parseFloat(process.argv[3]) : 3;
|
|
@@ -6374,8 +6841,10 @@ if (isDirectExecution3) {
|
|
|
6374
6841
|
0 && (module.exports = {
|
|
6375
6842
|
FileType,
|
|
6376
6843
|
assertValid,
|
|
6844
|
+
augmentDetectionWithTranscripts,
|
|
6377
6845
|
build,
|
|
6378
6846
|
buildFromJson,
|
|
6847
|
+
buildWhisperPrompt,
|
|
6379
6848
|
checkSemanticCache,
|
|
6380
6849
|
classifyFile,
|
|
6381
6850
|
cluster,
|
|
@@ -6383,6 +6852,7 @@ if (isDirectExecution3) {
|
|
|
6383
6852
|
collectFiles,
|
|
6384
6853
|
detect,
|
|
6385
6854
|
detectIncremental,
|
|
6855
|
+
downloadAudio,
|
|
6386
6856
|
extract,
|
|
6387
6857
|
fileHash,
|
|
6388
6858
|
generateReport,
|
|
@@ -6412,6 +6882,8 @@ if (isDirectExecution3) {
|
|
|
6412
6882
|
toJson,
|
|
6413
6883
|
toSvg,
|
|
6414
6884
|
toWiki,
|
|
6885
|
+
transcribe,
|
|
6886
|
+
transcribeAll,
|
|
6415
6887
|
validateExtraction,
|
|
6416
6888
|
validateGraphPath,
|
|
6417
6889
|
validateUrl,
|