graphifyy 0.3.17 → 0.3.28

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -29,6 +29,21 @@ var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__ge
29
29
  ));
30
30
  var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
31
31
 
32
+ // src/types.ts
33
+ var FileType;
34
+ var init_types = __esm({
35
+ "src/types.ts"() {
36
+ FileType = /* @__PURE__ */ ((FileType2) => {
37
+ FileType2["CODE"] = "code";
38
+ FileType2["DOCUMENT"] = "document";
39
+ FileType2["PAPER"] = "paper";
40
+ FileType2["IMAGE"] = "image";
41
+ FileType2["VIDEO"] = "video";
42
+ return FileType2;
43
+ })(FileType || {});
44
+ }
45
+ });
46
+
32
47
  // src/validate.ts
33
48
  function validateExtraction(data) {
34
49
  if (typeof data !== "object" || data === null || Array.isArray(data)) {
@@ -118,13 +133,81 @@ var init_validate = __esm({
118
133
  }
119
134
  });
120
135
 
136
+ // src/graph.ts
137
/**
 * Builds an empty graphology graph that does not allow parallel edges.
 *
 * @param {boolean} [directed=false] - When truthy the graph type is "directed", otherwise "undirected".
 * @returns {object} A new graph instance created through the bundled graphology constructor.
 */
function createGraph(directed = false) {
  const type = directed ? "directed" : "undirected";
  return new import_graphology.default({ type, multi: false });
}
140
/**
 * Reports whether a graph declares itself as directed.
 *
 * @param {{ type: string }} G - Graph-like object exposing a `type` property.
 * @returns {boolean} True only when `G.type` is exactly "directed".
 */
function isDirectedGraph(G) {
  const { type } = G;
  return type === "directed";
}
143
/**
 * Rebuilds a graph from a serialized node-link object.
 *
 * Reads `raw.directed`, `raw.graph` (graph-level attributes), `raw.nodes`,
 * `raw.links` (falling back to `raw.edges`) and `raw.hyperedges`.
 *
 * @param {object} raw - Serialized graph data.
 * @returns {object} The populated graph.
 */
function loadGraphFromData(raw) {
  const graph = createGraph(raw.directed === true);

  const graphAttributes = raw.graph ?? {};
  for (const name of Object.keys(graphAttributes)) {
    graph.setAttribute(name, graphAttributes[name]);
  }

  const nodeList = raw.nodes ?? [];
  for (const entry of nodeList) {
    const { id: nodeId, ...nodeAttrs } = entry;
    graph.mergeNode(nodeId, nodeAttrs);
  }

  const edgeList = raw.links ?? raw.edges ?? [];
  for (const entry of edgeList) {
    const { source: from, target: to, ...edgeAttrs } = entry;
    // Edges that reference an unknown endpoint are dropped.
    const endpointsKnown = graph.hasNode(from) && graph.hasNode(to);
    if (endpointsKnown) {
      try {
        graph.mergeEdge(from, to, edgeAttrs);
      } catch {
        // Best effort: an edge the graph rejects is skipped.
      }
    }
  }

  if (raw.hyperedges?.length > 0) {
    graph.setAttribute("hyperedges", raw.hyperedges);
  }
  return graph;
}
165
/**
 * Returns an undirected version of a graph without modifying the input.
 *
 * A graph that is not directed is returned as `G.copy()`. For a directed
 * graph, a fresh undirected graph is created and the graph attributes, nodes
 * and edges are merged into it.
 *
 * @param {object} G - Source graph.
 * @returns {object} A new graph instance.
 */
function toUndirectedGraph(G) {
  if (!isDirectedGraph(G)) {
    return G.copy();
  }

  const undirected = createGraph(false);

  const graphAttributes = G.getAttributes();
  for (const name of Object.keys(graphAttributes)) {
    undirected.setAttribute(name, graphAttributes[name]);
  }

  G.forEachNode((id, nodeAttrs) => {
    undirected.mergeNode(id, nodeAttrs);
  });

  G.forEachEdge((_edgeKey, edgeAttrs, from, to) => {
    const endpointsKnown = undirected.hasNode(from) && undirected.hasNode(to);
    if (endpointsKnown) {
      try {
        undirected.mergeEdge(from, to, edgeAttrs);
      } catch {
        // Best effort: an edge the graph rejects is skipped.
      }
    }
  });

  return undirected;
}
183
/**
 * Invokes `callback` for every neighbor that a traversal may step to from `node`.
 *
 * Directed graphs use `forEachOutboundNeighbor`; every other graph uses
 * `forEachNeighbor`.
 *
 * @param {object} G - Graph to traverse.
 * @param {string} node - Key of the node whose neighbors are visited.
 * @param {Function} callback - Passed through unchanged to the graph iterator.
 * @returns {void}
 */
function forEachTraversalNeighbor(G, node, callback) {
  if (!isDirectedGraph(G)) {
    G.forEachNeighbor(node, callback);
    return;
  }
  G.forEachOutboundNeighbor(node, callback);
}
190
/**
 * Collects the traversal neighbors of `node` into an array.
 *
 * @param {object} G - Graph to traverse.
 * @param {string} node - Key of the node whose neighbors are collected.
 * @returns {string[]} Neighbor keys in the order the graph iterator reports them.
 */
function traversalNeighbors(G, node) {
  const collected = [];
  const remember = (neighborId) => {
    collected.push(neighborId);
  };
  forEachTraversalNeighbor(G, node, remember);
  return collected;
}
197
+ var import_graphology;
198
+ var init_graph = __esm({
199
+ "src/graph.ts"() {
200
+ import_graphology = __toESM(require("graphology"), 1);
201
+ }
202
+ });
203
+
121
204
  // src/build.ts
122
205
  var build_exports = {};
123
206
  __export(build_exports, {
124
207
  build: () => build,
125
208
  buildFromJson: () => buildFromJson
126
209
  });
127
- function buildFromJson(extraction) {
210
+ function buildFromJson(extraction, options) {
128
211
  const errors = validateExtraction(extraction);
129
212
  const realErrors = errors.filter((e) => !e.includes("does not match any node id"));
130
213
  if (realErrors.length > 0) {
@@ -132,7 +215,7 @@ function buildFromJson(extraction) {
132
215
  `[graphify] Extraction warning (${realErrors.length} issues): ${realErrors[0]}`
133
216
  );
134
217
  }
135
- const G = new import_graphology.default({ type: "undirected", multi: false });
218
+ const G = createGraph(options?.directed === true);
136
219
  for (const node of extraction.nodes ?? []) {
137
220
  const { id, ...attrs } = node;
138
221
  G.mergeNode(id, attrs);
@@ -154,7 +237,7 @@ function buildFromJson(extraction) {
154
237
  }
155
238
  return G;
156
239
  }
157
- function build(extractions) {
240
+ function build(extractions, options) {
158
241
  const combined = {
159
242
  nodes: [],
160
243
  edges: [],
@@ -169,12 +252,11 @@ function build(extractions) {
169
252
  combined.input_tokens += ext.input_tokens ?? 0;
170
253
  combined.output_tokens += ext.output_tokens ?? 0;
171
254
  }
172
- return buildFromJson(combined);
255
+ return buildFromJson(combined, options);
173
256
  }
174
- var import_graphology;
175
257
  var init_build = __esm({
176
258
  "src/build.ts"() {
177
- import_graphology = __toESM(require("graphology"), 1);
259
+ init_graph();
178
260
  init_validate();
179
261
  }
180
262
  });
@@ -214,7 +296,7 @@ __export(cluster_exports, {
214
296
  scoreAll: () => scoreAll
215
297
  });
216
298
  function partition(G) {
217
- const result = (0, import_graphology_communities_louvain.default)(G);
299
+ const result = (0, import_graphology_communities_louvain.default)(G.type === "directed" ? toUndirectedGraph(G) : G);
218
300
  const map = /* @__PURE__ */ new Map();
219
301
  for (const [node, cid] of Object.entries(result)) {
220
302
  map.set(node, cid);
@@ -321,11 +403,370 @@ var init_cluster = __esm({
321
403
  "src/cluster.ts"() {
322
404
  import_graphology_communities_louvain = __toESM(require("graphology-communities-louvain"), 1);
323
405
  init_collections();
406
+ init_graph();
324
407
  MAX_COMMUNITY_FRACTION = 0.25;
325
408
  MIN_SPLIT_SIZE = 10;
326
409
  }
327
410
  });
328
411
 
412
+ // src/detect.ts
413
/**
 * Checks a path against the sensitive-file patterns.
 *
 * Every pattern is tried against the base name first and then against the
 * path exactly as given.
 *
 * @param {string} filePath - Path to check.
 * @returns {boolean} True when any pattern matches either form.
 */
function isSensitive(filePath) {
  const fileName = (0, import_node_path.basename)(filePath);
  for (const pattern of SENSITIVE_PATTERNS) {
    if (pattern.test(fileName) || pattern.test(filePath)) {
      return true;
    }
  }
  return false;
}
417
/**
 * Heuristically decides whether a text file reads like an academic paper.
 *
 * Only the first 3000 characters of the decoded file are inspected. The file
 * counts as a paper when at least PAPER_SIGNAL_THRESHOLD of the PAPER_SIGNALS
 * patterns match that excerpt.
 *
 * @param {string} filePath - File to inspect.
 * @returns {boolean} False when the file cannot be read or too few signals match.
 */
function looksLikePaper(filePath) {
  try {
    const excerpt = (0, import_node_fs.readFileSync)(filePath, "utf-8").slice(0, 3e3);
    let matched = 0;
    for (const signal of PAPER_SIGNALS) {
      if (signal.test(excerpt)) {
        matched += 1;
      }
    }
    return matched >= PAPER_SIGNAL_THRESHOLD;
  } catch {
    return false;
  }
}
426
/**
 * Maps a file path to a file-type string based on its lower-cased extension.
 *
 * Checks run in this order: code, paper, image, video, plain document, office.
 * A paper-extension file located under a directory whose name ends with one
 * of the ASSET_DIR_MARKERS is rejected. A plain document is promoted to
 * "paper" when `looksLikePaper` says so.
 *
 * @param {string} filePath - Path to classify.
 * @returns {("code"|"paper"|"image"|"video"|"document"|null)} The type, or null when unsupported.
 */
function classifyFile(filePath) {
  const extension = (0, import_node_path.extname)(filePath).toLowerCase();

  if (CODE_EXTENSIONS.has(extension)) {
    return "code" /* CODE */;
  }

  if (PAPER_EXTENSIONS.has(extension)) {
    const segments = filePath.split(import_node_path.sep);
    const insideAssetDir = segments.some(
      (segment) => [...ASSET_DIR_MARKERS].some((marker) => segment.endsWith(marker))
    );
    return insideAssetDir ? null : "paper" /* PAPER */;
  }

  if (IMAGE_EXTENSIONS.has(extension)) {
    return "image" /* IMAGE */;
  }

  if (VIDEO_EXTENSIONS.has(extension)) {
    return "video" /* VIDEO */;
  }

  if (DOC_EXTENSIONS.has(extension)) {
    return looksLikePaper(filePath) ? "paper" /* PAPER */ : "document" /* DOCUMENT */;
  }

  return OFFICE_EXTENSIONS.has(extension) ? "document" /* DOCUMENT */ : null;
}
443
/**
 * Counts whitespace-separated tokens in a file decoded as UTF-8.
 *
 * @param {string} filePath - File to read.
 * @returns {number} Number of non-empty tokens, or 0 when the file cannot be read.
 */
function countWords(filePath) {
  let contents;
  try {
    contents = (0, import_node_fs.readFileSync)(filePath, "utf-8");
  } catch {
    return 0;
  }
  // Each maximal run of non-whitespace characters is one word.
  const words = contents.match(/\S+/g);
  return words === null ? 0 : words.length;
}
451
/**
 * Tells whether a directory name should be pruned from the walk.
 *
 * A name is noise when it is listed in SKIP_DIRS, ends with "_venv" or
 * "_env", or ends with ".egg-info".
 *
 * @param {string} part - A single directory name (not a full path).
 * @returns {boolean} True when the directory should be skipped.
 */
function isNoiseDir(part) {
  return (
    SKIP_DIRS.has(part) ||
    part.endsWith("_venv") ||
    part.endsWith("_env") ||
    part.endsWith(".egg-info")
  );
}
457
/**
 * Gathers ignore patterns from `.graphifyignore` files.
 *
 * Starts at `root` and climbs parent directories. Every `.graphifyignore`
 * found on the way contributes its trimmed, non-empty, non-comment lines.
 * The climb stops after the first directory that contains `.git`, or at the
 * filesystem root.
 *
 * @param {string} root - Directory where the search begins.
 * @returns {string[]} Patterns in discovery order (nearest directory first).
 */
function loadGraphifyignore(root) {
  const { existsSync, readFileSync } = import_node_fs;
  const { dirname, join, resolve } = import_node_path;

  const collected = [];
  let dir = resolve(root);

  for (;;) {
    const candidate = join(dir, ".graphifyignore");
    if (existsSync(candidate)) {
      const rawLines = readFileSync(candidate, "utf-8").split("\n");
      for (const rawLine of rawLines) {
        const entry = rawLine.trim();
        if (entry === "" || entry.startsWith("#")) {
          continue;
        }
        collected.push(entry);
      }
    }

    // A repository boundary ends the upward search.
    if (existsSync(join(dir, ".git"))) {
      break;
    }

    const up = dirname(dir);
    if (up === dir) {
      break;
    }
    dir = up;
  }

  return collected;
}
481
/**
 * Tests `text` against a simple glob where `*` matches any run of characters
 * (including none) and `?` matches exactly one character. Every other regex
 * metacharacter in the pattern is treated literally, and the whole text must
 * match.
 *
 * @param {string} text - Candidate string.
 * @param {string} pattern - Glob pattern.
 * @returns {boolean} True when the full text matches the pattern.
 */
function matchGlob(text, pattern) {
  const escaped = pattern.replace(/[.+^${}()|[\]\\]/g, "\\$&");
  const withStars = escaped.replace(/\*/g, ".*");
  const source = withStars.replace(/\?/g, ".");
  const matcher = new RegExp(`^${source}$`);
  return matcher.test(text);
}
485
/**
 * Decides whether a path is excluded by any of the ignore patterns.
 *
 * Leading and trailing slashes are stripped from each pattern. The remainder
 * is matched against the root-relative path (forward slashes), the base name,
 * every single path segment, and every leading run of segments.
 *
 * @param {string} filePath - Path being checked.
 * @param {string} root - Directory the relative path is computed from.
 * @param {string[]} patterns - Glob patterns as accepted by `matchGlob`.
 * @returns {boolean} True when at least one pattern matches.
 */
function isIgnored(filePath, root, patterns) {
  if (patterns.length === 0) {
    return false;
  }

  let relPath;
  try {
    relPath = (0, import_node_path.relative)(root, filePath).replace(/\\/g, "/");
  } catch {
    return false;
  }

  const segments = relPath.split("/");
  const fileName = (0, import_node_path.basename)(filePath);

  for (const rawPattern of patterns) {
    const glob = rawPattern.replace(/^\/+|\/+$/g, "");
    if (!glob) {
      continue;
    }
    if (matchGlob(relPath, glob) || matchGlob(fileName, glob)) {
      return true;
    }
    const hitsSegment = segments.some(
      (segment, idx) =>
        matchGlob(segment, glob) || matchGlob(segments.slice(0, idx + 1).join("/"), glob)
    );
    if (hitsSegment) {
      return true;
    }
  }
  return false;
}
506
/**
 * Recursively lists the regular files below `dir`.
 *
 * Results are appended to one shared array while walking. Spreading a whole
 * subtree into `push(...)` passes every path as a separate call argument,
 * which can throw a RangeError on very large trees and copies the results
 * again at every directory level.
 *
 * @param {string} dir - Directory to walk.
 * @param {string} root - Root that ignore patterns are evaluated against.
 * @param {string[]} ignorePatterns - Patterns forwarded to `isIgnored`.
 * @param {boolean} followSymlinks - When true entries are inspected with `statSync`, otherwise with `lstatSync`.
 * @param {boolean} skipPrune - When true no directory is pruned (dot directories, noise directories and ignored directories are all entered).
 * @returns {string[]} File paths in depth-first order, following the order returned by `readdirSync`.
 */
function walkDir(dir, root, ignorePatterns, followSymlinks, skipPrune) {
  const result = [];

  const visit = (current) => {
    let entries;
    try {
      entries = (0, import_node_fs.readdirSync)(current);
    } catch {
      // Unreadable directories contribute nothing.
      return;
    }

    for (const entry of entries) {
      const full = (0, import_node_path.join)(current, entry);

      let stat;
      try {
        stat = followSymlinks ? (0, import_node_fs.statSync)(full) : (0, import_node_fs.lstatSync)(full);
      } catch {
        // Entries that cannot be inspected (for example broken links) are skipped.
        continue;
      }

      if (stat.isDirectory()) {
        if (!skipPrune) {
          if (entry.startsWith(".")) continue;
          if (isNoiseDir(entry)) continue;
          if (isIgnored(full, root, ignorePatterns)) continue;
        }
        visit(full);
      } else if (stat.isFile()) {
        result.push(full);
      }
    }
  };

  visit(dir);
  return result;
}
535
/**
 * Scans `root` and groups supported files by type.
 *
 * Files come from a pruned walk of the root plus an unpruned walk of
 * `graphify-out/memory` when that directory exists. Outside the memory
 * directory, dot files and anything under `graphify-out/converted` are
 * skipped. Ignored and sensitive files are skipped everywhere. Office files
 * are reported through `skipped_sensitive` with an explanatory suffix instead
 * of being classified.
 *
 * Directory membership is tested against `dir + path.sep`. A bare
 * `startsWith(dir)` also matches sibling paths that merely share the prefix,
 * such as `graphify-out/memory-old/...`.
 *
 * @param {string} root - Directory to scan.
 * @param {{ followSymlinks?: boolean }} [options] - `followSymlinks` defaults to false.
 * @returns {{files: object, total_files: number, total_words: number, needs_graph: boolean, warning: (string|null), skipped_sensitive: string[], graphifyignore_patterns: number}} Scan summary.
 */
function detect(root, options) {
  const followSymlinks = options?.followSymlinks ?? false;
  const rootResolved = (0, import_node_path.resolve)(root);
  const ignorePatterns = loadGraphifyignore(rootResolved);
  const convertedDir = (0, import_node_path.join)(rootResolved, "graphify-out", "converted");
  const memoryDir = (0, import_node_path.join)(rootResolved, "graphify-out", "memory");
  const hasMemoryDir = (0, import_node_fs.existsSync)(memoryDir);
  // True only for paths strictly inside `dir`, never for look-alike siblings.
  const isInside = (candidate, dir) => candidate.startsWith(dir + import_node_path.sep);

  const files = {
    code: [],
    document: [],
    paper: [],
    image: [],
    video: []
  };
  let totalWords = 0;
  const skippedSensitive = [];

  const allFiles = walkDir(rootResolved, rootResolved, ignorePatterns, followSymlinks, false);
  if (hasMemoryDir) {
    // Appended one by one: spreading a large list into push() can exceed the argument limit.
    for (const memoryFile of walkDir(memoryDir, rootResolved, ignorePatterns, followSymlinks, true)) {
      allFiles.push(memoryFile);
    }
  }

  const seen = /* @__PURE__ */ new Set();
  for (const p of allFiles) {
    if (seen.has(p)) continue;
    seen.add(p);

    const inMemory = hasMemoryDir && isInside(p, memoryDir);
    if (!inMemory) {
      if ((0, import_node_path.basename)(p).startsWith(".")) continue;
      if (isInside(p, convertedDir)) continue;
    }
    if (isIgnored(p, rootResolved, ignorePatterns)) continue;
    if (isSensitive(p)) {
      skippedSensitive.push(p);
      continue;
    }

    const ftype = classifyFile(p);
    if (!ftype) continue;
    if (OFFICE_EXTENSIONS.has((0, import_node_path.extname)(p).toLowerCase())) {
      skippedSensitive.push(p + " [office conversion requires async - use pipeline]");
      continue;
    }

    files[ftype].push(p);
    if (ftype !== "video" /* VIDEO */) {
      totalWords += countWords(p);
    }
  }

  const totalFiles = Object.values(files).reduce((s, v) => s + v.length, 0);
  const needsGraph = totalWords >= CORPUS_WARN_THRESHOLD;
  let warning = null;
  if (!needsGraph) {
    warning = `Corpus is ~${totalWords.toLocaleString()} words - fits in a single context window. You may not need a graph.`;
  } else if (totalWords >= CORPUS_UPPER_THRESHOLD || totalFiles >= FILE_COUNT_UPPER) {
    warning = `Large corpus: ${totalFiles} files \xB7 ~${totalWords.toLocaleString()} words. Semantic extraction will be expensive (many Claude tokens). Consider running on a subfolder, or use --no-semantic to run AST-only.`;
  }

  return {
    files,
    total_files: totalFiles,
    total_words: totalWords,
    needs_graph: needsGraph,
    warning,
    skipped_sensitive: skippedSensitive,
    graphifyignore_patterns: ignorePatterns.length
  };
}
597
/**
 * Reads and parses the manifest JSON file.
 *
 * @param {string} [manifestPath=MANIFEST_PATH] - Location of the manifest.
 * @returns {object} The parsed manifest, or an empty object when the file is missing, unreadable or not valid JSON.
 */
function loadManifest(manifestPath = MANIFEST_PATH) {
  let parsed;
  try {
    const serialized = (0, import_node_fs.readFileSync)(manifestPath, "utf-8");
    parsed = JSON.parse(serialized);
  } catch {
    return {};
  }
  return parsed;
}
604
/**
 * Writes a manifest that maps every listed file to its current `mtimeMs`.
 *
 * Files whose stat call fails are left out. The parent directory of the
 * manifest is created when it does not exist yet.
 *
 * @param {Object<string, string[]>} files - File lists grouped by type; only the values are used.
 * @param {string} [manifestPath=MANIFEST_PATH] - Destination of the JSON manifest.
 * @returns {void}
 */
function saveManifest(files, manifestPath = MANIFEST_PATH) {
  const { mkdirSync, statSync, writeFileSync } = import_node_fs;

  const mtimes = {};
  for (const group of Object.values(files)) {
    for (const trackedPath of group) {
      let modified;
      try {
        modified = statSync(trackedPath).mtimeMs;
      } catch {
        // A file that vanished or cannot be inspected is simply not recorded.
        continue;
      }
      mtimes[trackedPath] = modified;
    }
  }

  const parentDir = (0, import_node_path.join)(manifestPath, "..");
  mkdirSync(parentDir, { recursive: true });
  writeFileSync(manifestPath, JSON.stringify(mtimes, null, 2));
}
618
/**
 * Runs `detect` and splits the result into new and unchanged files by
 * comparing current modification times with the stored manifest.
 *
 * With an empty manifest every detected file is reported as new and no
 * `deleted_files` key is produced. Otherwise a file is new when it has no
 * manifest entry or its current `mtimeMs` is greater than the stored value.
 * A failed stat counts as an mtime of 0.
 *
 * @param {string} root - Directory to scan.
 * @param {string} [manifestPath=MANIFEST_PATH] - Manifest written by a previous run.
 * @returns {object} The `detect` result extended with `incremental`, `new_files`, `unchanged_files`, `new_total` and, when a manifest exists, `deleted_files`.
 */
function detectIncremental(root, manifestPath = MANIFEST_PATH) {
  const full = detect(root);
  const manifest = loadManifest(manifestPath);
  const categories = Object.keys(full.files);

  if (Object.keys(manifest).length === 0) {
    return {
      ...full,
      incremental: true,
      new_files: full.files,
      unchanged_files: Object.fromEntries(categories.map((category) => [category, []])),
      new_total: full.total_files
    };
  }

  const newFiles = {};
  const unchangedFiles = {};
  for (const category of categories) {
    newFiles[category] = [];
    unchangedFiles[category] = [];
  }

  const present = new Set();
  let newTotal = 0;
  for (const category of categories) {
    for (const filePath of full.files[category]) {
      present.add(filePath);

      const storedMtime = manifest[filePath];
      let currentMtime = 0;
      try {
        currentMtime = (0, import_node_fs.statSync)(filePath).mtimeMs;
      } catch {
        // Keep the 0 default when the file cannot be inspected.
      }

      const isNew = storedMtime === void 0 || currentMtime > storedMtime;
      if (isNew) {
        newFiles[category].push(filePath);
        newTotal += 1;
      } else {
        unchangedFiles[category].push(filePath);
      }
    }
  }

  // Manifest entries that no longer show up in the scan are reported as deleted.
  const deletedFiles = Object.keys(manifest).filter((filePath) => !present.has(filePath));

  return {
    ...full,
    incremental: true,
    new_files: newFiles,
    unchanged_files: unchangedFiles,
    new_total: newTotal,
    deleted_files: deletedFiles
  };
}
663
+ var import_node_fs, import_node_path, import_node_crypto, MANIFEST_PATH, CODE_EXTENSIONS, DOC_EXTENSIONS, PAPER_EXTENSIONS, IMAGE_EXTENSIONS, OFFICE_EXTENSIONS, VIDEO_EXTENSIONS, CORPUS_WARN_THRESHOLD, CORPUS_UPPER_THRESHOLD, FILE_COUNT_UPPER, SENSITIVE_PATTERNS, PAPER_SIGNALS, PAPER_SIGNAL_THRESHOLD, ASSET_DIR_MARKERS, SKIP_DIRS;
664
+ var init_detect = __esm({
665
+ "src/detect.ts"() {
666
+ import_node_fs = require("fs");
667
+ import_node_path = require("path");
668
+ import_node_crypto = require("crypto");
669
+ init_types();
670
+ MANIFEST_PATH = "graphify-out/manifest.json";
671
+ CODE_EXTENSIONS = /* @__PURE__ */ new Set([
672
+ ".py",
673
+ ".ts",
674
+ ".js",
675
+ ".jsx",
676
+ ".tsx",
677
+ ".go",
678
+ ".rs",
679
+ ".java",
680
+ ".cpp",
681
+ ".cc",
682
+ ".cxx",
683
+ ".c",
684
+ ".h",
685
+ ".hpp",
686
+ ".rb",
687
+ ".swift",
688
+ ".kt",
689
+ ".kts",
690
+ ".cs",
691
+ ".scala",
692
+ ".php",
693
+ ".lua",
694
+ ".toc",
695
+ ".zig",
696
+ ".ps1",
697
+ ".ex",
698
+ ".exs",
699
+ ".m",
700
+ ".mm",
701
+ ".jl"
702
+ ]);
703
+ DOC_EXTENSIONS = /* @__PURE__ */ new Set([".md", ".txt", ".rst"]);
704
+ PAPER_EXTENSIONS = /* @__PURE__ */ new Set([".pdf"]);
705
+ IMAGE_EXTENSIONS = /* @__PURE__ */ new Set([".png", ".jpg", ".jpeg", ".gif", ".webp", ".svg"]);
706
+ OFFICE_EXTENSIONS = /* @__PURE__ */ new Set([".docx", ".xlsx"]);
707
+ VIDEO_EXTENSIONS = /* @__PURE__ */ new Set([
708
+ ".mp4",
709
+ ".mov",
710
+ ".webm",
711
+ ".mkv",
712
+ ".avi",
713
+ ".m4v",
714
+ ".mp3",
715
+ ".wav",
716
+ ".m4a",
717
+ ".ogg"
718
+ ]);
719
+ CORPUS_WARN_THRESHOLD = 5e4;
720
+ CORPUS_UPPER_THRESHOLD = 5e5;
721
+ FILE_COUNT_UPPER = 200;
722
+ SENSITIVE_PATTERNS = [
723
+ /(^|[\\/])\.(env|envrc)(\.|$)/i,
724
+ /\.(pem|key|p12|pfx|cert|crt|der|p8)$/i,
725
+ /(credential|secret|passwd|password|token|private_key)/i,
726
+ /(id_rsa|id_dsa|id_ecdsa|id_ed25519)(\.pub)?$/,
727
+ /(\.netrc|\.pgpass|\.htpasswd)$/i,
728
+ /(aws_credentials|gcloud_credentials|service.account)/i
729
+ ];
730
+ PAPER_SIGNALS = [
731
+ /\barxiv\b/i,
732
+ /\bdoi\s*:/i,
733
+ /\babstract\b/i,
734
+ /\bproceedings\b/i,
735
+ /\bjournal\b/i,
736
+ /\bpreprint\b/i,
737
+ /\\cite\{/,
738
+ /\[\d+\]/,
739
+ /\[\n\d+\n\]/,
740
+ /eq\.\s*\d+|equation\s+\d+/i,
741
+ /\d{4}\.\d{4,5}/,
742
+ /\bwe propose\b/i,
743
+ /\bliterature\b/i
744
+ ];
745
+ PAPER_SIGNAL_THRESHOLD = 3;
746
+ ASSET_DIR_MARKERS = /* @__PURE__ */ new Set([".imageset", ".xcassets", ".appiconset", ".colorset", ".launchimage"]);
747
+ SKIP_DIRS = /* @__PURE__ */ new Set([
748
+ "venv",
749
+ ".venv",
750
+ "env",
751
+ ".env",
752
+ "node_modules",
753
+ "__pycache__",
754
+ ".git",
755
+ "dist",
756
+ "build",
757
+ "target",
758
+ "out",
759
+ "site-packages",
760
+ "lib64",
761
+ ".pytest_cache",
762
+ ".mypy_cache",
763
+ ".ruff_cache",
764
+ ".tox",
765
+ ".eggs"
766
+ ]);
767
+ }
768
+ });
769
+
329
770
  // src/analyze.ts
330
771
  var analyze_exports = {};
331
772
  __export(analyze_exports, {
@@ -366,10 +807,11 @@ function isConceptNode(G, nodeId) {
366
807
  return false;
367
808
  }
368
809
  function fileCategory(path) {
369
- const ext = path.includes(".") ? path.split(".").pop()?.toLowerCase() ?? "" : "";
810
+ const ext = path.includes(".") ? `.${path.split(".").pop()?.toLowerCase() ?? ""}` : "";
370
811
  if (CODE_EXTENSIONS.has(ext)) return "code";
371
812
  if (PAPER_EXTENSIONS.has(ext)) return "paper";
372
813
  if (IMAGE_EXTENSIONS.has(ext)) return "image";
814
+ if (DOC_EXTENSIONS.has(ext)) return "doc";
373
815
  return "doc";
374
816
  }
375
817
  function topLevelDir(path) {
@@ -563,10 +1005,10 @@ function suggestQuestions(G, communities, communityLabels, topN = 7) {
563
1005
  const cid = nodeCommunity.get(nodeId);
564
1006
  const commLabel = cid !== void 0 ? labelMap.get(cid) ?? `Community ${cid}` : "unknown";
565
1007
  const neighborComms = /* @__PURE__ */ new Set();
566
- G.forEachNeighbor(nodeId, (n) => {
1008
+ for (const n of traversalNeighbors(G, nodeId)) {
567
1009
  const nc = nodeCommunity.get(n);
568
1010
  if (nc !== void 0 && nc !== cid) neighborComms.add(nc);
569
- });
1011
+ }
570
1012
  if (neighborComms.size > 0) {
571
1013
  const otherLabels = [...neighborComms].map((c) => labelMap.get(c) ?? `Community ${c}`);
572
1014
  questions.push({
@@ -695,31 +1137,14 @@ function graphDiff(GOld, GNew) {
695
1137
  summary: parts.length > 0 ? parts.join(", ") : "no changes"
696
1138
  };
697
1139
  }
698
- var import_betweenness, CODE_EXTENSIONS, PAPER_EXTENSIONS, IMAGE_EXTENSIONS;
1140
+ var import_betweenness;
699
1141
  var init_analyze = __esm({
700
1142
  "src/analyze.ts"() {
701
1143
  import_betweenness = __toESM(require("graphology-metrics/centrality/betweenness.js"), 1);
702
1144
  init_collections();
703
1145
  init_cluster();
704
- CODE_EXTENSIONS = /* @__PURE__ */ new Set([
705
- "py",
706
- "ts",
707
- "tsx",
708
- "js",
709
- "go",
710
- "rs",
711
- "java",
712
- "rb",
713
- "cpp",
714
- "c",
715
- "h",
716
- "cs",
717
- "kt",
718
- "scala",
719
- "php"
720
- ]);
721
- PAPER_EXTENSIONS = /* @__PURE__ */ new Set(["pdf"]);
722
- IMAGE_EXTENSIONS = /* @__PURE__ */ new Set(["png", "jpg", "jpeg", "webp", "gif", "svg"]);
1146
+ init_graph();
1147
+ init_detect();
723
1148
  }
724
1149
  });
725
1150
 
@@ -983,19 +1408,19 @@ async function safeFetchText(url, maxBytes = MAX_TEXT_BYTES, timeout = 15e3) {
983
1408
  return raw.toString("utf-8");
984
1409
  }
985
1410
  function validateGraphPath(filePath, base) {
986
- const resolvedBase = (0, import_node_path.resolve)(base ?? "graphify-out");
987
- if (!(0, import_node_fs.existsSync)(resolvedBase)) {
1411
+ const resolvedBase = (0, import_node_path2.resolve)(base ?? "graphify-out");
1412
+ if (!(0, import_node_fs2.existsSync)(resolvedBase)) {
988
1413
  throw new Error(
989
1414
  `Graph base directory does not exist: ${resolvedBase}. Run the graphify skill first to build the graph (for Codex: $graphify .).`
990
1415
  );
991
1416
  }
992
- const resolved = (0, import_node_path.resolve)(filePath);
1417
+ const resolved = (0, import_node_path2.resolve)(filePath);
993
1418
  if (!resolved.startsWith(resolvedBase + "/") && resolved !== resolvedBase) {
994
1419
  throw new Error(
995
1420
  `Path '${filePath}' escapes the allowed directory ${resolvedBase}. Only paths inside graphify-out/ are permitted.`
996
1421
  );
997
1422
  }
998
- if (!(0, import_node_fs.existsSync)(resolved)) {
1423
+ if (!(0, import_node_fs2.existsSync)(resolved)) {
999
1424
  throw new Error(`Graph file not found: ${resolved}`);
1000
1425
  }
1001
1426
  return resolved;
@@ -1010,11 +1435,11 @@ function sanitizeLabel(text) {
1010
1435
  function escapeHtml(text) {
1011
1436
  return text.replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;").replace(/"/g, "&quot;").replace(/'/g, "&#39;");
1012
1437
  }
1013
- var import_node_path, import_node_fs, import_node_url, dns, net, ALLOWED_SCHEMES, MAX_FETCH_BYTES, MAX_TEXT_BYTES, BLOCKED_HOSTS, CONTROL_CHAR_RE, MAX_LABEL_LEN;
1438
+ var import_node_path2, import_node_fs2, import_node_url, dns, net, ALLOWED_SCHEMES, MAX_FETCH_BYTES, MAX_TEXT_BYTES, BLOCKED_HOSTS, CONTROL_CHAR_RE, MAX_LABEL_LEN;
1014
1439
  var init_security = __esm({
1015
1440
  "src/security.ts"() {
1016
- import_node_path = require("path");
1017
- import_node_fs = require("fs");
1441
+ import_node_path2 = require("path");
1442
+ import_node_fs2 = require("fs");
1018
1443
  import_node_url = require("url");
1019
1444
  dns = __toESM(require("dns/promises"), 1);
1020
1445
  net = __toESM(require("net"), 1);
@@ -1065,14 +1490,17 @@ function normalizeCommunityLabels(labelsOrOptions) {
1065
1490
  }
1066
1491
  return toNumericMap(labelsOrOptions.communityLabels);
1067
1492
  }
1068
- function toJson(G, communities, outputPath) {
1493
+ function toJson(G, communities, outputPath, communityLabelsOrOptions) {
1069
1494
  const nodeComm = nodeCommunityMap2(communities);
1495
+ const communityLabels = normalizeCommunityLabels(communityLabelsOrOptions);
1070
1496
  const nodes = [];
1071
1497
  G.forEachNode((nodeId, attrs) => {
1498
+ const communityId = nodeComm.get(nodeId) ?? null;
1072
1499
  nodes.push({
1073
1500
  id: nodeId,
1074
1501
  ...attrs,
1075
- community: nodeComm.get(nodeId) ?? null
1502
+ community: communityId,
1503
+ community_name: communityId !== null ? sanitizeLabel(communityLabels?.get(communityId) ?? `Community ${communityId}`) : null
1076
1504
  });
1077
1505
  });
1078
1506
  const links = [];
@@ -1089,15 +1517,20 @@ function toJson(G, communities, outputPath) {
1089
1517
  links.push(link);
1090
1518
  });
1091
1519
  const hyperedges = G.getAttribute("hyperedges") ?? [];
1520
+ const communityLabelsObject = communityLabels ? Object.fromEntries(
1521
+ [...communityLabels.entries()].sort((a, b) => a[0] - b[0]).map(([cid, label]) => [String(cid), sanitizeLabel(label)])
1522
+ ) : {};
1092
1523
  const output = {
1093
- directed: false,
1524
+ directed: isDirectedGraph(G),
1094
1525
  multigraph: false,
1095
- graph: {},
1526
+ graph: {
1527
+ community_labels: communityLabelsObject
1528
+ },
1096
1529
  nodes,
1097
1530
  links,
1098
1531
  hyperedges
1099
1532
  };
1100
- (0, import_node_fs2.writeFileSync)(outputPath, JSON.stringify(output, null, 2), "utf-8");
1533
+ (0, import_node_fs3.writeFileSync)(outputPath, JSON.stringify(output, null, 2), "utf-8");
1101
1534
  }
1102
1535
  function toCypher(G, outputPath) {
1103
1536
  const lines = ["// Neo4j Cypher import - generated by the graphify skill", ""];
@@ -1119,7 +1552,7 @@ function toCypher(G, outputPath) {
1119
1552
  `MATCH (a {id: '${uEsc}'}), (b {id: '${vEsc}'}) MERGE (a)-[:${rel} {confidence: '${conf}'}]->(b);`
1120
1553
  );
1121
1554
  });
1122
- (0, import_node_fs2.writeFileSync)(outputPath, lines.join("\n"), "utf-8");
1555
+ (0, import_node_fs3.writeFileSync)(outputPath, lines.join("\n"), "utf-8");
1123
1556
  }
1124
1557
  function neo4jLabel(label) {
1125
1558
  const sanitized = label.replace(/[^A-Za-z0-9_]/g, "");
@@ -1358,9 +1791,24 @@ function focusNode(nodeId) {
1358
1791
  showInfo(nodeId);
1359
1792
  }
1360
1793
 
1794
+ let hoveredNodeId = null;
1795
+ network.on('hoverNode', params => {
1796
+ hoveredNodeId = params.node;
1797
+ container.style.cursor = 'pointer';
1798
+ });
1799
+ network.on('blurNode', () => {
1800
+ hoveredNodeId = null;
1801
+ container.style.cursor = 'default';
1802
+ });
1803
+ container.addEventListener('click', () => {
1804
+ if (hoveredNodeId !== null) {
1805
+ showInfo(hoveredNodeId);
1806
+ network.selectNodes([hoveredNodeId]);
1807
+ }
1808
+ });
1361
1809
  network.on('click', params => {
1362
1810
  if (params.nodes.length > 0) showInfo(params.nodes[0]);
1363
- else document.getElementById('info-content').innerHTML = '<span class="empty">Click a node to inspect it</span>';
1811
+ else if (hoveredNodeId === null) document.getElementById('info-content').innerHTML = '<span class="empty">Click a node to inspect it</span>';
1364
1812
  });
1365
1813
 
1366
1814
  const searchInput = document.getElementById('search');
@@ -1515,7 +1963,7 @@ ${htmlScript(nodesJson, edgesJson, legendJson)}
1515
1963
  ${hyperedgeScript(hyperedgesJson)}
1516
1964
  </body>
1517
1965
  </html>`;
1518
- (0, import_node_fs2.writeFileSync)(outputPath, html, "utf-8");
1966
+ (0, import_node_fs3.writeFileSync)(outputPath, html, "utf-8");
1519
1967
  }
1520
1968
  function toGraphml(G, communities, outputPath) {
1521
1969
  const nodeComm = nodeCommunityMap2(communities);
@@ -1531,7 +1979,7 @@ function toGraphml(G, communities, outputPath) {
1531
1979
  lines.push(' <key id="community" for="node" attr.name="community" attr.type="int"/>');
1532
1980
  lines.push(' <key id="relation" for="edge" attr.name="relation" attr.type="string"/>');
1533
1981
  lines.push(' <key id="confidence" for="edge" attr.name="confidence" attr.type="string"/>');
1534
- lines.push(' <graph id="G" edgedefault="undirected">');
1982
+ lines.push(` <graph id="G" edgedefault="${isDirectedGraph(G) ? "directed" : "undirected"}">`);
1535
1983
  G.forEachNode((nodeId, data) => {
1536
1984
  lines.push(` <node id="${xmlEsc(nodeId)}">`);
1537
1985
  lines.push(` <data key="label">${xmlEsc(data.label ?? nodeId)}</data>`);
@@ -1548,7 +1996,7 @@ function toGraphml(G, communities, outputPath) {
1548
1996
  });
1549
1997
  lines.push(" </graph>");
1550
1998
  lines.push("</graphml>");
1551
- (0, import_node_fs2.writeFileSync)(outputPath, lines.join("\n"), "utf-8");
1999
+ (0, import_node_fs3.writeFileSync)(outputPath, lines.join("\n"), "utf-8");
1552
2000
  }
1553
2001
  function toSvg(G, communities, outputPath, communityLabelsOrOptions, figsize = [20, 14]) {
1554
2002
  const communityMap = toNumericMap(communities);
@@ -1621,7 +2069,7 @@ function toSvg(G, communities, outputPath, communityLabelsOrOptions, figsize = [
1621
2069
  }
1622
2070
  }
1623
2071
  svgParts.push("</svg>");
1624
- (0, import_node_fs2.writeFileSync)(outputPath, svgParts.join("\n"), "utf-8");
2072
+ (0, import_node_fs3.writeFileSync)(outputPath, svgParts.join("\n"), "utf-8");
1625
2073
  }
1626
2074
  function toCanvas(G, communities, outputPath, communityLabelsOrOptions, nodeFilenames) {
1627
2075
  const communityMap = toNumericMap(communities);
@@ -1630,7 +2078,7 @@ function toCanvas(G, communities, outputPath, communityLabelsOrOptions, nodeFile
1630
2078
  const providedNodeFilenames = options?.nodeFilenames ?? nodeFilenames;
1631
2079
  const CANVAS_COLORS = ["1", "2", "3", "4", "5", "6"];
1632
2080
  function safeName(label) {
1633
- return label.replace(/[\\/*?:"<>|#^[\]]/g, "").trim() || "unnamed";
2081
+ return label.replace(/\r\n/g, " ").replace(/\r/g, " ").replace(/\n/g, " ").replace(/[\\/*?:"<>|#^[\]]/g, "").trim() || "unnamed";
1634
2082
  }
1635
2083
  let filenameMap;
1636
2084
  if (!providedNodeFilenames) {
@@ -1709,13 +2157,13 @@ function toCanvas(G, communities, outputPath, communityLabelsOrOptions, nodeFile
1709
2157
  for (let idx = 0; idx < sortedCids.length; idx++) {
1710
2158
  const cid = sortedCids[idx];
1711
2159
  const members = communityMap.get(cid) ?? [];
1712
- const communityName = communityLabels?.get(cid) ?? `Community ${cid}`;
2160
+ const communityName2 = communityLabels?.get(cid) ?? `Community ${cid}`;
1713
2161
  const [gx, gy, gw, gh] = groupLayout.get(cid) ?? [0, 0, 600, 400];
1714
2162
  const canvasColor = CANVAS_COLORS[idx % CANVAS_COLORS.length];
1715
2163
  canvasNodes.push({
1716
2164
  id: `g${cid}`,
1717
2165
  type: "group",
1718
- label: communityName,
2166
+ label: communityName2,
1719
2167
  x: gx,
1720
2168
  y: gy,
1721
2169
  width: gw,
@@ -1765,13 +2213,14 @@ function toCanvas(G, communities, outputPath, communityLabelsOrOptions, nodeFile
1765
2213
  });
1766
2214
  }
1767
2215
  const canvasData = { nodes: canvasNodes, edges: canvasEdges };
1768
- (0, import_node_fs2.writeFileSync)(outputPath, JSON.stringify(canvasData, null, 2), "utf-8");
2216
+ (0, import_node_fs3.writeFileSync)(outputPath, JSON.stringify(canvasData, null, 2), "utf-8");
1769
2217
  }
1770
- var import_node_fs2, COMMUNITY_COLORS, MAX_NODES_FOR_VIZ, CONFIDENCE_SCORE_DEFAULTS;
2218
+ var import_node_fs3, COMMUNITY_COLORS, MAX_NODES_FOR_VIZ, CONFIDENCE_SCORE_DEFAULTS;
1771
2219
  var init_export = __esm({
1772
2220
  "src/export.ts"() {
1773
- import_node_fs2 = require("fs");
2221
+ import_node_fs3 = require("fs");
1774
2222
  init_security();
2223
+ init_graph();
1775
2224
  init_collections();
1776
2225
  COMMUNITY_COLORS = [
1777
2226
  "#4E79A7",
@@ -1795,8 +2244,20 @@ var init_export = __esm({
1795
2244
  });
1796
2245
 
1797
2246
  // src/cache.ts
2247
/**
 * Drops a leading `---` delimited front-matter section from a buffer.
 *
 * The input is returned untouched when its text does not start with `---` or
 * when no later line starting with `---` closes the section. Otherwise the
 * result is everything after that closing `---`.
 *
 * @param {Buffer} content - Raw file contents.
 * @returns {Buffer} The original buffer, or a new buffer holding only the body.
 */
function bodyContent(content) {
  const decoded = content.toString("utf-8");
  const closing = decoded.startsWith("---") ? decoded.indexOf("\n---", 3) : -1;
  if (closing === -1) {
    return content;
  }
  // Skip the newline plus the three dashes of the closing delimiter.
  const body = decoded.slice(closing + 4);
  return Buffer.from(body, "utf-8");
}
1798
2258
  function fileHash(filePath) {
1799
- const content = (0, import_node_fs5.readFileSync)(filePath);
2259
+ const raw = (0, import_node_fs5.readFileSync)(filePath);
2260
+ const content = (0, import_node_path4.extname)(filePath).toLowerCase() === ".md" ? bodyContent(raw) : raw;
1800
2261
  const resolved = (0, import_node_path4.resolve)(filePath);
1801
2262
  const h = (0, import_node_crypto2.createHash)("sha256");
1802
2263
  h.update(content);
@@ -2744,10 +3205,10 @@ async function _extractGeneric(filePath, config) {
2744
3205
  source: callerNid,
2745
3206
  target: tgtNid,
2746
3207
  relation: "calls",
2747
- confidence: "INFERRED",
3208
+ confidence: "EXTRACTED",
2748
3209
  source_file: strPath,
2749
3210
  source_location: `L${line}`,
2750
- weight: 0.8
3211
+ weight: 1
2751
3212
  });
2752
3213
  }
2753
3214
  }
@@ -3270,10 +3731,10 @@ async function extractGo(filePath) {
3270
3731
  source: callerNid,
3271
3732
  target: tgtNid,
3272
3733
  relation: "calls",
3273
- confidence: "INFERRED",
3734
+ confidence: "EXTRACTED",
3274
3735
  source_file: strPath,
3275
3736
  source_location: `L${line}`,
3276
- weight: 0.8
3737
+ weight: 1
3277
3738
  });
3278
3739
  }
3279
3740
  }
@@ -3425,10 +3886,10 @@ async function extractRust(filePath) {
3425
3886
  source: callerNid,
3426
3887
  target: tgtNid,
3427
3888
  relation: "calls",
3428
- confidence: "INFERRED",
3889
+ confidence: "EXTRACTED",
3429
3890
  source_file: strPath,
3430
3891
  source_location: `L${line}`,
3431
- weight: 0.8
3892
+ weight: 1
3432
3893
  });
3433
3894
  }
3434
3895
  }
@@ -3587,7 +4048,7 @@ async function extractZig(filePath) {
3587
4048
  const pair = `${callerNid}|${tgtNid}`;
3588
4049
  if (!seenCallPairs.has(pair)) {
3589
4050
  seenCallPairs.add(pair);
3590
- addEdge(callerNid, tgtNid, "calls", node.startPosition.row + 1, "INFERRED", 0.8);
4051
+ addEdge(callerNid, tgtNid, "calls", node.startPosition.row + 1, "EXTRACTED", 1);
3591
4052
  }
3592
4053
  }
3593
4054
  }
@@ -3770,7 +4231,7 @@ async function extractPowershell(filePath) {
3770
4231
  const pair = `${callerNid}|${tgtNid}`;
3771
4232
  if (!seenCallPairs.has(pair)) {
3772
4233
  seenCallPairs.add(pair);
3773
- addEdge(callerNid, tgtNid, "calls", node.startPosition.row + 1, "INFERRED", 0.8);
4234
+ addEdge(callerNid, tgtNid, "calls", node.startPosition.row + 1, "EXTRACTED", 1);
3774
4235
  }
3775
4236
  }
3776
4237
  }
@@ -3976,7 +4437,7 @@ async function extractObjc(filePath) {
3976
4437
  const pair = `${callerNid}|${candidate}`;
3977
4438
  if (!seenCalls.has(pair) && callerNid !== candidate) {
3978
4439
  seenCalls.add(pair);
3979
- addEdge(callerNid, candidate, "calls", bodyNode.startPosition.row + 1, "INFERRED", 0.8);
4440
+ addEdge(callerNid, candidate, "calls", bodyNode.startPosition.row + 1, "EXTRACTED", 1);
3980
4441
  }
3981
4442
  }
3982
4443
  }
@@ -4167,7 +4628,7 @@ async function extractElixir(filePath) {
4167
4628
  const pair = `${callerNid}|${tgtNid}`;
4168
4629
  if (!seenCallPairs.has(pair)) {
4169
4630
  seenCallPairs.add(pair);
4170
- addEdge(callerNid, tgtNid, "calls", node.startPosition.row + 1, "INFERRED", 0.8);
4631
+ addEdge(callerNid, tgtNid, "calls", node.startPosition.row + 1, "EXTRACTED", 1);
4171
4632
  }
4172
4633
  }
4173
4634
  }
@@ -4696,8 +5157,10 @@ var src_exports = {};
4696
5157
  __export(src_exports, {
4697
5158
  FileType: () => FileType,
4698
5159
  assertValid: () => assertValid,
5160
+ augmentDetectionWithTranscripts: () => augmentDetectionWithTranscripts,
4699
5161
  build: () => build,
4700
5162
  buildFromJson: () => buildFromJson,
5163
+ buildWhisperPrompt: () => buildWhisperPrompt,
4701
5164
  checkSemanticCache: () => checkSemanticCache,
4702
5165
  classifyFile: () => classifyFile,
4703
5166
  cluster: () => cluster,
@@ -4705,6 +5168,7 @@ __export(src_exports, {
4705
5168
  collectFiles: () => collectFiles,
4706
5169
  detect: () => detect,
4707
5170
  detectIncremental: () => detectIncremental,
5171
+ downloadAudio: () => downloadAudio,
4708
5172
  extract: () => extract,
4709
5173
  fileHash: () => fileHash,
4710
5174
  generateReport: () => generate,
@@ -4734,23 +5198,15 @@ __export(src_exports, {
4734
5198
  toJson: () => toJson,
4735
5199
  toSvg: () => toSvg,
4736
5200
  toWiki: () => toWiki,
5201
+ transcribe: () => transcribe,
5202
+ transcribeAll: () => transcribeAll,
4737
5203
  validateExtraction: () => validateExtraction,
4738
5204
  validateGraphPath: () => validateGraphPath,
4739
5205
  validateUrl: () => validateUrl,
4740
5206
  watch: () => watch
4741
5207
  });
4742
5208
  module.exports = __toCommonJS(src_exports);
4743
-
4744
- // src/types.ts
4745
- var FileType = /* @__PURE__ */ ((FileType2) => {
4746
- FileType2["CODE"] = "code";
4747
- FileType2["DOCUMENT"] = "document";
4748
- FileType2["PAPER"] = "paper";
4749
- FileType2["IMAGE"] = "image";
4750
- return FileType2;
4751
- })(FileType || {});
4752
-
4753
- // src/index.ts
5209
+ init_types();
4754
5210
  init_validate();
4755
5211
  init_build();
4756
5212
  init_cluster();
@@ -4759,23 +5215,24 @@ init_report();
4759
5215
  init_export();
4760
5216
 
4761
5217
  // src/wiki.ts
4762
- var import_node_fs3 = require("fs");
4763
- var import_node_path2 = require("path");
5218
+ var import_node_fs4 = require("fs");
5219
+ var import_node_path3 = require("path");
4764
5220
  init_collections();
5221
+ init_graph();
4765
5222
  function safeFilename(name) {
4766
- return name.replace(/\//g, "-").replace(/ /g, "_").replace(/:/g, "-");
5223
+ return name.replace(/\r\n/g, " ").replace(/\r/g, " ").replace(/\n/g, " ").replace(/\//g, "-").replace(/ /g, "_").replace(/:/g, "-");
4767
5224
  }
4768
5225
  function crossCommunityLinks(G, nodes, ownCid, labels) {
4769
5226
  const labelMap = toNumericMap(labels);
4770
5227
  const counts = /* @__PURE__ */ new Map();
4771
5228
  for (const nid of nodes) {
4772
- G.forEachNeighbor(nid, (neighbor) => {
5229
+ for (const neighbor of traversalNeighbors(G, nid)) {
4773
5230
  const ncid = G.getNodeAttribute(neighbor, "community");
4774
5231
  if (ncid !== void 0 && ncid !== ownCid) {
4775
5232
  const label = labelMap.get(ncid) ?? `Community ${ncid}`;
4776
5233
  counts.set(label, (counts.get(label) ?? 0) + 1);
4777
5234
  }
4778
- });
5235
+ }
4779
5236
  }
4780
5237
  return [...counts.entries()].sort((a, b) => b[1] - a[1]);
4781
5238
  }
@@ -4839,15 +5296,15 @@ function godNodeArticle(G, nid, labels) {
4839
5296
  const nodeLabel = d.label ?? nid;
4840
5297
  const src = d.source_file ?? "";
4841
5298
  const cid = d.community;
4842
- const communityName = cid !== void 0 ? labels.get(cid) ?? `Community ${cid}` : void 0;
5299
+ const communityName2 = cid !== void 0 ? labels.get(cid) ?? `Community ${cid}` : void 0;
4843
5300
  const lines = [];
4844
5301
  lines.push(`# ${nodeLabel}`, "");
4845
5302
  lines.push(`> God node \xB7 ${G.degree(nid)} connections \xB7 \`${src}\``, "");
4846
- if (communityName) {
4847
- lines.push(`**Community:** [[${communityName}]]`, "");
5303
+ if (communityName2) {
5304
+ lines.push(`**Community:** [[${communityName2}]]`, "");
4848
5305
  }
4849
5306
  const byRelation = /* @__PURE__ */ new Map();
4850
- const neighbors = [...G.neighbors(nid)].sort((a, b) => G.degree(b) - G.degree(a));
5307
+ const neighbors = traversalNeighbors(G, nid).sort((a, b) => G.degree(b) - G.degree(a));
4851
5308
  for (const neighbor of neighbors) {
4852
5309
  const ed = G.getEdgeAttributes(G.edge(nid, neighbor));
4853
5310
  const rel = ed.relation ?? "related";
@@ -4860,408 +5317,85 @@ function godNodeArticle(G, nid, labels) {
4860
5317
  lines.push("## Connections by Relation", "");
4861
5318
  for (const [rel, targets] of [...byRelation.entries()].sort()) {
4862
5319
  lines.push(`### ${rel}`);
4863
- for (const t of targets.slice(0, 20)) {
4864
- lines.push(`- ${t}`);
4865
- }
4866
- lines.push("");
4867
- }
4868
- lines.push("---", "", "*Part of the graphify knowledge wiki. See [[index]] to navigate.*");
4869
- return lines.join("\n");
4870
- }
4871
- function indexMd(communities, labels, godNodesData, totalNodes, totalEdges) {
4872
- const lines = [
4873
- "# Knowledge Graph Index",
4874
- "",
4875
- "> Auto-generated by graphify. Start here \u2014 read community articles for context, then drill into god nodes for detail.",
4876
- "",
4877
- `**${totalNodes} nodes \xB7 ${totalEdges} edges \xB7 ${communities.size} communities**`,
4878
- "",
4879
- "---",
4880
- "",
4881
- "## Communities",
4882
- "(sorted by size, largest first)",
4883
- ""
4884
- ];
4885
- const sorted = [...communities.entries()].sort((a, b) => b[1].length - a[1].length);
4886
- for (const [cid, nodes] of sorted) {
4887
- const label = labels.get(cid) ?? `Community ${cid}`;
4888
- lines.push(`- [[${label}]] \u2014 ${nodes.length} nodes`);
4889
- }
4890
- lines.push("");
4891
- if (godNodesData.length > 0) {
4892
- lines.push("## God Nodes", "(most connected concepts \u2014 the load-bearing abstractions)", "");
4893
- for (const node of godNodesData) {
4894
- lines.push(`- [[${node.label}]] \u2014 ${node.edges} connections`);
4895
- }
4896
- lines.push("");
4897
- }
4898
- lines.push(
4899
- "---",
4900
- "",
4901
- "*Generated by [graphify](https://github.com/safishamsi/graphify)*"
4902
- );
4903
- return lines.join("\n");
4904
- }
4905
- function toWiki(G, communities, outputDir, options) {
4906
- const communityMap = toNumericMap(communities);
4907
- (0, import_node_fs3.mkdirSync)(outputDir, { recursive: true });
4908
- const labels = options?.communityLabels ? toNumericMap(options.communityLabels) : new Map([...communityMap.keys()].map((cid) => [cid, `Community ${cid}`]));
4909
- const cohesion = toNumericMap(options?.cohesion);
4910
- const godNodesData = options?.godNodesData ?? [];
4911
- let count = 0;
4912
- for (const [cid, nodes] of communityMap) {
4913
- const label = labels.get(cid) ?? `Community ${cid}`;
4914
- const article = communityArticle(G, cid, nodes, label, labels, cohesion.get(cid));
4915
- (0, import_node_fs3.writeFileSync)((0, import_node_path2.join)(outputDir, `${safeFilename(label)}.md`), article);
4916
- count++;
4917
- }
4918
- for (const nodeData of godNodesData) {
4919
- const nid = nodeData.id;
4920
- if (nid && G.hasNode(nid)) {
4921
- const article = godNodeArticle(G, nid, labels);
4922
- (0, import_node_fs3.writeFileSync)((0, import_node_path2.join)(outputDir, `${safeFilename(nodeData.label)}.md`), article);
4923
- count++;
4924
- }
4925
- }
4926
- (0, import_node_fs3.writeFileSync)(
4927
- (0, import_node_path2.join)(outputDir, "index.md"),
4928
- indexMd(communityMap, labels, godNodesData, G.order, G.size)
4929
- );
4930
- return count;
4931
- }
4932
-
4933
- // src/detect.ts
4934
- var import_node_fs4 = require("fs");
4935
- var import_node_path3 = require("path");
4936
- var import_node_crypto = require("crypto");
4937
- var MANIFEST_PATH = "graphify-out/manifest.json";
4938
- var CODE_EXTENSIONS2 = /* @__PURE__ */ new Set([
4939
- ".py",
4940
- ".ts",
4941
- ".js",
4942
- ".jsx",
4943
- ".tsx",
4944
- ".go",
4945
- ".rs",
4946
- ".java",
4947
- ".cpp",
4948
- ".cc",
4949
- ".cxx",
4950
- ".c",
4951
- ".h",
4952
- ".hpp",
4953
- ".rb",
4954
- ".swift",
4955
- ".kt",
4956
- ".kts",
4957
- ".cs",
4958
- ".scala",
4959
- ".php",
4960
- ".lua",
4961
- ".toc",
4962
- ".zig",
4963
- ".ps1",
4964
- ".ex",
4965
- ".exs",
4966
- ".m",
4967
- ".mm",
4968
- ".jl"
4969
- ]);
4970
- var DOC_EXTENSIONS = /* @__PURE__ */ new Set([".md", ".txt", ".rst"]);
4971
- var PAPER_EXTENSIONS2 = /* @__PURE__ */ new Set([".pdf"]);
4972
- var IMAGE_EXTENSIONS2 = /* @__PURE__ */ new Set([".png", ".jpg", ".jpeg", ".gif", ".webp", ".svg"]);
4973
- var OFFICE_EXTENSIONS = /* @__PURE__ */ new Set([".docx", ".xlsx"]);
4974
- var CORPUS_WARN_THRESHOLD = 5e4;
4975
- var CORPUS_UPPER_THRESHOLD = 5e5;
4976
- var FILE_COUNT_UPPER = 200;
4977
- var SENSITIVE_PATTERNS = [
4978
- /(^|[\\/])\.(env|envrc)(\.|$)/i,
4979
- /\.(pem|key|p12|pfx|cert|crt|der|p8)$/i,
4980
- /(credential|secret|passwd|password|token|private_key)/i,
4981
- /(id_rsa|id_dsa|id_ecdsa|id_ed25519)(\.pub)?$/,
4982
- /(\.netrc|\.pgpass|\.htpasswd)$/i,
4983
- /(aws_credentials|gcloud_credentials|service.account)/i
4984
- ];
4985
- var PAPER_SIGNALS = [
4986
- /\barxiv\b/i,
4987
- /\bdoi\s*:/i,
4988
- /\babstract\b/i,
4989
- /\bproceedings\b/i,
4990
- /\bjournal\b/i,
4991
- /\bpreprint\b/i,
4992
- /\\cite\{/,
4993
- /\[\d+\]/,
4994
- /\[\n\d+\n\]/,
4995
- /eq\.\s*\d+|equation\s+\d+/i,
4996
- /\d{4}\.\d{4,5}/,
4997
- /\bwe propose\b/i,
4998
- /\bliterature\b/i
4999
- ];
5000
- var PAPER_SIGNAL_THRESHOLD = 3;
5001
- function isSensitive(filePath) {
5002
- const name = (0, import_node_path3.basename)(filePath);
5003
- return SENSITIVE_PATTERNS.some((p) => p.test(name) || p.test(filePath));
5004
- }
5005
- function looksLikePaper(filePath) {
5006
- try {
5007
- const text = (0, import_node_fs4.readFileSync)(filePath, "utf-8").slice(0, 3e3);
5008
- const hits = PAPER_SIGNALS.filter((p) => p.test(text)).length;
5009
- return hits >= PAPER_SIGNAL_THRESHOLD;
5010
- } catch {
5011
- return false;
5012
- }
5013
- }
5014
- var ASSET_DIR_MARKERS = /* @__PURE__ */ new Set([".imageset", ".xcassets", ".appiconset", ".colorset", ".launchimage"]);
5015
- function classifyFile(filePath) {
5016
- const ext = (0, import_node_path3.extname)(filePath).toLowerCase();
5017
- if (CODE_EXTENSIONS2.has(ext)) return "code" /* CODE */;
5018
- if (PAPER_EXTENSIONS2.has(ext)) {
5019
- const parts = filePath.split(import_node_path3.sep);
5020
- if (parts.some((p) => [...ASSET_DIR_MARKERS].some((m) => p.endsWith(m)))) return null;
5021
- return "paper" /* PAPER */;
5022
- }
5023
- if (IMAGE_EXTENSIONS2.has(ext)) return "image" /* IMAGE */;
5024
- if (DOC_EXTENSIONS.has(ext)) {
5025
- if (looksLikePaper(filePath)) return "paper" /* PAPER */;
5026
- return "document" /* DOCUMENT */;
5027
- }
5028
- if (OFFICE_EXTENSIONS.has(ext)) return "document" /* DOCUMENT */;
5029
- return null;
5030
- }
5031
- function countWords(filePath) {
5032
- try {
5033
- const text = (0, import_node_fs4.readFileSync)(filePath, "utf-8");
5034
- return text.split(/\s+/).filter(Boolean).length;
5035
- } catch {
5036
- return 0;
5037
- }
5038
- }
5039
- var SKIP_DIRS = /* @__PURE__ */ new Set([
5040
- "venv",
5041
- ".venv",
5042
- "env",
5043
- ".env",
5044
- "node_modules",
5045
- "__pycache__",
5046
- ".git",
5047
- "dist",
5048
- "build",
5049
- "target",
5050
- "out",
5051
- "site-packages",
5052
- "lib64",
5053
- ".pytest_cache",
5054
- ".mypy_cache",
5055
- ".ruff_cache",
5056
- ".tox",
5057
- ".eggs"
5058
- ]);
5059
- function isNoiseDir(part) {
5060
- if (SKIP_DIRS.has(part)) return true;
5061
- if (part.endsWith("_venv") || part.endsWith("_env")) return true;
5062
- if (part.endsWith(".egg-info")) return true;
5063
- return false;
5064
- }
5065
- function loadGraphifyignore(root) {
5066
- const ignoreFile = (0, import_node_path3.join)(root, ".graphifyignore");
5067
- if (!(0, import_node_fs4.existsSync)(ignoreFile)) return [];
5068
- const patterns = [];
5069
- for (let line of (0, import_node_fs4.readFileSync)(ignoreFile, "utf-8").split("\n")) {
5070
- line = line.trim();
5071
- if (line && !line.startsWith("#")) {
5072
- patterns.push(line);
5073
- }
5074
- }
5075
- return patterns;
5076
- }
5077
- function matchGlob(text, pattern) {
5078
- const regex = pattern.replace(/[.+^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*").replace(/\?/g, ".");
5079
- return new RegExp(`^${regex}$`).test(text);
5080
- }
5081
- function isIgnored(filePath, root, patterns) {
5082
- if (patterns.length === 0) return false;
5083
- let rel;
5084
- try {
5085
- rel = (0, import_node_path3.relative)(root, filePath).replace(/\\/g, "/");
5086
- } catch {
5087
- return false;
5088
- }
5089
- const parts = rel.split("/");
5090
- for (const pattern of patterns) {
5091
- const p = pattern.replace(/^\/+|\/+$/g, "");
5092
- if (!p) continue;
5093
- if (matchGlob(rel, p)) return true;
5094
- if (matchGlob((0, import_node_path3.basename)(filePath), p)) return true;
5095
- for (let i = 0; i < parts.length; i++) {
5096
- if (matchGlob(parts[i], p)) return true;
5097
- if (matchGlob(parts.slice(0, i + 1).join("/"), p)) return true;
5098
- }
5099
- }
5100
- return false;
5101
- }
5102
- function walkDir(dir, root, ignorePatterns, followSymlinks, skipPrune) {
5103
- const result = [];
5104
- let entries;
5105
- try {
5106
- entries = (0, import_node_fs4.readdirSync)(dir);
5107
- } catch {
5108
- return result;
5109
- }
5110
- for (const entry of entries) {
5111
- const full = (0, import_node_path3.join)(dir, entry);
5112
- let stat;
5113
- try {
5114
- stat = followSymlinks ? (0, import_node_fs4.statSync)(full) : (0, import_node_fs4.lstatSync)(full);
5115
- } catch {
5116
- continue;
5117
- }
5118
- if (stat.isDirectory()) {
5119
- if (!skipPrune) {
5120
- if (entry.startsWith(".")) continue;
5121
- if (isNoiseDir(entry)) continue;
5122
- if (isIgnored(full, root, ignorePatterns)) continue;
5123
- }
5124
- result.push(...walkDir(full, root, ignorePatterns, followSymlinks, skipPrune));
5125
- } else if (stat.isFile()) {
5126
- result.push(full);
5127
- }
5128
- }
5129
- return result;
5130
- }
5131
- function detect(root, options) {
5132
- const followSymlinks = options?.followSymlinks ?? false;
5133
- const rootResolved = (0, import_node_path3.resolve)(root);
5134
- const ignorePatterns = loadGraphifyignore(rootResolved);
5135
- const convertedDir = (0, import_node_path3.join)(rootResolved, "graphify-out", "converted");
5136
- const memoryDir = (0, import_node_path3.join)(rootResolved, "graphify-out", "memory");
5137
- const files = {
5138
- code: [],
5139
- document: [],
5140
- paper: [],
5141
- image: []
5142
- };
5143
- let totalWords = 0;
5144
- const skippedSensitive = [];
5145
- const allFiles = walkDir(rootResolved, rootResolved, ignorePatterns, followSymlinks, false);
5146
- if ((0, import_node_fs4.existsSync)(memoryDir)) {
5147
- allFiles.push(...walkDir(memoryDir, rootResolved, ignorePatterns, followSymlinks, true));
5148
- }
5149
- const seen = /* @__PURE__ */ new Set();
5150
- for (const p of allFiles) {
5151
- if (seen.has(p)) continue;
5152
- seen.add(p);
5153
- const inMemory = (0, import_node_fs4.existsSync)(memoryDir) && p.startsWith(memoryDir);
5154
- if (!inMemory) {
5155
- if ((0, import_node_path3.basename)(p).startsWith(".")) continue;
5156
- if (p.startsWith(convertedDir)) continue;
5157
- }
5158
- if (isIgnored(p, rootResolved, ignorePatterns)) continue;
5159
- if (isSensitive(p)) {
5160
- skippedSensitive.push(p);
5161
- continue;
5162
- }
5163
- const ftype = classifyFile(p);
5164
- if (!ftype) continue;
5165
- if (OFFICE_EXTENSIONS.has((0, import_node_path3.extname)(p).toLowerCase())) {
5166
- skippedSensitive.push(p + " [office conversion requires async - use pipeline]");
5167
- continue;
5168
- }
5169
- files[ftype].push(p);
5170
- totalWords += countWords(p);
5171
- }
5172
- const totalFiles = Object.values(files).reduce((s, v) => s + v.length, 0);
5173
- const needsGraph = totalWords >= CORPUS_WARN_THRESHOLD;
5174
- let warning = null;
5175
- if (!needsGraph) {
5176
- warning = `Corpus is ~${totalWords.toLocaleString()} words - fits in a single context window. You may not need a graph.`;
5177
- } else if (totalWords >= CORPUS_UPPER_THRESHOLD || totalFiles >= FILE_COUNT_UPPER) {
5178
- warning = `Large corpus: ${totalFiles} files \xB7 ~${totalWords.toLocaleString()} words. Semantic extraction will be expensive (many Claude tokens). Consider running on a subfolder, or use --no-semantic to run AST-only.`;
5320
+ for (const t of targets.slice(0, 20)) {
5321
+ lines.push(`- ${t}`);
5322
+ }
5323
+ lines.push("");
5179
5324
  }
5180
- return {
5181
- files,
5182
- total_files: totalFiles,
5183
- total_words: totalWords,
5184
- needs_graph: needsGraph,
5185
- warning,
5186
- skipped_sensitive: skippedSensitive,
5187
- graphifyignore_patterns: ignorePatterns.length
5188
- };
5325
+ lines.push("---", "", "*Part of the graphify knowledge wiki. See [[index]] to navigate.*");
5326
+ return lines.join("\n");
5189
5327
  }
5190
- function loadManifest(manifestPath = MANIFEST_PATH) {
5191
- try {
5192
- return JSON.parse((0, import_node_fs4.readFileSync)(manifestPath, "utf-8"));
5193
- } catch {
5194
- return {};
5328
+ function indexMd(communities, labels, godNodesData, totalNodes, totalEdges) {
5329
+ const lines = [
5330
+ "# Knowledge Graph Index",
5331
+ "",
5332
+ "> Auto-generated by graphify. Start here \u2014 read community articles for context, then drill into god nodes for detail.",
5333
+ "",
5334
+ `**${totalNodes} nodes \xB7 ${totalEdges} edges \xB7 ${communities.size} communities**`,
5335
+ "",
5336
+ "---",
5337
+ "",
5338
+ "## Communities",
5339
+ "(sorted by size, largest first)",
5340
+ ""
5341
+ ];
5342
+ const sorted = [...communities.entries()].sort((a, b) => b[1].length - a[1].length);
5343
+ for (const [cid, nodes] of sorted) {
5344
+ const label = labels.get(cid) ?? `Community ${cid}`;
5345
+ lines.push(`- [[${label}]] \u2014 ${nodes.length} nodes`);
5195
5346
  }
5196
- }
5197
- function saveManifest(files, manifestPath = MANIFEST_PATH) {
5198
- const manifest = {};
5199
- for (const fileList of Object.values(files)) {
5200
- for (const f of fileList) {
5201
- try {
5202
- manifest[f] = (0, import_node_fs4.statSync)(f).mtimeMs;
5203
- } catch {
5204
- }
5347
+ lines.push("");
5348
+ if (godNodesData.length > 0) {
5349
+ lines.push("## God Nodes", "(most connected concepts \u2014 the load-bearing abstractions)", "");
5350
+ for (const node of godNodesData) {
5351
+ lines.push(`- [[${node.label}]] \u2014 ${node.edges} connections`);
5205
5352
  }
5353
+ lines.push("");
5206
5354
  }
5207
- const dir = (0, import_node_path3.join)(manifestPath, "..");
5208
- (0, import_node_fs4.mkdirSync)(dir, { recursive: true });
5209
- (0, import_node_fs4.writeFileSync)(manifestPath, JSON.stringify(manifest, null, 2));
5355
+ lines.push(
5356
+ "---",
5357
+ "",
5358
+ "*Generated by [graphify](https://github.com/safishamsi/graphify)*"
5359
+ );
5360
+ return lines.join("\n");
5210
5361
  }
5211
- function detectIncremental(root, manifestPath = MANIFEST_PATH) {
5212
- const full = detect(root);
5213
- const manifest = loadManifest(manifestPath);
5214
- if (Object.keys(manifest).length === 0) {
5215
- return {
5216
- ...full,
5217
- incremental: true,
5218
- new_files: full.files,
5219
- unchanged_files: Object.fromEntries(Object.keys(full.files).map((k) => [k, []])),
5220
- new_total: full.total_files
5221
- };
5222
- }
5223
- const newFiles = {};
5224
- const unchangedFiles = {};
5225
- for (const k of Object.keys(full.files)) {
5226
- newFiles[k] = [];
5227
- unchangedFiles[k] = [];
5362
+ function toWiki(G, communities, outputDir, options) {
5363
+ const communityMap = toNumericMap(communities);
5364
+ (0, import_node_fs4.mkdirSync)(outputDir, { recursive: true });
5365
+ const labels = options?.communityLabels ? toNumericMap(options.communityLabels) : new Map([...communityMap.keys()].map((cid) => [cid, `Community ${cid}`]));
5366
+ const cohesion = toNumericMap(options?.cohesion);
5367
+ const godNodesData = options?.godNodesData ?? [];
5368
+ let count = 0;
5369
+ for (const [cid, nodes] of communityMap) {
5370
+ const label = labels.get(cid) ?? `Community ${cid}`;
5371
+ const article = communityArticle(G, cid, nodes, label, labels, cohesion.get(cid));
5372
+ (0, import_node_fs4.writeFileSync)((0, import_node_path3.join)(outputDir, `${safeFilename(label)}.md`), article);
5373
+ count++;
5228
5374
  }
5229
- for (const [ftype, fileList] of Object.entries(full.files)) {
5230
- for (const f of fileList) {
5231
- const storedMtime = manifest[f];
5232
- let currentMtime = 0;
5233
- try {
5234
- currentMtime = (0, import_node_fs4.statSync)(f).mtimeMs;
5235
- } catch {
5236
- }
5237
- if (storedMtime === void 0 || currentMtime > storedMtime) {
5238
- newFiles[ftype].push(f);
5239
- } else {
5240
- unchangedFiles[ftype].push(f);
5241
- }
5375
+ for (const nodeData of godNodesData) {
5376
+ const nid = nodeData.id;
5377
+ if (nid && G.hasNode(nid)) {
5378
+ const article = godNodeArticle(G, nid, labels);
5379
+ (0, import_node_fs4.writeFileSync)((0, import_node_path3.join)(outputDir, `${safeFilename(nodeData.label)}.md`), article);
5380
+ count++;
5242
5381
  }
5243
5382
  }
5244
- const currentFiles = new Set(Object.values(full.files).flat());
5245
- const deletedFiles = Object.keys(manifest).filter((f) => !currentFiles.has(f));
5246
- const newTotal = Object.values(newFiles).reduce((s, v) => s + v.length, 0);
5247
- return {
5248
- ...full,
5249
- incremental: true,
5250
- new_files: newFiles,
5251
- unchanged_files: unchangedFiles,
5252
- new_total: newTotal,
5253
- deleted_files: deletedFiles
5254
- };
5383
+ (0, import_node_fs4.writeFileSync)(
5384
+ (0, import_node_path3.join)(outputDir, "index.md"),
5385
+ indexMd(communityMap, labels, godNodesData, G.order, G.size)
5386
+ );
5387
+ return count;
5255
5388
  }
5256
5389
 
5257
5390
  // src/index.ts
5391
+ init_detect();
5258
5392
  init_extract();
5259
5393
  init_cache();
5260
5394
  init_security();
5261
5395
 
5262
5396
  // src/benchmark.ts
5263
5397
  var import_node_fs7 = require("fs");
5264
- var import_graphology2 = __toESM(require("graphology"), 1);
5398
+ init_graph();
5265
5399
  var CHARS_PER_TOKEN = 4;
5266
5400
  function estimateTokens(text) {
5267
5401
  return Math.max(1, Math.floor(text.length / CHARS_PER_TOKEN));
@@ -5283,7 +5417,7 @@ function querySubgraphTokens(G, question, depth = 3) {
5283
5417
  for (let d = 0; d < depth; d++) {
5284
5418
  const nextFrontier = /* @__PURE__ */ new Set();
5285
5419
  for (const n of frontier) {
5286
- G.forEachNeighbor(n, (neighbor) => {
5420
+ forEachTraversalNeighbor(G, n, (neighbor) => {
5287
5421
  if (!visited.has(neighbor)) {
5288
5422
  nextFrontier.add(neighbor);
5289
5423
  edgesSeen.push([n, neighbor]);
@@ -5318,21 +5452,7 @@ var SAMPLE_QUESTIONS = [
5318
5452
  ];
5319
5453
  function loadGraph(graphPath) {
5320
5454
  const raw = JSON.parse((0, import_node_fs7.readFileSync)(graphPath, "utf-8"));
5321
- const G = new import_graphology2.default({ type: "undirected" });
5322
- for (const node of raw.nodes ?? []) {
5323
- const { id, ...attrs } = node;
5324
- G.mergeNode(id, attrs);
5325
- }
5326
- for (const link of raw.links ?? []) {
5327
- const { source, target, ...attrs } = link;
5328
- if (G.hasNode(source) && G.hasNode(target)) {
5329
- try {
5330
- G.mergeEdge(source, target, attrs);
5331
- } catch {
5332
- }
5333
- }
5334
- }
5335
- return G;
5455
+ return loadGraphFromData(raw);
5336
5456
  }
5337
5457
  function runBenchmark(graphPath = "graphify-out/graph.json", corpusWordsOrOptions, questions) {
5338
5458
  const options = typeof corpusWordsOrOptions === "number" ? { corpusWords: corpusWordsOrOptions, questions } : corpusWordsOrOptions ?? {};
@@ -5395,9 +5515,395 @@ graphify token reduction benchmark`);
5395
5515
  }
5396
5516
 
5397
5517
  // src/ingest.ts
5518
+ var import_node_fs9 = require("fs");
5519
+ var import_node_path7 = require("path");
5520
+ init_security();
5521
+
5522
+ // src/transcribe.ts
5523
+ var childProcess = __toESM(require("child_process"), 1);
5524
+ var import_node_crypto3 = require("crypto");
5398
5525
  var import_node_fs8 = require("fs");
5526
+ var import_node_os = require("os");
5399
5527
  var import_node_path6 = require("path");
5400
- init_security();
5528
+ var import_node_stream = require("stream");
5529
+ var import_promises = require("stream/promises");
5530
+ var URL_PREFIXES = ["http://", "https://", "www."];
5531
+ var CACHED_AUDIO_EXTENSIONS = [".m4a", ".opus", ".mp3", ".ogg", ".wav", ".webm"];
5532
+ var DEFAULT_MODEL = "base";
5533
+ var TRANSCRIPTS_DIR = "graphify-out/transcripts";
5534
+ var FALLBACK_PROMPT = "Use proper punctuation and paragraph breaks.";
5535
+ var SHERPA_RELEASE_BASE = "https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models";
5536
+ var AUDIO_SAMPLE_RATE = 16e3;
5537
+ var SUPPORTED_MODELS = /* @__PURE__ */ new Set([
5538
+ "tiny",
5539
+ "tiny.en",
5540
+ "base",
5541
+ "base.en",
5542
+ "small",
5543
+ "small.en",
5544
+ "medium",
5545
+ "medium.en",
5546
+ "large-v1",
5547
+ "large-v2",
5548
+ "large-v3",
5549
+ "turbo",
5550
+ "distil-small.en",
5551
+ "distil-medium.en",
5552
+ "distil-large-v2",
5553
+ "distil-large-v3",
5554
+ "distil-large-v3.5"
5555
+ ]);
5556
+ var MODEL_ALIASES = {
5557
+ large: "large-v3"
5558
+ };
5559
+ var recognizerCache = /* @__PURE__ */ new Map();
5560
+ var sherpaModulePromise = null;
5561
+ function runCommand(command, args, options) {
5562
+ const result = childProcess.spawnSync(command, args, {
5563
+ encoding: "utf-8",
5564
+ ...options
5565
+ });
5566
+ if (result.error) {
5567
+ throw result.error;
5568
+ }
5569
+ if (result.status !== 0) {
5570
+ throw new Error(result.stderr?.trim() || result.stdout?.trim() || `${command} failed`);
5571
+ }
5572
+ return result;
5573
+ }
5574
+ function defaultWhisperCacheDir() {
5575
+ if (process.env.GRAPHIFY_WHISPER_CACHE_DIR) {
5576
+ return (0, import_node_path6.resolve)(process.env.GRAPHIFY_WHISPER_CACHE_DIR);
5577
+ }
5578
+ if ((0, import_node_os.platform)() === "win32") {
5579
+ return (0, import_node_path6.join)(
5580
+ process.env.LOCALAPPDATA ?? (0, import_node_path6.join)((0, import_node_os.homedir)(), "AppData", "Local"),
5581
+ "graphify",
5582
+ "whisper"
5583
+ );
5584
+ }
5585
+ return (0, import_node_path6.join)(process.env.XDG_CACHE_HOME ?? (0, import_node_path6.join)((0, import_node_os.homedir)(), ".cache"), "graphify", "whisper");
5586
+ }
5587
+ function ffmpegBinary() {
5588
+ return process.env.GRAPHIFY_FFMPEG_BIN ?? "ffmpeg";
5589
+ }
5590
+ function tarBinary() {
5591
+ return process.env.GRAPHIFY_TAR_BIN ?? "tar";
5592
+ }
5593
+ function resolveRequestedModel(modelName) {
5594
+ const requested = modelName ?? process.env.GRAPHIFY_WHISPER_MODEL ?? DEFAULT_MODEL;
5595
+ const resolved = MODEL_ALIASES[requested] ?? requested;
5596
+ if (!SUPPORTED_MODELS.has(resolved)) {
5597
+ throw new Error(
5598
+ `Unsupported GRAPHIFY_WHISPER_MODEL "${requested}". Supported local TS models: ${[...SUPPORTED_MODELS].sort().join(", ")}`
5599
+ );
5600
+ }
5601
+ return { requested, resolved };
5602
+ }
5603
+ function walkFiles(dir) {
5604
+ if (!(0, import_node_fs8.existsSync)(dir)) return [];
5605
+ const files = [];
5606
+ for (const entry of (0, import_node_fs8.readdirSync)(dir, { withFileTypes: true })) {
5607
+ const fullPath = (0, import_node_path6.join)(dir, entry.name);
5608
+ if (entry.isDirectory()) {
5609
+ files.push(...walkFiles(fullPath));
5610
+ } else {
5611
+ files.push(fullPath);
5612
+ }
5613
+ }
5614
+ return files;
5615
+ }
5616
+ function findArtifactsIn(dir) {
5617
+ const files = walkFiles(dir);
5618
+ const encoderPath = files.find((path) => path.endsWith("-encoder.int8.onnx")) ?? files.find((path) => path.endsWith("-encoder.onnx"));
5619
+ const decoderPath = files.find((path) => path.endsWith("-decoder.int8.onnx")) ?? files.find((path) => path.endsWith("-decoder.onnx"));
5620
+ const tokensPath = files.find((path) => path.endsWith("-tokens.txt"));
5621
+ if (!encoderPath || !decoderPath || !tokensPath) {
5622
+ return null;
5623
+ }
5624
+ return {
5625
+ modelDir: dir,
5626
+ encoderPath,
5627
+ decoderPath,
5628
+ tokensPath
5629
+ };
5630
+ }
5631
+ function normalizeModelError(detail) {
5632
+ if (detail.includes("404")) {
5633
+ return `${detail}. The local sherpa-onnx release asset was not found for this Whisper model name.`;
5634
+ }
5635
+ return detail;
5636
+ }
5637
+ async function writeResponseToFile(response, destination) {
5638
+ if (!response.ok || !response.body) {
5639
+ throw new Error(`HTTP ${response.status} while downloading ${response.url}`);
5640
+ }
5641
+ await (0, import_promises.pipeline)(import_node_stream.Readable.fromWeb(response.body), (0, import_node_fs8.createWriteStream)(destination));
5642
+ }
5643
+ async function ensureWhisperArtifacts(modelName) {
5644
+ const { requested, resolved } = resolveRequestedModel(modelName);
5645
+ const cacheRoot = defaultWhisperCacheDir();
5646
+ (0, import_node_fs8.mkdirSync)(cacheRoot, { recursive: true });
5647
+ const modelDir = (0, import_node_path6.join)(cacheRoot, `sherpa-onnx-whisper-${resolved}`);
5648
+ const cached = findArtifactsIn(modelDir);
5649
+ if (cached) {
5650
+ return { requestedModel: requested, resolvedModel: resolved, ...cached };
5651
+ }
5652
+ const tempDir = (0, import_node_fs8.mkdtempSync)((0, import_node_path6.join)((0, import_node_os.tmpdir)(), "graphify-whisper-model-"));
5653
+ const extractDir = (0, import_node_path6.join)(tempDir, "extract");
5654
+ const archiveName = `sherpa-onnx-whisper-${resolved}.tar.bz2`;
5655
+ const archivePath = (0, import_node_path6.join)(tempDir, archiveName);
5656
+ (0, import_node_fs8.mkdirSync)(extractDir, { recursive: true });
5657
+ try {
5658
+ const url = `${SHERPA_RELEASE_BASE}/${archiveName}`;
5659
+ console.log(` downloading whisper model: ${resolved}`);
5660
+ const response = await fetch(url);
5661
+ await writeResponseToFile(response, archivePath);
5662
+ runCommand(tarBinary(), ["-xjf", archivePath, "-C", extractDir]);
5663
+ const extractedRoot = walkFiles(extractDir).map((path) => (0, import_node_path6.dirname)(path)).find((path) => findArtifactsIn(path) !== null);
5664
+ const sourceDir = extractedRoot ?? (0, import_node_fs8.readdirSync)(extractDir, { withFileTypes: true }).filter((entry) => entry.isDirectory()).map((entry) => (0, import_node_path6.join)(extractDir, entry.name)).find((path) => findArtifactsIn(path) !== null);
5665
+ if (!sourceDir) {
5666
+ throw new Error(`Downloaded archive for ${resolved} but could not locate Whisper model files`);
5667
+ }
5668
+ if ((0, import_node_fs8.existsSync)(modelDir)) {
5669
+ (0, import_node_fs8.rmSync)(modelDir, { recursive: true, force: true });
5670
+ }
5671
+ try {
5672
+ (0, import_node_fs8.renameSync)(sourceDir, modelDir);
5673
+ } catch {
5674
+ (0, import_node_fs8.cpSync)(sourceDir, modelDir, { recursive: true });
5675
+ }
5676
+ const artifacts = findArtifactsIn(modelDir);
5677
+ if (!artifacts) {
5678
+ throw new Error(`Model cache for ${resolved} is incomplete after extraction`);
5679
+ }
5680
+ return { requestedModel: requested, resolvedModel: resolved, ...artifacts };
5681
+ } catch (error) {
5682
+ const detail = error instanceof Error ? error.message : String(error);
5683
+ throw new Error(normalizeModelError(detail));
5684
+ } finally {
5685
+ (0, import_node_fs8.rmSync)(tempDir, { recursive: true, force: true });
5686
+ }
5687
+ }
5688
+ async function loadSherpaModule() {
5689
+ if (!sherpaModulePromise) {
5690
+ sherpaModulePromise = import("sherpa-onnx-node").then((imported) => Reflect.has(imported, "default") ? Reflect.get(imported, "default") : imported).catch((error) => {
5691
+ sherpaModulePromise = null;
5692
+ const detail = error instanceof Error ? error.message : String(error);
5693
+ throw new Error(
5694
+ `Video transcription requires the optional dependency sherpa-onnx-node. Install it locally, then retry. ${detail}`
5695
+ );
5696
+ });
5697
+ }
5698
+ return sherpaModulePromise;
5699
+ }
5700
+ async function getRecognizer(modelName, sherpa) {
5701
+ const artifacts = await ensureWhisperArtifacts(modelName);
5702
+ const cacheKey = artifacts.modelDir;
5703
+ const existing = recognizerCache.get(cacheKey);
5704
+ if (existing) {
5705
+ return { recognizer: await existing, artifacts };
5706
+ }
5707
+ const createRecognizer = (async () => {
5708
+ const runtime = sherpa ?? await loadSherpaModule();
5709
+ return runtime.OfflineRecognizer.createAsync({
5710
+ featConfig: {
5711
+ sampleRate: AUDIO_SAMPLE_RATE,
5712
+ featureDim: 80
5713
+ },
5714
+ modelConfig: {
5715
+ whisper: {
5716
+ encoder: artifacts.encoderPath,
5717
+ decoder: artifacts.decoderPath,
5718
+ task: "transcribe"
5719
+ },
5720
+ tokens: artifacts.tokensPath,
5721
+ numThreads: 1,
5722
+ provider: "cpu",
5723
+ debug: 0
5724
+ }
5725
+ });
5726
+ })();
5727
+ recognizerCache.set(
5728
+ cacheKey,
5729
+ createRecognizer.catch((error) => {
5730
+ recognizerCache.delete(cacheKey);
5731
+ throw error;
5732
+ })
5733
+ );
5734
+ return { recognizer: await recognizerCache.get(cacheKey), artifacts };
5735
+ }
5736
+ function normalizeToWave(audioPath, workingDir) {
5737
+ const wavPath = (0, import_node_path6.join)(workingDir, `${(0, import_node_path6.basename)(audioPath, (0, import_node_path6.extname)(audioPath))}.wav`);
5738
+ try {
5739
+ runCommand(ffmpegBinary(), [
5740
+ "-y",
5741
+ "-i",
5742
+ audioPath,
5743
+ "-vn",
5744
+ "-ac",
5745
+ "1",
5746
+ "-ar",
5747
+ String(AUDIO_SAMPLE_RATE),
5748
+ "-c:a",
5749
+ "pcm_s16le",
5750
+ wavPath
5751
+ ]);
5752
+ } catch (error) {
5753
+ const detail = error instanceof Error ? error.message : String(error);
5754
+ throw new Error(
5755
+ `Video transcription requires ffmpeg in PATH. Install ffmpeg locally, then retry. ${detail}`
5756
+ );
5757
+ }
5758
+ return wavPath;
5759
+ }
5760
+ function extractTranscriptText(result) {
5761
+ return String(result.text ?? "").trim();
5762
+ }
5763
+ function isUrl(pathLike) {
5764
+ return URL_PREFIXES.some((prefix) => pathLike.startsWith(prefix));
5765
+ }
5766
+ function downloadAudio(url, outputDir) {
5767
+ (0, import_node_fs8.mkdirSync)(outputDir, { recursive: true });
5768
+ const urlHash = (0, import_node_crypto3.createHash)("sha1").update(url).digest("hex").slice(0, 12);
5769
+ for (const ext of CACHED_AUDIO_EXTENSIONS) {
5770
+ const candidate = (0, import_node_path6.join)(outputDir, `yt_${urlHash}${ext}`);
5771
+ if ((0, import_node_fs8.existsSync)(candidate)) {
5772
+ console.log(` cached audio: ${(0, import_node_path6.basename)(candidate)}`);
5773
+ return candidate;
5774
+ }
5775
+ }
5776
+ const outTemplate = (0, import_node_path6.join)(outputDir, `yt_${urlHash}.%(ext)s`);
5777
+ try {
5778
+ console.log(` downloading audio: ${url.slice(0, 80)} ...`);
5779
+ runCommand("yt-dlp", [
5780
+ "-f",
5781
+ "bestaudio[ext=m4a]/bestaudio/best",
5782
+ "-o",
5783
+ outTemplate,
5784
+ "--quiet",
5785
+ "--no-warnings",
5786
+ "--no-playlist",
5787
+ url
5788
+ ]);
5789
+ } catch (error) {
5790
+ const detail = error instanceof Error ? error.message : String(error);
5791
+ throw new Error(
5792
+ `YouTube/URL download requires yt-dlp. Install yt-dlp to enable video ingestion. ${detail}`
5793
+ );
5794
+ }
5795
+ for (const entry of (0, import_node_fs8.readdirSync)(outputDir)) {
5796
+ if (entry.startsWith(`yt_${urlHash}.`)) {
5797
+ return (0, import_node_path6.join)(outputDir, entry);
5798
+ }
5799
+ }
5800
+ throw new Error(`yt-dlp finished without producing an audio file for ${url}`);
5801
+ }
5802
+ function buildWhisperPrompt(godNodes2) {
5803
+ const override = process.env.GRAPHIFY_WHISPER_PROMPT;
5804
+ if (override) return override;
5805
+ const labels = godNodes2.map((node) => node.label ?? "").filter((label) => Boolean(label)).slice(0, 5);
5806
+ if (labels.length === 0) {
5807
+ return FALLBACK_PROMPT;
5808
+ }
5809
+ return `Technical discussion about ${labels.join(", ")}. ${FALLBACK_PROMPT}`;
5810
+ }
5811
+ async function transcribe(videoPath, outputDir = TRANSCRIPTS_DIR, initialPrompt, force = false) {
5812
+ const outDir = (0, import_node_path6.resolve)(outputDir);
5813
+ (0, import_node_fs8.mkdirSync)(outDir, { recursive: true });
5814
+ const audioPath = isUrl(videoPath) ? downloadAudio(videoPath, (0, import_node_path6.join)(outDir, "downloads")) : (0, import_node_path6.resolve)(videoPath);
5815
+ const transcriptPath = (0, import_node_path6.join)(outDir, `${(0, import_node_path6.basename)(audioPath, (0, import_node_path6.extname)(audioPath))}.txt`);
5816
+ if ((0, import_node_fs8.existsSync)(transcriptPath) && !force) {
5817
+ return transcriptPath;
5818
+ }
5819
+ const prompt = initialPrompt ?? process.env.GRAPHIFY_WHISPER_PROMPT ?? FALLBACK_PROMPT;
5820
+ const requestedModel = process.env.GRAPHIFY_WHISPER_MODEL ?? DEFAULT_MODEL;
5821
+ const tempDir = (0, import_node_fs8.mkdtempSync)((0, import_node_path6.join)((0, import_node_os.tmpdir)(), "graphify-transcribe-"));
5822
+ try {
5823
+ console.log(` transcribing ${(0, import_node_path6.basename)(audioPath)} (model=${requestedModel}) ...`);
5824
+ const wavPath = normalizeToWave(audioPath, tempDir);
5825
+ const sherpa = await loadSherpaModule();
5826
+ const { recognizer, artifacts } = await getRecognizer(requestedModel, sherpa);
5827
+ const wave = sherpa.readWave(wavPath);
5828
+ const stream = recognizer.createStream();
5829
+ if (prompt && typeof stream.setOption === "function") {
5830
+ try {
5831
+ stream.setOption("prompt", prompt);
5832
+ } catch {
5833
+ }
5834
+ }
5835
+ stream.acceptWaveform({ samples: wave.samples, sampleRate: wave.sampleRate });
5836
+ const result = await recognizer.decodeAsync(stream);
5837
+ const transcript = extractTranscriptText(result);
5838
+ (0, import_node_fs8.writeFileSync)(transcriptPath, transcript, "utf-8");
5839
+ if (artifacts.requestedModel !== artifacts.resolvedModel) {
5840
+ console.log(` model alias: ${artifacts.requestedModel} -> ${artifacts.resolvedModel}`);
5841
+ }
5842
+ } catch (error) {
5843
+ if (error instanceof Error && error.message.startsWith("Unsupported GRAPHIFY_WHISPER_MODEL")) {
5844
+ throw error;
5845
+ }
5846
+ const detail = error instanceof Error ? error.message : String(error);
5847
+ throw new Error(
5848
+ `Video transcription requires the local TypeScript toolchain: sherpa-onnx-node + ffmpeg. Retry after installing them. ${detail}`
5849
+ );
5850
+ } finally {
5851
+ (0, import_node_fs8.rmSync)(tempDir, { recursive: true, force: true });
5852
+ }
5853
+ return transcriptPath;
5854
+ }
5855
+ async function transcribeAll(videoFiles, outputDir, initialPrompt, force = false) {
5856
+ if (videoFiles.length === 0) {
5857
+ return [];
5858
+ }
5859
+ const transcriptPaths = [];
5860
+ for (const videoFile of videoFiles) {
5861
+ try {
5862
+ transcriptPaths.push(await transcribe(videoFile, outputDir, initialPrompt, force));
5863
+ } catch (error) {
5864
+ const detail = error instanceof Error ? error.message : String(error);
5865
+ console.log(` warning: could not transcribe ${videoFile}: ${detail}`);
5866
+ }
5867
+ }
5868
+ return transcriptPaths;
5869
+ }
5870
+ function cloneDetection(detection) {
5871
+ return JSON.parse(JSON.stringify(detection));
5872
+ }
5873
+ async function augmentDetectionWithTranscripts(detection, options) {
5874
+ const nextDetection = cloneDetection(detection);
5875
+ const source = options?.incremental && nextDetection.new_files ? nextDetection.new_files : nextDetection.files;
5876
+ const videoFiles = [...source.video ?? []];
5877
+ const prompt = options?.initialPrompt ?? buildWhisperPrompt(options?.godNodes ?? []);
5878
+ if (videoFiles.length === 0) {
5879
+ return { detection: nextDetection, transcriptPaths: [], prompt };
5880
+ }
5881
+ const previousModel = process.env.GRAPHIFY_WHISPER_MODEL;
5882
+ if (options?.whisperModel) {
5883
+ process.env.GRAPHIFY_WHISPER_MODEL = options.whisperModel;
5884
+ }
5885
+ try {
5886
+ const transcriptPaths = await transcribeAll(
5887
+ videoFiles,
5888
+ options?.outputDir,
5889
+ prompt,
5890
+ options?.incremental === true
5891
+ );
5892
+ const existingDocuments = source.document ?? [];
5893
+ source.document = [...existingDocuments, ...transcriptPaths];
5894
+ return { detection: nextDetection, transcriptPaths, prompt };
5895
+ } finally {
5896
+ if (options?.whisperModel) {
5897
+ if (previousModel === void 0) {
5898
+ delete process.env.GRAPHIFY_WHISPER_MODEL;
5899
+ } else {
5900
+ process.env.GRAPHIFY_WHISPER_MODEL = previousModel;
5901
+ }
5902
+ }
5903
+ }
5904
+ }
5905
+
5906
+ // src/ingest.ts
5401
5907
  function yamlStr(s) {
5402
5908
  return s.replace(/\\/g, "\\\\").replace(/"/g, '\\"').replace(/\n/g, " ").replace(/\r/g, " ");
5403
5909
  }
@@ -5563,9 +6069,9 @@ Source: ${url}
5563
6069
  }
5564
6070
  async function downloadBinary(url, suffix, targetDir) {
5565
6071
  const filename = safeFilename2(url, suffix);
5566
- const outPath = (0, import_node_path6.resolve)(targetDir, filename);
6072
+ const outPath = (0, import_node_path7.resolve)(targetDir, filename);
5567
6073
  const data = await safeFetch(url);
5568
- (0, import_node_fs8.writeFileSync)(outPath, data);
6074
+ (0, import_node_fs9.writeFileSync)(outPath, data);
5569
6075
  return outPath;
5570
6076
  }
5571
6077
  function normalizeIngestOptions(authorOrOptions, contributor) {
@@ -5581,7 +6087,7 @@ function normalizeIngestOptions(authorOrOptions, contributor) {
5581
6087
  };
5582
6088
  }
5583
6089
  async function ingest(url, targetDir, authorOrOptions = null, contributor = null) {
5584
- (0, import_node_fs8.mkdirSync)(targetDir, { recursive: true });
6090
+ (0, import_node_fs9.mkdirSync)(targetDir, { recursive: true });
5585
6091
  const urlType = detectUrlType(url);
5586
6092
  const { author, contributor: normalizedContributor } = normalizeIngestOptions(
5587
6093
  authorOrOptions,
@@ -5592,7 +6098,7 @@ async function ingest(url, targetDir, authorOrOptions = null, contributor = null
5592
6098
  let filename;
5593
6099
  if (urlType === "pdf") {
5594
6100
  const out = await downloadBinary(url, ".pdf", targetDir);
5595
- console.log(`Downloaded PDF: ${(0, import_node_path6.basename)(out)}`);
6101
+ console.log(`Downloaded PDF: ${(0, import_node_path7.basename)(out)}`);
5596
6102
  return out;
5597
6103
  }
5598
6104
  if (urlType === "image") {
@@ -5602,9 +6108,14 @@ async function ingest(url, targetDir, authorOrOptions = null, contributor = null
5602
6108
  } catch {
5603
6109
  throw new Error(`Invalid URL: ${url}`);
5604
6110
  }
5605
- const suffix = (0, import_node_path6.extname)(parsed.pathname) || ".jpg";
6111
+ const suffix = (0, import_node_path7.extname)(parsed.pathname) || ".jpg";
5606
6112
  const out = await downloadBinary(url, suffix, targetDir);
5607
- console.log(`Downloaded image: ${(0, import_node_path6.basename)(out)}`);
6113
+ console.log(`Downloaded image: ${(0, import_node_path7.basename)(out)}`);
6114
+ return out;
6115
+ }
6116
+ if (urlType === "youtube") {
6117
+ const out = downloadAudio(url, targetDir);
6118
+ console.log(`Downloaded audio: ${(0, import_node_path7.basename)(out)}`);
5608
6119
  return out;
5609
6120
  }
5610
6121
  if (urlType === "tweet") {
@@ -5614,15 +6125,15 @@ async function ingest(url, targetDir, authorOrOptions = null, contributor = null
5614
6125
  } else {
5615
6126
  [content, filename] = await fetchWebpage(url, author, normalizedContributor);
5616
6127
  }
5617
- let outPath = (0, import_node_path6.resolve)(targetDir, filename);
6128
+ let outPath = (0, import_node_path7.resolve)(targetDir, filename);
5618
6129
  let counter = 1;
5619
- while ((0, import_node_fs8.existsSync)(outPath)) {
6130
+ while ((0, import_node_fs9.existsSync)(outPath)) {
5620
6131
  const stem = filename.replace(/\.md$/, "");
5621
- outPath = (0, import_node_path6.resolve)(targetDir, `${stem}_${counter}.md`);
6132
+ outPath = (0, import_node_path7.resolve)(targetDir, `${stem}_${counter}.md`);
5622
6133
  counter++;
5623
6134
  }
5624
- (0, import_node_fs8.writeFileSync)(outPath, content, "utf-8");
5625
- console.log(`Saved ${urlType}: ${(0, import_node_path6.basename)(outPath)}`);
6135
+ (0, import_node_fs9.writeFileSync)(outPath, content, "utf-8");
6136
+ console.log(`Saved ${urlType}: ${(0, import_node_path7.basename)(outPath)}`);
5626
6137
  return outPath;
5627
6138
  }
5628
6139
  function saveQueryResult(questionOrOptions, answer, memoryDir, queryType = "query", sourceNodes = null) {
@@ -5642,7 +6153,7 @@ function saveQueryResult(questionOrOptions, answer, memoryDir, queryType = "quer
5642
6153
  if (!payload.question) throw new Error("saveQueryResult requires a question");
5643
6154
  if (!payload.memoryDir) throw new Error("saveQueryResult requires a memoryDir");
5644
6155
  const effectiveAnswer = payload.answer ?? "";
5645
- (0, import_node_fs8.mkdirSync)(payload.memoryDir, { recursive: true });
6156
+ (0, import_node_fs9.mkdirSync)(payload.memoryDir, { recursive: true });
5646
6157
  const now = /* @__PURE__ */ new Date();
5647
6158
  const slug = payload.question.toLowerCase().replace(/[^\w]/g, "_").slice(0, 50).replace(/_+$/, "");
5648
6159
  const ts = now.toISOString().replace(/[-:]/g, "").replace("T", "_").slice(0, 15);
@@ -5674,11 +6185,11 @@ function saveQueryResult(questionOrOptions, answer, memoryDir, queryType = "quer
5674
6185
  }
5675
6186
  }
5676
6187
  const content = [...frontmatterLines, ...bodyLines].join("\n");
5677
- const outPath = (0, import_node_path6.resolve)(payload.memoryDir, filename);
5678
- (0, import_node_fs8.writeFileSync)(outPath, content, "utf-8");
6188
+ const outPath = (0, import_node_path7.resolve)(payload.memoryDir, filename);
6189
+ (0, import_node_fs9.writeFileSync)(outPath, content, "utf-8");
5679
6190
  return outPath;
5680
6191
  }
5681
- var isDirectExecution = typeof process !== "undefined" && typeof process.argv[1] === "string" && /^ingest\.(?:js|mjs|cjs|ts)$/.test((0, import_node_path6.basename)(process.argv[1]));
6192
+ var isDirectExecution = typeof process !== "undefined" && typeof process.argv[1] === "string" && /^ingest\.(?:js|mjs|cjs|ts)$/.test((0, import_node_path7.basename)(process.argv[1]));
5682
6193
  if (isDirectExecution) {
5683
6194
  const url = process.argv[2];
5684
6195
  const targetDir = process.argv[3] ?? "./raw";
@@ -5694,44 +6205,30 @@ if (isDirectExecution) {
5694
6205
  }
5695
6206
 
5696
6207
  // src/serve.ts
5697
- var import_node_fs9 = require("fs");
5698
- var import_graphology3 = __toESM(require("graphology"), 1);
6208
+ var import_node_fs10 = require("fs");
5699
6209
  var import_unweighted = require("graphology-shortest-path/unweighted.js");
5700
- var import_node_path7 = require("path");
6210
+ var import_node_path8 = require("path");
6211
+ init_graph();
5701
6212
  init_security();
5702
6213
  init_analyze();
5703
6214
  function loadGraph2(graphPath) {
5704
6215
  let safePath;
5705
6216
  try {
5706
- safePath = validateGraphPath(graphPath);
6217
+ safePath = validateGraphPath(graphPath, (0, import_node_path8.dirname)((0, import_node_path8.resolve)(graphPath)));
5707
6218
  } catch (err) {
5708
6219
  console.error(`error: ${err instanceof Error ? err.message : err}`);
5709
6220
  process.exit(1);
5710
6221
  }
5711
6222
  let data;
5712
6223
  try {
5713
- data = JSON.parse((0, import_node_fs9.readFileSync)(safePath, "utf-8"));
6224
+ data = JSON.parse((0, import_node_fs10.readFileSync)(safePath, "utf-8"));
5714
6225
  } catch (err) {
5715
6226
  console.error(
5716
6227
  `error: graph.json is corrupted (${err instanceof Error ? err.message : err}). Re-run the graphify skill to rebuild it (for Codex: $graphify .).`
5717
6228
  );
5718
6229
  process.exit(1);
5719
6230
  }
5720
- const G = new import_graphology3.default({ type: "undirected", multi: false });
5721
- const nodes = data.nodes ?? [];
5722
- for (const node of nodes) {
5723
- const { id, ...attrs } = node;
5724
- G.mergeNode(id, attrs);
5725
- }
5726
- const links = data.links ?? data.edges ?? [];
5727
- for (const link of links) {
5728
- const { source, target, ...attrs } = link;
5729
- try {
5730
- G.mergeEdge(source, target, attrs);
5731
- } catch {
5732
- }
5733
- }
5734
- return G;
6231
+ return loadGraphFromData(data);
5735
6232
  }
5736
6233
  function communitiesFromGraph(G) {
5737
6234
  const communities = /* @__PURE__ */ new Map();
@@ -5744,6 +6241,15 @@ function communitiesFromGraph(G) {
5744
6241
  });
5745
6242
  return communities;
5746
6243
  }
6244
+ function communityName(G, cid) {
6245
+ if (cid === void 0 || cid === null) return null;
6246
+ const labels = G.getAttribute("community_labels");
6247
+ const fromGraph = labels?.[String(cid)];
6248
+ if (typeof fromGraph === "string" && fromGraph.length > 0) {
6249
+ return sanitizeLabel(fromGraph);
6250
+ }
6251
+ return null;
6252
+ }
5747
6253
  function scoreNodes(G, terms) {
5748
6254
  const scored = [];
5749
6255
  G.forEachNode((nid, data) => {
@@ -5762,7 +6268,7 @@ function bfs(G, startNodes, depth) {
5762
6268
  for (let i = 0; i < depth; i++) {
5763
6269
  const nextFrontier = /* @__PURE__ */ new Set();
5764
6270
  for (const n of frontier) {
5765
- G.forEachNeighbor(n, (neighbor) => {
6271
+ forEachTraversalNeighbor(G, n, (neighbor) => {
5766
6272
  if (!visited.has(neighbor)) {
5767
6273
  nextFrontier.add(neighbor);
5768
6274
  edges.push([n, neighbor]);
@@ -5782,7 +6288,7 @@ function dfs(G, startNodes, depth) {
5782
6288
  const [node, d] = stack.pop();
5783
6289
  if (visited.has(node) || d > depth) continue;
5784
6290
  visited.add(node);
5785
- G.forEachNeighbor(node, (neighbor) => {
6291
+ forEachTraversalNeighbor(G, node, (neighbor) => {
5786
6292
  if (!visited.has(neighbor)) {
5787
6293
  stack.push([neighbor, d + 1]);
5788
6294
  edges.push([node, neighbor]);
@@ -5861,7 +6367,7 @@ function toolGetNode(G, args) {
5861
6367
  ` ID: ${nid}`,
5862
6368
  ` Source: ${d.source_file ?? ""} ${d.source_location ?? ""}`,
5863
6369
  ` Type: ${d.file_type ?? ""}`,
5864
- ` Community: ${d.community ?? ""}`,
6370
+ ` Community: ${d.community_name ? `${d.community ?? ""} (${d.community_name})` : communityName(G, d.community) ?? String(d.community ?? "")}`,
5865
6371
  ` Degree: ${G.degree(nid)}`
5866
6372
  ].join("\n");
5867
6373
  }
@@ -5872,7 +6378,7 @@ function toolGetNeighbors(G, args) {
5872
6378
  if (matches.length === 0) return `No node matching '${label}' found.`;
5873
6379
  const nid = matches[0];
5874
6380
  const lines = [`Neighbors of ${G.getNodeAttribute(nid, "label") ?? nid}:`];
5875
- G.forEachNeighbor(nid, (neighbor) => {
6381
+ forEachTraversalNeighbor(G, nid, (neighbor) => {
5876
6382
  const edgeKey = G.edge(nid, neighbor);
5877
6383
  if (!edgeKey) return;
5878
6384
  const d = G.getEdgeAttributes(edgeKey);
@@ -5888,7 +6394,8 @@ function toolGetCommunity(communities, G, args) {
5888
6394
  const cid = Number(args.community_id);
5889
6395
  const nodes = communities.get(cid);
5890
6396
  if (!nodes || nodes.length === 0) return `Community ${cid} not found.`;
5891
- const lines = [`Community ${cid} (${nodes.length} nodes):`];
6397
+ const label = communityName(G, cid);
6398
+ const lines = [label ? `Community ${cid} - ${label} (${nodes.length} nodes):` : `Community ${cid} (${nodes.length} nodes):`];
5892
6399
  for (const n of nodes) {
5893
6400
  const d = G.getNodeAttributes(n);
5894
6401
  lines.push(` ${d.label ?? n} [${d.source_file ?? ""}]`);
@@ -6110,8 +6617,13 @@ async function serve(graphPath = "graphify-out/graph.json", transport) {
6110
6617
  if (!handler) {
6111
6618
  return { content: [{ type: "text", text: `Unknown tool: ${name}` }] };
6112
6619
  }
6113
- const text = handler(args ?? {});
6114
- return { content: [{ type: "text", text }] };
6620
+ try {
6621
+ const text = handler(args ?? {});
6622
+ return { content: [{ type: "text", text }] };
6623
+ } catch (err) {
6624
+ const message = err instanceof Error ? err.message : String(err);
6625
+ return { content: [{ type: "text", text: `Error executing ${name}: ${message}` }] };
6626
+ }
6115
6627
  });
6116
6628
  const serverTransport = transport ?? new StdioServerTransport();
6117
6629
  let keepAlive;
@@ -6119,14 +6631,14 @@ async function serve(graphPath = "graphify-out/graph.json", transport) {
6119
6631
  keepAlive = setInterval(() => void 0, 6e4);
6120
6632
  process.stdin?.resume();
6121
6633
  }
6122
- const closed = new Promise((resolve5) => {
6634
+ const closed = new Promise((resolve8) => {
6123
6635
  const previousOnClose = server.onclose;
6124
6636
  server.onclose = () => {
6125
6637
  if (keepAlive) {
6126
6638
  clearInterval(keepAlive);
6127
6639
  }
6128
6640
  previousOnClose?.();
6129
- resolve5();
6641
+ resolve8();
6130
6642
  };
6131
6643
  });
6132
6644
  await server.connect(serverTransport);
@@ -6134,7 +6646,7 @@ async function serve(graphPath = "graphify-out/graph.json", transport) {
6134
6646
  await closed;
6135
6647
  }
6136
6648
  }
6137
- var isDirectExecution2 = typeof process !== "undefined" && typeof process.argv[1] === "string" && /^serve\.(?:js|mjs|cjs|ts)$/.test((0, import_node_path7.basename)(process.argv[1]));
6649
+ var isDirectExecution2 = typeof process !== "undefined" && typeof process.argv[1] === "string" && /^serve\.(?:js|mjs|cjs|ts)$/.test((0, import_node_path8.basename)(process.argv[1]));
6138
6650
  if (isDirectExecution2) {
6139
6651
  const graphPath = process.argv[2] ?? "graphify-out/graph.json";
6140
6652
  serve(graphPath).catch((err) => {
@@ -6144,59 +6656,14 @@ if (isDirectExecution2) {
6144
6656
  }
6145
6657
 
6146
6658
  // src/watch.ts
6147
- var import_node_fs10 = require("fs");
6148
- var import_node_path8 = require("path");
6659
+ var import_node_fs11 = require("fs");
6660
+ var import_node_path9 = require("path");
6661
+ init_detect();
6149
6662
  var WATCHED_EXTENSIONS = /* @__PURE__ */ new Set([
6150
- ".py",
6151
- ".ts",
6152
- ".js",
6153
- ".go",
6154
- ".rs",
6155
- ".java",
6156
- ".cpp",
6157
- ".c",
6158
- ".rb",
6159
- ".swift",
6160
- ".kt",
6161
- ".cs",
6162
- ".scala",
6163
- ".php",
6164
- ".cc",
6165
- ".cxx",
6166
- ".hpp",
6167
- ".h",
6168
- ".kts",
6169
- ".md",
6170
- ".txt",
6171
- ".rst",
6172
- ".pdf",
6173
- ".png",
6174
- ".jpg",
6175
- ".jpeg",
6176
- ".webp",
6177
- ".gif",
6178
- ".svg"
6179
- ]);
6180
- var CODE_EXTENSIONS3 = /* @__PURE__ */ new Set([
6181
- ".py",
6182
- ".ts",
6183
- ".js",
6184
- ".go",
6185
- ".rs",
6186
- ".java",
6187
- ".cpp",
6188
- ".c",
6189
- ".rb",
6190
- ".swift",
6191
- ".kt",
6192
- ".cs",
6193
- ".scala",
6194
- ".php",
6195
- ".cc",
6196
- ".cxx",
6197
- ".hpp",
6198
- ".h",
6199
- ".kts"
6663
+ ...CODE_EXTENSIONS,
6664
+ ...DOC_EXTENSIONS,
6665
+ ...PAPER_EXTENSIONS,
6666
+ ...IMAGE_EXTENSIONS
6200
6667
  ]);
6201
6668
  async function rebuildCode(watchPath, followSymlinks = false) {
6202
6669
  try {
@@ -6248,8 +6715,8 @@ async function rebuildCode(watchPath, followSymlinks = false) {
6248
6715
  labels.set(cid, `Community ${cid}`);
6249
6716
  }
6250
6717
  const questions = suggestQuestions2(G, communities, labels);
6251
- const outDir = (0, import_node_path8.resolve)(watchPath, "graphify-out");
6252
- (0, import_node_fs10.mkdirSync)(outDir, { recursive: true });
6718
+ const outDir = (0, import_node_path9.resolve)(watchPath, "graphify-out");
6719
+ (0, import_node_fs11.mkdirSync)(outDir, { recursive: true });
6253
6720
  const report = generate2(
6254
6721
  G,
6255
6722
  communities,
@@ -6262,11 +6729,11 @@ async function rebuildCode(watchPath, followSymlinks = false) {
6262
6729
  watchPath,
6263
6730
  questions
6264
6731
  );
6265
- (0, import_node_fs10.writeFileSync)((0, import_node_path8.resolve)(outDir, "GRAPH_REPORT.md"), report, "utf-8");
6266
- toJson2(G, communities, (0, import_node_path8.resolve)(outDir, "graph.json"));
6267
- const flagPath = (0, import_node_path8.resolve)(outDir, "needs_update");
6268
- if ((0, import_node_fs10.existsSync)(flagPath)) {
6269
- (0, import_node_fs10.unlinkSync)(flagPath);
6732
+ (0, import_node_fs11.writeFileSync)((0, import_node_path9.resolve)(outDir, "GRAPH_REPORT.md"), report, "utf-8");
6733
+ toJson2(G, communities, (0, import_node_path9.resolve)(outDir, "graph.json"), { communityLabels: labels });
6734
+ const flagPath = (0, import_node_path9.resolve)(outDir, "needs_update");
6735
+ if ((0, import_node_fs11.existsSync)(flagPath)) {
6736
+ (0, import_node_fs11.unlinkSync)(flagPath);
6270
6737
  }
6271
6738
  console.log(
6272
6739
  `[graphify watch] Rebuilt: ${G.order} nodes, ${G.size} edges, ${communities.size} communities`
@@ -6283,10 +6750,10 @@ async function rebuildCode(watchPath, followSymlinks = false) {
6283
6750
  }
6284
6751
  }
6285
6752
  function notifyOnly(watchPath) {
6286
- const outDir = (0, import_node_path8.resolve)(watchPath, "graphify-out");
6287
- (0, import_node_fs10.mkdirSync)(outDir, { recursive: true });
6288
- const flagPath = (0, import_node_path8.resolve)(outDir, "needs_update");
6289
- (0, import_node_fs10.writeFileSync)(flagPath, "1", "utf-8");
6753
+ const outDir = (0, import_node_path9.resolve)(watchPath, "graphify-out");
6754
+ (0, import_node_fs11.mkdirSync)(outDir, { recursive: true });
6755
+ const flagPath = (0, import_node_path9.resolve)(outDir, "needs_update");
6756
+ (0, import_node_fs11.writeFileSync)(flagPath, "1", "utf-8");
6290
6757
  console.log(`
6291
6758
  [graphify watch] New or changed files detected in ${watchPath}`);
6292
6759
  console.log(
@@ -6298,7 +6765,7 @@ function notifyOnly(watchPath) {
6298
6765
  console.log(`[graphify watch] Flag written to ${flagPath}`);
6299
6766
  }
6300
6767
  function hasNonCode(changedPaths) {
6301
- return changedPaths.some((p) => !CODE_EXTENSIONS3.has((0, import_node_path8.extname)(p).toLowerCase()));
6768
+ return changedPaths.some((p) => !CODE_EXTENSIONS.has((0, import_node_path9.extname)(p).toLowerCase()));
6302
6769
  }
6303
6770
  async function watch(watchPath, debounce = 3) {
6304
6771
  let chokidar;
@@ -6307,7 +6774,7 @@ async function watch(watchPath, debounce = 3) {
6307
6774
  } catch {
6308
6775
  throw new Error("chokidar not installed. Run: npm install chokidar");
6309
6776
  }
6310
- const resolvedPath = (0, import_node_path8.resolve)(watchPath);
6777
+ const resolvedPath = (0, import_node_path9.resolve)(watchPath);
6311
6778
  let lastTrigger = 0;
6312
6779
  let pending = false;
6313
6780
  const changed = /* @__PURE__ */ new Set();
@@ -6322,7 +6789,7 @@ async function watch(watchPath, debounce = 3) {
6322
6789
  ]
6323
6790
  });
6324
6791
  watcher.on("all", (_event, filePath) => {
6325
- const ext = (0, import_node_path8.extname)(filePath).toLowerCase();
6792
+ const ext = (0, import_node_path9.extname)(filePath).toLowerCase();
6326
6793
  if (!WATCHED_EXTENSIONS.has(ext)) return;
6327
6794
  const parts = filePath.split("/");
6328
6795
  if (parts.some((part) => part.startsWith(".") && part !== ".")) return;
@@ -6361,7 +6828,7 @@ async function watch(watchPath, debounce = 3) {
6361
6828
  process.on("SIGINT", cleanup);
6362
6829
  process.on("SIGTERM", cleanup);
6363
6830
  }
6364
- var isDirectExecution3 = typeof process !== "undefined" && typeof process.argv[1] === "string" && /^watch\.(?:js|mjs|cjs|ts)$/.test((0, import_node_path8.basename)(process.argv[1]));
6831
+ var isDirectExecution3 = typeof process !== "undefined" && typeof process.argv[1] === "string" && /^watch\.(?:js|mjs|cjs|ts)$/.test((0, import_node_path9.basename)(process.argv[1]));
6365
6832
  if (isDirectExecution3) {
6366
6833
  const watchPath = process.argv[2] ?? ".";
6367
6834
  const debounce = process.argv[3] ? parseFloat(process.argv[3]) : 3;
@@ -6374,8 +6841,10 @@ if (isDirectExecution3) {
6374
6841
  0 && (module.exports = {
6375
6842
  FileType,
6376
6843
  assertValid,
6844
+ augmentDetectionWithTranscripts,
6377
6845
  build,
6378
6846
  buildFromJson,
6847
+ buildWhisperPrompt,
6379
6848
  checkSemanticCache,
6380
6849
  classifyFile,
6381
6850
  cluster,
@@ -6383,6 +6852,7 @@ if (isDirectExecution3) {
6383
6852
  collectFiles,
6384
6853
  detect,
6385
6854
  detectIncremental,
6855
+ downloadAudio,
6386
6856
  extract,
6387
6857
  fileHash,
6388
6858
  generateReport,
@@ -6412,6 +6882,8 @@ if (isDirectExecution3) {
6412
6882
  toJson,
6413
6883
  toSvg,
6414
6884
  toWiki,
6885
+ transcribe,
6886
+ transcribeAll,
6415
6887
  validateExtraction,
6416
6888
  validateGraphPath,
6417
6889
  validateUrl,