opencode-diane 0.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80) hide show
  1. package/CHANGELOG.md +180 -0
  2. package/LICENSE +21 -0
  3. package/README.md +206 -0
  4. package/WIKI.md +1430 -0
  5. package/dist/index.d.ts +28 -0
  6. package/dist/index.js +1632 -0
  7. package/dist/ingest/adaptive.d.ts +47 -0
  8. package/dist/ingest/adaptive.js +182 -0
  9. package/dist/ingest/code-health.d.ts +58 -0
  10. package/dist/ingest/code-health.js +202 -0
  11. package/dist/ingest/code-map.d.ts +71 -0
  12. package/dist/ingest/code-map.js +670 -0
  13. package/dist/ingest/cross-refs.d.ts +59 -0
  14. package/dist/ingest/cross-refs.js +1207 -0
  15. package/dist/ingest/docs.d.ts +49 -0
  16. package/dist/ingest/docs.js +325 -0
  17. package/dist/ingest/git.d.ts +77 -0
  18. package/dist/ingest/git.js +390 -0
  19. package/dist/ingest/live-session.d.ts +101 -0
  20. package/dist/ingest/live-session.js +173 -0
  21. package/dist/ingest/project-notes.d.ts +28 -0
  22. package/dist/ingest/project-notes.js +102 -0
  23. package/dist/ingest/project.d.ts +35 -0
  24. package/dist/ingest/project.js +430 -0
  25. package/dist/ingest/session-snapshot.d.ts +63 -0
  26. package/dist/ingest/session-snapshot.js +94 -0
  27. package/dist/ingest/sessions.d.ts +29 -0
  28. package/dist/ingest/sessions.js +164 -0
  29. package/dist/ingest/tables.d.ts +52 -0
  30. package/dist/ingest/tables.js +360 -0
  31. package/dist/mining/skill-miner.d.ts +53 -0
  32. package/dist/mining/skill-miner.js +234 -0
  33. package/dist/search/bm25.d.ts +81 -0
  34. package/dist/search/bm25.js +334 -0
  35. package/dist/search/e5-embedder.d.ts +30 -0
  36. package/dist/search/e5-embedder.js +91 -0
  37. package/dist/search/embed-pass.d.ts +26 -0
  38. package/dist/search/embed-pass.js +43 -0
  39. package/dist/search/embedder.d.ts +58 -0
  40. package/dist/search/embedder.js +85 -0
  41. package/dist/search/inverted-index.d.ts +51 -0
  42. package/dist/search/inverted-index.js +139 -0
  43. package/dist/search/ppr.d.ts +44 -0
  44. package/dist/search/ppr.js +118 -0
  45. package/dist/search/tokenize.d.ts +26 -0
  46. package/dist/search/tokenize.js +98 -0
  47. package/dist/store/eviction.d.ts +16 -0
  48. package/dist/store/eviction.js +37 -0
  49. package/dist/store/repository.d.ts +222 -0
  50. package/dist/store/repository.js +420 -0
  51. package/dist/store/sqlite-store.d.ts +89 -0
  52. package/dist/store/sqlite-store.js +252 -0
  53. package/dist/store/vector-store.d.ts +66 -0
  54. package/dist/store/vector-store.js +160 -0
  55. package/dist/types.d.ts +385 -0
  56. package/dist/types.js +9 -0
  57. package/dist/utils/file-log.d.ts +87 -0
  58. package/dist/utils/file-log.js +215 -0
  59. package/dist/utils/peer-detection.d.ts +45 -0
  60. package/dist/utils/peer-detection.js +90 -0
  61. package/dist/utils/shell.d.ts +43 -0
  62. package/dist/utils/shell.js +110 -0
  63. package/dist/utils/usage-skill.d.ts +42 -0
  64. package/dist/utils/usage-skill.js +129 -0
  65. package/dist/utils/xlsx.d.ts +36 -0
  66. package/dist/utils/xlsx.js +270 -0
  67. package/grammars/tree-sitter-c.wasm +0 -0
  68. package/grammars/tree-sitter-c_sharp.wasm +0 -0
  69. package/grammars/tree-sitter-cpp.wasm +0 -0
  70. package/grammars/tree-sitter-css.wasm +0 -0
  71. package/grammars/tree-sitter-go.wasm +0 -0
  72. package/grammars/tree-sitter-html.wasm +0 -0
  73. package/grammars/tree-sitter-java.wasm +0 -0
  74. package/grammars/tree-sitter-javascript.wasm +0 -0
  75. package/grammars/tree-sitter-json.wasm +0 -0
  76. package/grammars/tree-sitter-php.wasm +0 -0
  77. package/grammars/tree-sitter-python.wasm +0 -0
  78. package/grammars/tree-sitter-rust.wasm +0 -0
  79. package/grammars/tree-sitter-typescript.wasm +0 -0
  80. package/package.json +80 -0
@@ -0,0 +1,1207 @@
1
+ /**
2
+ * cross-refs.ts — grammar-agnostic edge discovery between files.
3
+ *
4
+ * The hard part isn't finding connections; the hard part is finding
5
+ * them with FEW false positives. Single-signal heuristics (regex
6
+ * "import" detection, free-text identifier mentions) all have
7
+ * meaningful FP rates that mislead the agent's navigation. This
8
+ * ingester uses MULTI-SIGNAL CORROBORATION:
9
+ *
10
+ * - Filesystem-grounded signals (path-resolves-to-existing-file)
11
+ * emit edges alone — the disk grounds them.
12
+ * - Lexical signals (identifier mention) only emit edges when
13
+ * corroborated by a SECOND orthogonal signal (rarity + import-
14
+ * line context, OR rarity + filename-class coupling).
15
+ *
16
+ * Four passes:
17
+ *
18
+ * 1. Definition extraction (language-keyed regex). Builds a map
19
+ * `identifier → Set<defining-file>`.
20
+ * 2. Import-path resolution. For each file, find import-like lines,
21
+ * try to resolve the named module/path to an actual file under
22
+ * the project root. Existing → edge.
23
+ * 3. Config path-strings. For .json/.yaml/.toml, walk string values
24
+ * and try to resolve them as relative file paths. Existing → edge.
25
+ * 4. Corroborated identifier mentions. Tokenise each file line by
26
+ * line; for tokens that are defined elsewhere AND rarity-gated,
27
+ * emit an edge only when *also* corroborated by either an
28
+ * import-line context or filename-class coupling.
29
+ *
30
+ * Edges with the same (source, target) pair are merged; the evidence
31
+ * list grows but only one memory is written per edge.
32
+ *
33
+ * **What this catches that code-map doesn't:** any cross-file
34
+ * connection in a language tree-sitter doesn't have a grammar for
35
+ * (Ruby, Pascal, Perl, Lua, Elixir, Erlang, Tcl, Nim, Zig, Swift,
36
+ * Kotlin, Scala, Haskell, OCaml, F#, Clojure, VB, …), plus
37
+ * config-style cross-references in JSON/YAML/TOML files.
38
+ *
39
+ * **What it doesn't catch:** dynamic loads (`require(varName)`),
40
+ * reflection-based dispatch, anything where the connection only
41
+ * exists at runtime. Those are out of scope for any static technique.
42
+ */
43
+ import { readdir, readFile, stat } from "node:fs/promises";
44
+ import { join, relative, sep, extname, dirname, basename } from "node:path";
45
+ const CATEGORY = "project-facts";
46
+ const SKIP_DIRS = new Set([
47
+ "node_modules",
48
+ ".git",
49
+ "dist",
50
+ "build",
51
+ "out",
52
+ "target",
53
+ ".next",
54
+ "coverage",
55
+ ".cache",
56
+ "vendor",
57
+ "tmp",
58
+ "__pycache__",
59
+ ]);
60
+ /* ── caps ──────────────────────────────────────────────────────────── */
61
+ const MAX_FILES = 2000;
62
+ const MAX_FILE_BYTES = 256 * 1024;
63
+ const MAX_DEFS_PER_FILE = 200;
64
+ const MAX_TOKENS_PER_FILE = 50_000; // tokenisation guard for huge files
65
+ const MAX_EDGES_TOTAL = 10_000;
66
+ /* ── language-keyed definition patterns ────────────────────────────── */
67
+ /*
68
+ * Each pattern uses /m (multiline ^/$) with one capture group: the
69
+ * identifier name. `g` so we can iterate matches with matchAll.
70
+ *
71
+ * Coverage notes per language. The list is curated by what we observe
72
+ * in real code, not what a language grammar would consider complete.
73
+ */
74
+ const DEFINITION_PATTERNS = {
75
+ // Ruby — class, module, def. `self.` prefix for class methods kept
76
+ // out of the capture so `UserService.create` mentions match.
77
+ ".rb": [
78
+ /^\s*class\s+([A-Z][\w:]*)\b/gm,
79
+ /^\s*module\s+([A-Z][\w:]*)\b/gm,
80
+ /^\s*def\s+(?:self\.)?([\w?!]+)/gm,
81
+ ],
82
+ // Pascal / Object Pascal / Delphi. Case-insensitive — Pascal is
83
+ // historically case-insensitive and real code mixes `Procedure`,
84
+ // `procedure`, `PROCEDURE`.
85
+ ".pas": [
86
+ /^\s*unit\s+(\w+)\s*;/gim,
87
+ /^\s*procedure\s+(\w+)/gim,
88
+ /^\s*function\s+(\w+)/gim,
89
+ /\btype\s+(\w+)\s*=\s*(?:class|record|interface|object)\b/gim,
90
+ ],
91
+ ".pp": [
92
+ /^\s*unit\s+(\w+)\s*;/gim,
93
+ /^\s*procedure\s+(\w+)/gim,
94
+ /^\s*function\s+(\w+)/gim,
95
+ /\btype\s+(\w+)\s*=\s*(?:class|record|interface|object)\b/gim,
96
+ ],
97
+ ".dpr": [/^\s*program\s+(\w+)\s*;/gim],
98
+ // Perl — package, sub. Package names can contain `::` (kept in capture).
99
+ ".pl": [
100
+ /^\s*package\s+([\w:]+)\s*;/gm,
101
+ /^\s*sub\s+(\w+)/gm,
102
+ ],
103
+ ".pm": [
104
+ /^\s*package\s+([\w:]+)\s*;/gm,
105
+ /^\s*sub\s+(\w+)/gm,
106
+ ],
107
+ // Lua — function, method, local function. `function Module.name` and
108
+ // `function Module:name` both expose `name`.
109
+ ".lua": [
110
+ /^\s*(?:local\s+)?function\s+(\w+)/gm,
111
+ /^\s*function\s+\w+[.:](\w+)/gm,
112
+ /^\s*(\w+)\s*=\s*function\b/gm,
113
+ ],
114
+ // Elixir — defmodule (dotted), def/defp/defmacro.
115
+ ".ex": [
116
+ /^\s*defmodule\s+([\w.]+)/gm,
117
+ /^\s*defp?\s+(\w+)/gm,
118
+ /^\s*defmacrop?\s+(\w+)/gm,
119
+ ],
120
+ ".exs": [
121
+ /^\s*defmodule\s+([\w.]+)/gm,
122
+ /^\s*defp?\s+(\w+)/gm,
123
+ ],
124
+ // Erlang — module declaration, column-0 function clauses (exported or not).
125
+ ".erl": [
126
+ /^-module\(([\w_]+)\)/gm,
127
+ /^([a-z]\w*)\s*\([^)]*\)\s*->/gm, // function clause
128
+ ],
129
+ // Swift, Kotlin, Scala, Dart, Zig, Nim — class/func/struct + a few.
130
+ ".swift": [
131
+ /^\s*(?:public\s+|private\s+|internal\s+|fileprivate\s+|open\s+)?(?:class|struct|enum|protocol)\s+([A-Z]\w*)\b/gm,
132
+ /^\s*(?:public\s+|private\s+|internal\s+)?func\s+(\w+)/gm,
133
+ ],
134
+ ".kt": [
135
+ /^\s*(?:public\s+|private\s+|internal\s+|protected\s+|open\s+|abstract\s+)*(?:class|interface|object|enum)\s+([A-Z]\w*)\b/gm,
136
+ /^\s*(?:public\s+|private\s+|internal\s+)?fun\s+(\w+)/gm,
137
+ ],
138
+ ".kts": [
139
+ /^\s*(?:public\s+|private\s+|internal\s+)*(?:class|interface|object)\s+([A-Z]\w*)\b/gm,
140
+ /^\s*(?:public\s+|private\s+|internal\s+)?fun\s+(\w+)/gm,
141
+ ],
142
+ ".scala": [
143
+ /^\s*(?:abstract\s+)?(?:class|trait|object|enum)\s+([A-Z]\w*)\b/gm,
144
+ /^\s*def\s+(\w+)/gm,
145
+ ],
146
+ ".dart": [
147
+ /^\s*(?:abstract\s+)?class\s+([A-Z]\w*)\b/gm,
148
+ /^\s*(?:[A-Z]\w*\s+)?(\w+)\s*\([^)]*\)\s*(?:async\s*)?(?:=>|\{)/gm,
149
+ ],
150
+ ".zig": [
151
+ /^\s*(?:pub\s+)?fn\s+(\w+)/gm,
152
+ /^\s*(?:pub\s+)?const\s+([A-Z]\w*)\s*=\s*struct\b/gm,
153
+ ],
154
+ ".nim": [
155
+ /^\s*proc\s+(\w+)/gm,
156
+ /^\s*type\s+(\w+)\b/gm,
157
+ ],
158
+ // OCaml / F# / Haskell / Clojure — module-style declarations.
159
+ ".ml": [
160
+ /^\s*module\s+([A-Z]\w*)\b/gm,
161
+ /^\s*let\s+(\w+)/gm,
162
+ ],
163
+ ".mli": [/^\s*module\s+([A-Z]\w*)\b/gm, /^\s*val\s+(\w+)/gm],
164
+ ".fs": [
165
+ /^\s*module\s+([A-Z]\w*)\b/gm,
166
+ /^\s*let\s+(\w+)/gm,
167
+ /^\s*type\s+([A-Z]\w*)\b/gm,
168
+ ],
169
+ ".hs": [
170
+ /^\s*module\s+([A-Z][\w.]*)\s+(?:where|\()/gm,
171
+ /^([a-z]\w*)\s*::/gm, // top-level type signature
172
+ ],
173
+ ".clj": [/^\s*\(ns\s+([\w.-]+)/gm, /^\s*\(defn-?\s+([\w?!-]+)/gm],
174
+ // Visual Basic, Tcl, R — for completeness.
175
+ ".vb": [
176
+ /^\s*(?:Public\s+|Private\s+|Friend\s+)?(?:Class|Module|Interface|Structure)\s+(\w+)/gim,
177
+ /^\s*(?:Public\s+|Private\s+)?(?:Sub|Function)\s+(\w+)/gim,
178
+ ],
179
+ ".tcl": [/^\s*proc\s+(\w+)/gm],
180
+ ".r": [/^\s*(\w+)\s*<-\s*function\b/gm],
181
+ // Shell scripts — function definitions.
182
+ ".sh": [
183
+ /^\s*(?:function\s+)?(\w+)\s*\(\s*\)\s*\{/gm,
184
+ ],
185
+ ".bash": [
186
+ /^\s*(?:function\s+)?(\w+)\s*\(\s*\)\s*\{/gm,
187
+ ],
188
+ // ── Crystal — Ruby-family with explicit struct keyword ───────────
189
+ // Same surface as Ruby but adds `struct` and supports `def self.x`.
190
+ // We capture leaf names; the rarity gate filters out generic ones.
191
+ ".cr": [
192
+ /^\s*class\s+([A-Z]\w*)/gm,
193
+ /^\s*struct\s+([A-Z]\w*)/gm,
194
+ /^\s*module\s+([A-Z]\w*)/gm,
195
+ /^\s*def\s+(?:self\.)?(\w+[!?=]?)/gm,
196
+ ],
197
+ // ── Julia — function / module / struct (immutable + mutable) ─────
198
+ ".jl": [
199
+ /^\s*function\s+(\w+)/gm,
200
+ /^\s*module\s+([A-Z]\w*)/gm,
201
+ /^\s*(?:mutable\s+)?struct\s+([A-Z]\w*)/gm,
202
+ /^\s*abstract\s+type\s+([A-Z]\w*)/gm,
203
+ ],
204
+ // ── GraphQL — schema types, all the kinds the spec defines ───────
205
+ // SDL is line-oriented for declarations: each definition starts at
206
+ // column 0 with the kind keyword. We cover all six declaration
207
+ // forms. The DSL is a true definition language — types reference
208
+ // each other in field positions (`field: OtherType`), which our
209
+ // mention-based pass picks up downstream.
210
+ ".graphql": [
211
+ /^\s*type\s+([A-Z]\w*)/gm,
212
+ /^\s*input\s+([A-Z]\w*)/gm,
213
+ /^\s*interface\s+([A-Z]\w*)/gm,
214
+ /^\s*enum\s+([A-Z]\w*)/gm,
215
+ /^\s*union\s+([A-Z]\w*)\s*=/gm,
216
+ /^\s*scalar\s+([A-Z]\w*)/gm,
217
+ ],
218
+ ".gql": [
219
+ /^\s*type\s+([A-Z]\w*)/gm,
220
+ /^\s*input\s+([A-Z]\w*)/gm,
221
+ /^\s*interface\s+([A-Z]\w*)/gm,
222
+ /^\s*enum\s+([A-Z]\w*)/gm,
223
+ /^\s*union\s+([A-Z]\w*)\s*=/gm,
224
+ /^\s*scalar\s+([A-Z]\w*)/gm,
225
+ ],
226
+ // ── Protocol Buffers — message / service / enum + import paths ───
227
+ // Proto files have `import "other.proto";` lines that the path-
228
+ // resolved-string pass already picks up. Definitions here let the
229
+ // mention-based pass connect proto files that share types.
230
+ ".proto": [
231
+ /^\s*message\s+([A-Z]\w*)/gm,
232
+ /^\s*service\s+([A-Z]\w*)/gm,
233
+ /^\s*enum\s+([A-Z]\w*)/gm,
234
+ ],
235
+ // ── Thrift — struct / service / exception / enum ─────────────────
236
+ // Like proto but with exception types. Has `include "other.thrift"`
237
+ // import statements that the path pass picks up.
238
+ ".thrift": [
239
+ /^\s*struct\s+([A-Z]\w*)/gm,
240
+ /^\s*service\s+([A-Z]\w*)/gm,
241
+ /^\s*enum\s+([A-Z]\w*)/gm,
242
+ /^\s*exception\s+([A-Z]\w*)/gm,
243
+ /^\s*union\s+([A-Z]\w*)/gm,
244
+ ],
245
+ // ── Verilog / SystemVerilog ──────────────────────────────────────
246
+ // Verilog modules are lowercase by convention; can't reuse the
247
+ // capital-letter `module` pattern from Ruby/OCaml. Anchored by the
248
+ // trailing `#`, `(`, or `;` that follows a real module header.
249
+ ".v": [
250
+ /^\s*module\s+(\w+)\s*[#(;]/gm,
251
+ ],
252
+ ".sv": [
253
+ /^\s*module\s+(\w+)\s*[#(;]/gm,
254
+ /^\s*(?:virtual\s+)?class\s+(\w+)\s*[#(;:]/gm,
255
+ /^\s*interface\s+(\w+)\s*[#(;]/gm,
256
+ /^\s*package\s+(\w+)\s*;/gm,
257
+ ],
258
+ ".vh": [/^\s*module\s+(\w+)\s*[#(;]/gm],
259
+ ".svh": [
260
+ /^\s*(?:virtual\s+)?class\s+(\w+)\s*[#(;:]/gm,
261
+ /^\s*interface\s+(\w+)\s*[#(;]/gm,
262
+ /^\s*package\s+(\w+)\s*;/gm,
263
+ ],
264
+ // ── VHDL ─────────────────────────────────────────────────────────
265
+ // VHDL is case-insensitive (canonically uppercase keywords). The
266
+ // `is`/`of` anchor at the end pins the definition shape and keeps
267
+ // these from matching Ruby/JS `entity` mentions etc.
268
+ ".vhd": [
269
+ /^\s*entity\s+(\w+)\s+is\b/gim,
270
+ /^\s*architecture\s+(\w+)\s+of\s+\w+\s+is\b/gim,
271
+ /^\s*package\s+(\w+)\s+is\b/gim,
272
+ /^\s*configuration\s+(\w+)\s+of\b/gim,
273
+ ],
274
+ ".vhdl": [
275
+ /^\s*entity\s+(\w+)\s+is\b/gim,
276
+ /^\s*architecture\s+(\w+)\s+of\s+\w+\s+is\b/gim,
277
+ /^\s*package\s+(\w+)\s+is\b/gim,
278
+ /^\s*configuration\s+(\w+)\s+of\b/gim,
279
+ ],
280
+ // ── COBOL ────────────────────────────────────────────────────────
281
+ // PROGRAM-ID is the canonical identifier of a COBOL program;
282
+ // section / paragraph names are too noisy to capture wholesale.
283
+ // The `\.` terminator after PROGRAM-ID is required by the
284
+ // grammar — keeps the pattern from matching prose.
285
+ ".cob": [/^\s*PROGRAM-ID\.\s+([\w-]+)\s*\.?/gim],
286
+ ".cbl": [/^\s*PROGRAM-ID\.\s+([\w-]+)\s*\.?/gim],
287
+ ".cpy": [/^\s*PROGRAM-ID\.\s+([\w-]+)\s*\.?/gim],
288
+ // ── Fortran (modern: free-form .f90+) ────────────────────────────
289
+ // Fixed-form .f / .for is column-sensitive and not worth the
290
+ // complexity. Modern Fortran is line-anchored and well-served by
291
+ // these patterns.
292
+ ".f90": [
293
+ /^\s*subroutine\s+(\w+)/gim,
294
+ /^\s*(?:[\w\s(),:*]+?\s+)?function\s+(\w+)\s*\(/gim,
295
+ /^\s*module\s+(\w+)/gim,
296
+ /^\s*program\s+(\w+)/gim,
297
+ ],
298
+ ".f95": [
299
+ /^\s*subroutine\s+(\w+)/gim,
300
+ /^\s*(?:[\w\s(),:*]+?\s+)?function\s+(\w+)\s*\(/gim,
301
+ /^\s*module\s+(\w+)/gim,
302
+ /^\s*program\s+(\w+)/gim,
303
+ ],
304
+ ".f03": [
305
+ /^\s*subroutine\s+(\w+)/gim,
306
+ /^\s*module\s+(\w+)/gim,
307
+ ],
308
+ ".f08": [
309
+ /^\s*subroutine\s+(\w+)/gim,
310
+ /^\s*module\s+(\w+)/gim,
311
+ ],
312
+ // ── Solidity ─────────────────────────────────────────────────────
313
+ // `contract` / `interface` / `library` are the three top-level
314
+ // declaration kinds. Existing `import "..."` pattern already covers
315
+ // Solidity imports; no new import pattern needed.
316
+ ".sol": [
317
+ /^\s*(?:abstract\s+)?contract\s+([A-Z]\w*)/gm,
318
+ /^\s*interface\s+([A-Z]\w*)/gm,
319
+ /^\s*library\s+([A-Z]\w*)/gm,
320
+ /^\s*function\s+(\w+)\s*\(/gm,
321
+ ],
322
+ // ── Vim script ───────────────────────────────────────────────────
323
+ // `function!` is the redefining form, `function` the strict one;
324
+ // both name a function. Optional scope prefix (`s:`, `g:`, `b:`,
325
+ // `w:`, `t:`) is stripped from the capture so the name is the leaf
326
+ // — matches how a `:call MyFunc()` reference appears.
327
+ ".vim": [/^\s*function!?\s+(?:[sgbwt]:)?(\w+)/gm],
328
+ // ── D ───────────────────────────────────────────────────────────
329
+ // D is C-family with a distinct `module` declaration. An extension
330
+ // with its own entry never falls back to GENERIC, so the class/
331
+ // struct/interface/enum/template pattern is listed here next to
332
+ // the module decl (`module pkg.thing;`). D's `import` is covered
333
+ // by the existing JS-style import pattern.
334
+ ".d": [
335
+ /^\s*module\s+([\w.]+)\s*;/gm,
336
+ /^\s*(?:public\s+|private\s+)?(?:class|struct|interface|enum|template)\s+([A-Z]\w*)/gm,
337
+ ],
338
+ // ── Smalltalk ────────────────────────────────────────────────────
339
+ // The canonical "class A subclass: #B" form. Other Smalltalk
340
+ // dialects (Pharo class definitions across multiple lines) are
341
+ // out of scope for a regex pass — too easy to FP. This single
342
+ // pattern is high-precision.
343
+ ".st": [/^[A-Z][\w]*\s+subclass:\s*#(\w+)/gm],
344
+ // ── Racket / Scheme / Common Lisp ────────────────────────────────
345
+ // Parenthesised forms; we anchor at start of an open paren on the
346
+ // line. Lisp identifiers allow many extra chars (`!`, `?`, `+`,
347
+ // `-`, `*`, `/`, `=`, `<`, `>`). Multi-line definitions are common
348
+ // but the form keyword + name is on the first line — that's what
349
+ // we capture.
350
+ ".rkt": [
351
+ /^\s*\(define(?:-struct)?\s+\(?([\w!?+\-*/=<>]+)/gm,
352
+ /^\s*\(provide\s+([\w!?+\-*/=<>]+)/gm,
353
+ ],
354
+ ".scm": [/^\s*\(define\s+\(?([\w!?+\-*/=<>]+)/gm],
355
+ ".ss": [/^\s*\(define\s+\(?([\w!?+\-*/=<>]+)/gm],
356
+ ".lisp": [
357
+ /^\s*\(defun\s+([\w!?+\-*/=<>]+)/gm,
358
+ /^\s*\(defmacro\s+([\w!?+\-*/=<>]+)/gm,
359
+ /^\s*\(defclass\s+([\w!?+\-*/=<>]+)/gm,
360
+ /^\s*\(defstruct\s+([\w!?+\-*/=<>]+)/gm,
361
+ /^\s*\(defpackage\s+:?([\w!?+\-*/=<>]+)/gm,
362
+ ],
363
+ ".cl": [
364
+ /^\s*\(defun\s+([\w!?+\-*/=<>]+)/gm,
365
+ /^\s*\(defclass\s+([\w!?+\-*/=<>]+)/gm,
366
+ ],
367
+ // ── Modula-2 ─────────────────────────────────────────────────────
368
+ // Modula-2 keywords are uppercase by convention. The MODULE
369
+ // declaration is the canonical identifier; PROCEDUREs inside are
370
+ // captured too. Niche but the user explicitly asked for languages
371
+ // without reliable tree-sitter grammars.
372
+ ".mod": [
373
+ /^\s*MODULE\s+(\w+)\s*;/gm,
374
+ /^\s*PROCEDURE\s+(\w+)/gm,
375
+ ],
376
+ ".m2": [
377
+ /^\s*MODULE\s+(\w+)\s*;/gm,
378
+ /^\s*PROCEDURE\s+(\w+)/gm,
379
+ ],
380
+ // ── Ada ──────────────────────────────────────────────────────────
381
+ // `package Foo is` and `procedure Foo is`. A `with Foo;`
382
+ // import-line pattern would cover Ada imports, but none exists yet;
383
+ // skipped for now — Ada is rare enough that demand will surface
384
+ // the need before we speculate.
385
+ ".adb": [
386
+ /^\s*package(?:\s+body)?\s+([\w.]+)\s+is\b/gim,
387
+ /^\s*procedure\s+([\w.]+)/gim,
388
+ ],
389
+ ".ads": [
390
+ /^\s*package(?:\s+body)?\s+([\w.]+)\s+is\b/gim,
391
+ /^\s*procedure\s+([\w.]+)/gim,
392
+ /^\s*function\s+(\w+)/gim,
393
+ ],
394
+ };
395
+ /** Languages we DON'T have a dedicated pattern set for: try a small
396
+ * generic set so we still get something. Conservative — only the
397
+ * most universally-marked forms. */
398
+ const GENERIC_DEFINITION_PATTERNS = [
399
+ /^\s*(?:public\s+|private\s+|protected\s+|abstract\s+|static\s+)*(?:class|interface|trait|enum|struct)\s+([A-Z]\w*)\b/gm,
400
+ /^\s*(?:def|func|fn|function|defn|defmodule)\s+(\w+)/gm,
401
+ ];
402
+ /* ── import-line patterns ──────────────────────────────────────────── */
403
+ /*
404
+ * Each pattern captures a string that names a module or file path the
405
+ * importing file depends on. The resolver below tries to map the
406
+ * captured text to an actual file under the project root.
407
+ */
408
+ const IMPORT_PATTERNS = [
409
+ // Python: `from foo.bar import X` (capture `foo.bar`)
410
+ /^\s*from\s+([\w.]+)\s+import\b/gm,
411
+ // Python / Java / Go / Kotlin: `import foo.bar`
412
+ /^\s*import\s+(?:[\w*{}\s,]+from\s+)?["']?([\w./@-]+)["']?/gm,
413
+ // Ruby: `require 'foo'` and `require_relative './foo'`
414
+ /^\s*require(?:_relative)?\s+["']([^"']+)["']/gm,
415
+ // Rust / Elixir / Perl / PHP: `use foo::Bar`
416
+ /^\s*use\s+([\w:.]+)/gm,
417
+ // C / C++: `#include "foo.h"` or `<foo.h>`
418
+ /^\s*#\s*include\s+[<"]([^>"]+)[>"]/gm,
419
+ // Pascal: `uses Math, SysUtils, MyUnit;` — multiple names in one line
420
+ /^\s*uses\s+([\w\s,]+);/gim,
421
+ // OCaml / F#: `open Foo`
422
+ /^\s*open\s+([\w.]+)/gm,
423
+ // Lua: `require 'foo'`
424
+ /^\s*(?:local\s+\w+\s*=\s*)?require\s*\(?\s*["']([^"']+)["']/gm,
425
+ // Elixir: `alias MyApp.Module` (`import MyApp.Module` is matched by the generic `import` pattern above)
426
+ /^\s*alias\s+([\w.]+)/gm,
427
+ // Shell: `source path/to/script.sh` and the `.` synonym.
428
+ // The trailing path can be quoted or bare.
429
+ /^\s*(?:source|\.)\s+["']?([^"'\s]+)["']?/gm,
430
+ // Verilog / SystemVerilog: `` `include "file.v" ``
431
+ // Backtick-prefix is unique to Verilog preprocessor directives —
432
+ // can't be confused with any other language. Captures the path.
433
+ /^\s*`include\s+["<]([^>"]+)[>"]/gm,
434
+ // COBOL: `COPY copybook.` or `COPY copybook.cpy.`
435
+ // The trailing period is COBOL's statement terminator — required
436
+ // by the grammar so this won't FP-match prose. Case-insensitive
437
+ // because real COBOL mixes cases despite convention.
438
+ /^\s*COPY\s+["']?([\w.-]+)["']?\s*\./gim,
439
+ // Vim script: `:runtime path/to/file.vim` (`!` is the bang variant)
440
+ // — the editor's analogue of `source` for files under the runtimepath.
441
+ /^\s*:?\s*runtime!?\s+(?:[\w/]+\s+)*([\w./~-]+\.vim)\b/gm,
442
+ ];
443
+ /* ── path-resolvable string detection (config files) ──────────────── */
444
+ const CONFIG_EXTS = new Set([
445
+ ".json", ".jsonc", ".json5",
446
+ ".yml", ".yaml",
447
+ ".toml",
448
+ // Terraform / OpenTofu — `source = "../modules/x"` is a path-string
449
+ // reference of exactly the kind this ingester catches. Treated as
450
+ // config (not source) since we don't extract HCL definitions.
451
+ ".tf", ".tfvars",
452
+ // Older Unix-style config files — INI sections rarely reference
453
+ // paths but when they do (`include=…`) we want to catch it.
454
+ ".ini", ".cfg", ".conf",
455
+ ]);
456
+ export async function ingestCrossRefs(repo, root, opts = {}) {
457
+ const rarityThreshold = Math.max(1, Math.round(opts.rarityThreshold ?? 3));
458
+ const maxFilesLimit = Math.max(1, Math.round(opts.maxFiles ?? MAX_FILES));
459
+ const maxEdgesLimit = Math.max(1, Math.round(opts.maxEdges ?? MAX_EDGES_TOTAL));
460
+ // ── Walk: collect all candidate files ──────────────────────────────
461
+ const allFiles = await collectFiles(root, maxFilesLimit);
462
+ // ── Pass 1: definitions per file → identifier → defining-files map ─
463
+ const defs = new Map();
464
+ // identifier → Set of defining files, used for filename-coupling
465
+ // resolution (only the file is kept; line numbers aren't needed for the edge logic).
466
+ let totalDefs = 0;
467
+ for (const f of allFiles) {
468
+ const ext = extname(f.rel).toLowerCase();
469
+ const patterns = DEFINITION_PATTERNS[ext] ?? GENERIC_DEFINITION_PATTERNS;
470
+ let perFileDefs = 0;
471
+ for (const pat of patterns) {
472
+ for (const m of f.content.matchAll(pat)) {
473
+ const ident = m[1];
474
+ if (!ident || !isUsefulIdentifier(ident))
475
+ continue;
476
+ if (!defs.has(ident))
477
+ defs.set(ident, new Set());
478
+ const set = defs.get(ident);
479
+ if (!set.has(f.rel)) {
480
+ set.add(f.rel);
481
+ perFileDefs += 1;
482
+ totalDefs += 1;
483
+ }
484
+ // For dotted (Elixir, Python) or double-colon (Perl) namespaced
485
+ // identifiers, ALSO register the trailing segment as defined
486
+ // here. Real-world reference sites use the trailing segment
487
+ // alone after an `alias`/`use` line:
488
+ //
489
+ // defmodule MyApp.Auth do ... ← lib/auth.ex
490
+ // alias MyApp.Auth ← lib/router.ex
491
+ // Auth.verify(token) ← uses bare "Auth"
492
+ //
493
+ // Without this, `Auth` is never in `defs` and the corroboration
494
+ // pass can't connect router.ex → auth.ex via filename coupling.
495
+ // Filename coupling + rarity is what keeps FP low.
496
+ if (/[.:]/.test(ident)) {
497
+ const segments = ident.split(/[.:]+/).filter((s) => s.length > 0);
498
+ const last = segments[segments.length - 1];
499
+ if (last && last !== ident && isUsefulIdentifier(last)) {
500
+ if (!defs.has(last))
501
+ defs.set(last, new Set());
502
+ const set2 = defs.get(last);
503
+ if (!set2.has(f.rel)) {
504
+ set2.add(f.rel);
505
+ totalDefs += 1;
506
+ }
507
+ }
508
+ }
509
+ if (perFileDefs >= MAX_DEFS_PER_FILE)
510
+ break;
511
+ }
512
+ if (perFileDefs >= MAX_DEFS_PER_FILE)
513
+ break;
514
+ }
515
+ }
516
+ // ── Edge accumulator. Same (src, tgt) merges. ──────────────────────
517
+ const edges = new Map();
518
+ const fileSet = new Set(allFiles.map((f) => f.rel));
519
+ // ── Pass 2: import-line path resolution ────────────────────────────
520
+ // The captured module/path is normalised to candidate relative paths
521
+ // and checked against fileSet. Existence is the gate.
522
+ for (const f of allFiles) {
523
+ for (const pat of IMPORT_PATTERNS) {
524
+ for (const m of f.content.matchAll(pat)) {
525
+ const raw = m[1];
526
+ if (!raw)
527
+ continue;
528
+ // Pascal `uses` lists are comma-separated.
529
+ const names = raw.includes(",") ? raw.split(",").map((s) => s.trim()) : [raw.trim()];
530
+ for (const name of names) {
531
+ const candidates = resolveImportToFiles(name, f.rel, fileSet);
532
+ for (const tgt of candidates) {
533
+ if (tgt === f.rel)
534
+ continue;
535
+ addEdge(edges, f.rel, tgt, "import-resolved", maxEdgesLimit);
536
+ }
537
+ }
538
+ }
539
+ }
540
+ }
541
+ // ── Pass 3: config-path-strings ────────────────────────────────────
542
+ for (const f of allFiles) {
543
+ if (!CONFIG_EXTS.has(extname(f.rel).toLowerCase()))
544
+ continue;
545
+ // Best-effort JSON parse for .json / .jsonc / .json5. YAML/TOML
546
+ // get a regex-based string-value extractor — full parsers would
547
+ // add a multi-MB dep for a small precision gain; we already gate
548
+ // on filesystem existence so a mis-extracted string is silently
549
+ // dropped, not a FP.
550
+ const stringValues = extractStringsFromConfig(f.content, extname(f.rel).toLowerCase());
551
+ for (const sv of stringValues) {
552
+ const tgt = resolveConfigPath(sv, f.rel, fileSet);
553
+ if (tgt && tgt !== f.rel) {
554
+ addEdge(edges, f.rel, tgt, "config-path", maxEdgesLimit);
555
+ }
556
+ }
557
+ }
558
+ // ── Pass 4: corroborated identifier mentions ───────────────────────
559
+ // For each file F, tokenise line-by-line. For each token that is
560
+ // defined elsewhere AND is rarity-gated, check the corroboration
561
+ // signals before emitting.
562
+ //
563
+ // Performance: each file is tokenised once. Each token is a hashmap
564
+ // lookup against `defs`. O(N × tokens_per_file).
565
+ for (const f of allFiles) {
566
+ if (edges.size >= maxEdgesLimit)
567
+ break;
568
+ const lines = f.content.split("\n");
569
+ const ext = extname(f.rel).toLowerCase();
570
+ let tokensSeen = 0;
571
+ for (let i = 0; i < lines.length; i++) {
572
+ if (tokensSeen >= MAX_TOKENS_PER_FILE)
573
+ break;
574
+ const line = lines[i];
575
+ // Skip comment-only lines so identifier mentions inside
576
+ // comments don't fire the corroboration gate. The trimmed-line
577
+ // check means inline trailing comments are still kept — we only
578
+ // skip when the WHOLE line is a comment.
579
+ if (lineIsCommentOnly(line, ext))
580
+ continue;
581
+ const isImportLine = lineLooksLikeImport(line);
582
+ // When an import-like line contains a literal path string
583
+ // (e.g. shell `source ./lib.sh`, Ruby `load './script.rb'`,
584
+ // shell `. /etc/foo.sh`), the path is a stronger edge signal
585
+ // than identifier matching — it's filesystem-grounded.
586
+ // Extract any `./…`, `../…`, or `/…` substring from the line,
587
+ // resolve against fileSet, and emit an edge if it lands on a
588
+ // real file. This is INDEPENDENT of the identifier rarity gate
589
+ // because the existence-on-disk check is the gate.
590
+ if (isImportLine) {
591
+ // Two alternatives OR'd together:
592
+ // (1) UNQUOTED leading-slash paths — `source ./lib.sh`,
593
+ // `. /etc/init.d/foo.sh`. Must start with `./`, `../`,
594
+ // or `/` (else any identifier could match).
595
+ // (2) QUOTED paths — `source('lib/stats.R')`,
596
+ // `load "vendor/x.rb"`. Inside `'…'`/`"…"`/backticks,
597
+ // any non-whitespace sequence ending in `.<ext>`
598
+ // qualifies; the filesystem-existence gate grounds it.
599
+ const pathInImport = /(?:^|[\s,(=])(\.{0,2}\/[\w./-]+\.\w{1,8})(?=[\s,;)`'"]|$)|['"`]([^'"`\s\\]{2,200}\.\w{1,8})['"`]/g;
600
+ let pm;
601
+ while ((pm = pathInImport.exec(line)) !== null) {
602
+ const cand = pm[1] ?? pm[2];
603
+ if (!cand)
604
+ continue;
605
+ const resolved = resolveConfigPath(cand, f.rel, fileSet);
606
+ if (resolved && resolved !== f.rel) {
607
+ addEdge(edges, f.rel, resolved, "import-path", maxEdgesLimit);
608
+ }
609
+ }
610
+ }
611
+ // word-shape tokens; skip everything else.
612
+ // Pre-split saves repeated regex work versus matchAll on every iteration.
613
+ const tokens = line.split(/[^\w:]+/).filter((t) => t.length > 0);
614
+ for (const tok of tokens) {
615
+ tokensSeen += 1;
616
+ if (tokensSeen >= MAX_TOKENS_PER_FILE)
617
+ break;
618
+ const definingFiles = defs.get(tok);
619
+ if (!definingFiles || definingFiles.size === 0)
620
+ continue;
621
+ // RARITY GATE: identifier defined in too many files is noise.
622
+ if (definingFiles.size > rarityThreshold)
623
+ continue;
624
+ for (const definedIn of definingFiles) {
625
+ if (definedIn === f.rel)
626
+ continue;
627
+ // CORROBORATION: import-line context OR filename-coupling.
628
+ const couples = filenameCouples(definedIn, tok);
629
+ if (!isImportLine && !couples)
630
+ continue;
631
+ const evidence = isImportLine
632
+ ? (couples ? "import+filename" : "import+identifier")
633
+ : "filename+identifier";
634
+ addEdge(edges, f.rel, definedIn, evidence, maxEdgesLimit);
635
+ }
636
+ }
637
+ }
638
+ }
639
+ // ── Emit edges as memories ─────────────────────────────────────────
640
+ const byEvidence = {};
641
+ let emitted = 0;
642
+ for (const [, ev] of edges) {
643
+ if (emitted >= maxEdgesLimit)
644
+ break;
645
+ const primary = pickPrimaryEvidence(ev.evidences);
646
+ byEvidence[primary] = (byEvidence[primary] ?? 0) + 1;
647
+ const evList = Array.from(ev.evidences).sort().join(" + ");
648
+ repo.insertIfMissing({
649
+ category: CATEGORY,
650
+ subject: `xref:${ev.src}->${ev.tgt}`,
651
+ content: `${ev.src} references ${ev.tgt} (evidence: ${evList}). ` +
652
+ `Cross-reference inferred without a language grammar; treat as a navigation hint.`,
653
+ tags: ["xref", "cross-reference", primary, ...basenameTagsForPair(ev.src, ev.tgt)],
654
+ source: "cross-refs-ingest",
655
+ });
656
+ emitted += 1;
657
+ }
658
+ return {
659
+ filesWalked: allFiles.length,
660
+ definitionsExtracted: totalDefs,
661
+ edgesEmitted: emitted,
662
+ byEvidence,
663
+ };
664
+ }
665
/**
 * Walk the directory tree under `root` and load every file worth scanning.
 *
 * Traversal uses an explicit LIFO stack rather than recursion. Entries are
 * skipped when:
 *   - the name starts with "." (unless it starts with ".github" or ".gitlab"),
 *   - a directory's name is listed in SKIP_DIRS,
 *   - the entry is neither a directory nor a regular file,
 *   - `shouldWalkPath` rejects the name / lower-cased extension,
 *   - the file is empty, larger than MAX_FILE_BYTES, or cannot be stat'ed/read,
 *   - the decoded content contains a NUL character (treated as binary).
 *
 * @param {string} root      Directory to start from.
 * @param {number} maxFiles  Stop once this many files have been collected.
 * @returns {Promise<Array<{abs: string, rel: string, content: string}>>}
 *          `rel` is the path relative to `root`, always "/"-separated.
 */
async function collectFiles(root, maxFiles = MAX_FILES) {
    const collected = [];
    const pendingDirs = [root];
    while (pendingDirs.length > 0 && collected.length < maxFiles) {
        const currentDir = pendingDirs.pop();
        let dirents;
        try {
            dirents = await readdir(currentDir, { withFileTypes: true });
        }
        catch {
            // Unreadable directory: best-effort walk, move on.
            continue;
        }
        for (const dirent of dirents) {
            const { name } = dirent;
            const isAllowedDotEntry = name.startsWith(".github") || name.startsWith(".gitlab");
            if (name.startsWith(".") && !isAllowedDotEntry)
                continue;
            const abs = join(currentDir, name);
            if (dirent.isDirectory()) {
                if (!SKIP_DIRS.has(name))
                    pendingDirs.push(abs);
                continue;
            }
            if (!dirent.isFile())
                continue;
            // Only walk files some later stage acts on (definition patterns,
            // config extensions, the generic-pattern extras, or a known
            // extension-less basename).
            if (!shouldWalkPath(name, extname(name).toLowerCase()))
                continue;
            let content;
            try {
                const info = await stat(abs);
                if (!info.isFile() || info.size === 0 || info.size > MAX_FILE_BYTES)
                    continue;
                content = await readFile(abs, "utf-8");
            }
            catch {
                // Vanished or unreadable between readdir and read: skip it.
                continue;
            }
            if (content.includes("\0"))
                continue; // binary
            collected.push({
                abs,
                rel: relative(root, abs).split(sep).join("/"),
                content,
            });
            if (collected.length >= maxFiles)
                break;
        }
    }
    return collected;
}
721
/**
 * Extension-less filenames that are still worth walking: Docker, Make and
 * Ruby-ecosystem files that other parts of a repo reference by name.
 * Keeping them in the walked set lets the directory-resolution branch of
 * `resolveConfigPath` find them as targets (e.g. Docker Compose
 * `build: services/api` → `services/api/Dockerfile`).
 */
const NO_EXTENSION_BASENAMES = new Set([
    "Dockerfile", "Containerfile",
    "Makefile", "Rakefile", "Gemfile",
    "Vagrantfile", "Procfile", "Brewfile", "Justfile",
]);
737
/**
 * Decide whether a directory entry should be walked.
 *
 * @param {string} basenameStr  File name without directory.
 * @param {string} ext          The file's extension as passed by the caller.
 * @returns {boolean} true when the extension is one we scan, or the name is
 *          one of the known extension-less basenames.
 */
function shouldWalkPath(basenameStr, ext) {
    return shouldWalkExtension(ext) || NO_EXTENSION_BASENAMES.has(basenameStr);
}
744
/**
 * Report whether files with extension `ext` are scanned at all.
 *
 * An extension qualifies when it has an entry in DEFINITION_PATTERNS, is a
 * member of CONFIG_EXTS, or appears in the local list of extensions served
 * by the generic patterns.
 *
 * @param {string} ext  Extension including the leading dot.
 * @returns {boolean}
 */
function shouldWalkExtension(ext) {
    // Languages without a dedicated pattern set; the generic patterns still
    // catch some of their definitions.
    const genericPatternExts = [
        ".java", ".kt", ".kts", ".scala", ".dart", ".cs", ".vb", ".php",
        ".js", ".jsx", ".ts", ".tsx", ".mjs", ".cjs",
        ".py", ".pyi", ".go", ".rs", ".c", ".h", ".cc", ".cpp", ".hpp",
    ];
    return Boolean(DEFINITION_PATTERNS[ext])
        || CONFIG_EXTS.has(ext)
        || genericPatternExts.includes(ext);
}
758
/**
 * Filter out identifiers that are useless as cross-reference evidence.
 *
 * @param {string} s  Candidate identifier.
 * @returns {boolean} false when `s` is shorter than 3 or longer than 50
 *          characters, or when its lower-cased form is one of the very
 *          generic names below (which would produce massive false positives).
 */
function isUsefulIdentifier(s) {
    const tooGeneric = new Set([
        "self", "this", "new", "true", "false", "nil", "null", "none",
        "var", "let", "const", "def", "fn", "func", "function", "return",
        "end", "begin", "if", "else", "for", "while", "do", "in", "of",
        "and", "or", "not", "module", "class", "type", "value", "name",
        "data", "item", "list", "map", "set", "get", "put", "key", "val",
        "main", "init", "run", "test", "spec", "src", "lib", "app",
    ]);
    const lengthOk = s.length >= 3 && s.length <= 50;
    return lengthOk && !tooGeneric.has(s.toLowerCase());
}
772
/* Import-line detection: `lineLooksLikeImport` (defined below) decides
 * whether a single source line looks like an import/require/use statement
 * in ANY language. It is deliberately conservative — false negatives are
 * fine (the filename-coupling signal picks up the slack); false positives
 * cost us in the corroboration gate. */
776
/**
 * Line-comment prefixes per file extension, used to skip pure-comment lines
 * during the mention scan. A line whose trimmed content starts with one of
 * its extension's prefixes is not tokenised. This prevents the false
 * positive where a token such as `alu`, mentioned only in a `; comment` of
 * a Lisp file, filename-couples to `alu.v` and produces a spurious edge.
 *
 * Only extensions whose prefix is unambiguously a comment marker are
 * listed. The check is applied to the start of the trimmed line, so inline
 * comments that follow code are NOT stripped — only comment-only lines are
 * skipped.
 *
 * Shape: `{ [ext: string]: string[] }`, extensions include the leading dot.
 */
const LINE_COMMENT_PREFIXES = (() => {
    // Each row is [prefixes, extensions sharing those prefixes].
    const groups = [
        // ; — Lisp family (the false positive that motivated this table),
        // plus assembly.
        [[";"], [
            ".lisp", ".cl", ".lsp",
            ".rkt", ".scm", ".ss", ".clj", ".cljs",
            ".asm", ".s",
        ]],
        // # — Python, Ruby, Perl, shell, YAML, TOML, R, Tcl, Nim, Crystal, Elixir.
        [["#"], [
            ".py", ".pyi",
            ".rb", ".pl", ".pm",
            ".sh", ".bash", ".zsh",
            ".yml", ".yaml", ".toml",
            ".r", ".tcl", ".nim", ".cr",
            ".ex", ".exs",
        ]],
        // // — C family.
        [["//"], [
            ".js", ".jsx", ".mjs", ".cjs",
            ".ts", ".tsx", ".mts", ".cts",
            ".c", ".h", ".cpp", ".cc", ".cxx", ".hpp", ".hxx",
            ".java", ".kt", ".kts", ".scala",
            ".swift", ".dart", ".cs",
            ".go", ".rs", ".sol", ".d", ".zig",
        ]],
        // PHP supports both.
        [["//", "#"], [".php"]],
        // // — Verilog / SystemVerilog.
        [["//"], [".v", ".sv", ".vh", ".svh"]],
        // -- — SQL, Haskell, Lua, Ada, VHDL.
        [["--"], [
            ".sql", ".hs", ".lhs",
            ".lua", ".adb", ".ads",
            ".vhd", ".vhdl",
        ]],
        // % — Erlang, MATLAB.
        [["%"], [".erl", ".hrl", ".m"]],
        // ' — VB.
        [["'"], [".vb"]],
        // " — vim script.
        [['"'], [".vim"]],
        // // — modern Pascal dialects (FPC/Delphi).
        [["//"], [".pas", ".pp", ".dpr", ".lpr"]],
        // COBOL — *> in free-form, * in fixed-form (col 7); matching at the
        // start of the trimmed line covers both.
        [["*>", "*"], [".cob", ".cbl", ".cpy"]],
        // ! — free-form Fortran (.f90 and later).
        [["!"], [".f90", ".f95", ".f03", ".f08"]],
    ];
    const table = {};
    for (const [prefixes, exts] of groups) {
        for (const ext of exts) {
            // Every extension owns its own array, as in a literal table.
            table[ext] = [...prefixes];
        }
    }
    return table;
})();
830
/**
 * Tell whether `line` consists solely of a line comment for files with
 * extension `ext`.
 *
 * @param {string} line  One source line.
 * @param {string} ext   Extension key into LINE_COMMENT_PREFIXES.
 * @returns {boolean} true when the left-trimmed line is non-empty and starts
 *          with one of the extension's comment prefixes; false for blank
 *          lines and for extensions with no table entry.
 */
function lineIsCommentOnly(line, ext) {
    const markers = LINE_COMMENT_PREFIXES[ext];
    if (!markers)
        return false;
    const body = line.trimStart();
    return body.length > 0 && markers.some((marker) => body.startsWith(marker));
}
843
/**
 * Heuristically decide whether a source line is an import-style statement.
 *
 * The line qualifies when, after optional leading whitespace, it begins
 * with one of the listed verbs (case-insensitive) followed by a word
 * boundary. `alias` covers Elixir's `alias Foo.Bar`; `source` covers
 * shell's `source ./lib.sh` (and Tcl's `source`).
 *
 * @param {string} line  One source line.
 * @returns {boolean}
 */
function lineLooksLikeImport(line) {
    const importVerbAtLineStart = /^\s*(?:import|from|require|require_relative|use|using|include|#\s*include|uses|open|package|extends|implements|alias|source)\b/i;
    return importVerbAtLineStart.test(line);
}
849
/**
 * Map an imported name/path to the project-relative files it may refer to.
 *
 * Several candidate shapes are generated per import; only candidates that
 * are present in `fileSet` are returned, in generation order and without
 * duplicates. Bare names are always attempted — the `fileSet` existence
 * check is what filters out standard-library and external-package names.
 *
 * Examples (with `fromFile = lib/main.py`):
 *   "foo.bar"         → tries "foo/bar.py", "foo/bar/__init__.py", "lib/foo/bar.py", ...
 *   "./user_service"  → tries "lib/user_service.rb", "lib/user_service.py", ...
 *   "../utils"        → tries "utils.rb", "utils.py", ...
 *   "Greeter"         → tries "Greeter.pm", "Greeter.pas", ...
 *
 * @param {string} rawName   Name or path captured from the import statement.
 * @param {string} fromFile  Project-relative path of the importing file.
 * @param {{has(path: string): boolean}} fileSet  Known project-relative paths.
 * @returns {string[]} Existing files among the candidates.
 */
function resolveImportToFiles(rawName, fromFile, fileSet) {
    if (!rawName)
        return [];
    const fromDir = dirname(fromFile);
    const importName = rawName.trim();
    if (!importName)
        return [];
    const sourceExts = [
        ".rb", ".pl", ".pm", ".pas", ".pp", ".lua", ".ex", ".exs", ".erl",
        ".swift", ".kt", ".kts", ".scala", ".dart", ".zig", ".nim",
        ".ml", ".mli", ".fs", ".hs", ".clj", ".vb", ".tcl", ".r",
        ".js", ".jsx", ".ts", ".tsx", ".mjs", ".cjs",
        ".py", ".pyi", ".go", ".rs", ".c", ".h", ".cpp", ".hpp", ".cc",
        ".cs", ".php", ".sh", ".bash",
        "", // shell scripts with no extension
    ];
    // Package/directory entry files (Python, JS/TS, Rust).
    const packageEntryFiles = ["__init__.py", "index.js", "index.ts", "mod.rs", "lib.rs"];
    const toForwardSlashes = (p) => p.replace(/\\/g, "/");
    // All shapes for one base path: every extension, then every entry file.
    const expand = (base) => [
        ...sourceExts.map((ext) => `${base}${ext}`),
        ...packageEntryFiles.map((entry) => `${base}/${entry}`),
    ];
    // Universal fallback — the captured name verbatim, at the project root
    // and as a sibling of the importing file. Catches names whose dot is a
    // file extension rather than a module separator (COBOL
    // `COPY customer.cpy`, Verilog `include "alu.v"`). For dotted module
    // names these simply won't exist, so the fallback is a no-op there.
    const siblingVerbatim = pathJoinNoCollapse(fromDir, importName);
    const candidates = [importName, siblingVerbatim];
    if (importName.startsWith("./") || importName.startsWith("../")) {
        // Explicitly relative: anchor at the importing file's directory.
        candidates.push(...expand(siblingVerbatim), siblingVerbatim);
    }
    else if (importName.includes("/") || importName.includes("\\")) {
        // Path-like but not explicitly relative: try from the project root.
        const fromRoot = toForwardSlashes(importName);
        candidates.push(...expand(fromRoot), fromRoot);
    }
    else {
        // Dotted name (Python/Java/Elixir): foo.bar → foo/bar
        // Colon name (Perl): Foo::Bar → Foo/Bar
        const pathish = importName.replace(/[.:]+/g, "/");
        candidates.push(...expand(pathish));
        // Sibling: lib/main.py importing `helpers` may mean lib/helpers.py.
        candidates.push(...expand(toForwardSlashes(join(fromDir, pathish))));
        // The bare, untranslated name (Perl `use Greeter` → Greeter.pm),
        // first at the root, then next to the importing file.
        const bareNameExts = [".pm", ".pas", ".pp", ".lua"];
        for (const ext of bareNameExts) {
            candidates.push(`${importName}${ext}`);
        }
        for (const ext of bareNameExts) {
            candidates.push(toForwardSlashes(join(fromDir, `${importName}${ext}`)));
        }
    }
    const found = new Set();
    for (const candidate of candidates) {
        const key = candidate.replace(/\\/g, "/").replace(/\/+/g, "/");
        if (fileSet.has(key))
            found.add(key);
    }
    return Array.from(found);
}
943
/**
 * Join `relPath` onto the directory `from`, working purely on "/"-separated
 * segments instead of going through node:path.
 *
 * Segments of `relPath` are applied one by one: empty and "." segments are
 * ignored, ".." removes the last accumulated segment (and is ignored once
 * nothing is left to remove), anything else is appended. Backslashes in
 * `relPath` are treated as separators.
 *
 * Both "" and "." denote the project root for `from`. `path.dirname`
 * returns "." for a root-level file; without this the result would come
 * back as "./name", which never matches keys that are plain root-relative
 * paths such as "name".
 *
 * @param {string} from     "/"-separated directory ("" or "." for the root).
 * @param {string} relPath  Path to apply on top of `from`.
 * @returns {string} The joined "/"-separated path.
 */
function pathJoinNoCollapse(from, relPath) {
    const isRoot = from === "" || from === ".";
    const parts = isRoot ? [] : from.split("/");
    const relParts = relPath.replace(/\\/g, "/").split("/");
    for (const segment of relParts) {
        if (segment === "" || segment === ".")
            continue;
        if (segment === "..") {
            if (parts.length > 0)
                parts.pop();
            continue;
        }
        parts.push(segment);
    }
    return parts.join("/");
}
963
+ /* ── config-string extraction ─────────────────────────────────────── */
964
/**
 * Pull every string out of a config file that might be a path.
 *
 * Over-collection is intentional: callers check candidates against the
 * real file set, so false positives here are cheap.
 *
 * - `.json` / `.jsonc` / `.json5`: parsed with `JSON.parse` (for the latter
 *   two, block comments and whole-line `//` comments are stripped first)
 *   and walked with `walkJson`. If parsing fails, the regex extraction
 *   below is used instead.
 * - Any extension: all double-quoted, then all single-quoted strings of
 *   1–200 characters on one line.
 * - `.yml` / `.yaml` / `.toml` additionally: unquoted `key: value` /
 *   `key = value` scalars, then unquoted `- item` list entries
 *   (e.g. `run: ./scripts/build.sh`, `entrypoint: bin/server`,
 *   Kustomize `resources:` lists). Booleans, null markers, plain numbers
 *   and URLs are dropped.
 *
 * @param {string} content  File text.
 * @param {string} ext      Extension including the leading dot.
 * @returns {string[]} Candidate strings in extraction order.
 */
function extractStringsFromConfig(content, ext) {
    if ([".json", ".jsonc", ".json5"].includes(ext)) {
        // JSON5 is attempted as JSONC; anything unparsable falls through to
        // the regex extractor.
        let jsonText = content;
        if (ext !== ".json") {
            jsonText = content.replace(/\/\*[\s\S]*?\*\//g, "").replace(/^\s*\/\/.*$/gm, "");
        }
        try {
            const fromJson = [];
            walkJson(JSON.parse(jsonText), fromJson);
            return fromJson;
        }
        catch {
            // Best-effort parse failed: use the regex extractor below.
        }
    }
    const strings = [];
    for (const hit of content.matchAll(/"([^"\n]{1,200})"/g)) {
        strings.push(hit[1]);
    }
    for (const hit of content.matchAll(/'([^'\n]{1,200})'/g)) {
        strings.push(hit[1]);
    }
    if ([".yml", ".yaml", ".toml"].includes(ext)) {
        // Values that obviously are not paths: YAML booleans/null markers,
        // plain numbers, URLs.
        const mayBePath = (v) => Boolean(v)
            && !/^(true|false|null|yes|no|on|off|~)$/i.test(v)
            && !/^-?\d+(\.\d+)?$/.test(v)
            && !/^[a-z]+:\/\//i.test(v);
        const unquotedScalarPatterns = [
            // `key: value` (YAML) / `key = value` (TOML): value unquoted,
            // free of whitespace, optionally followed by a comment.
            /^[ \t]*-?[ \t]*[\w\-.]+[ \t]*[:=][ \t]*([^\s#'"`[{][^\s#]*?)[ \t]*(?:#.*)?$/gm,
            // `- item` bare list entries, which the key/value shape misses.
            /^[ \t]*-[ \t]+([^\s#'"`[{][^\s#]*?)[ \t]*(?:#.*)?$/gm,
        ];
        for (const pattern of unquotedScalarPatterns) {
            for (const hit of content.matchAll(pattern)) {
                if (mayBePath(hit[1]))
                    strings.push(hit[1]);
            }
        }
    }
    return strings;
}
1039
/**
 * Depth-first collection of every string value inside a parsed JSON tree.
 *
 * Strings of length 2 to 499 are appended to `out`; shorter and longer
 * ones are ignored. Object keys are never collected and do not influence
 * which values are — only values are visited.
 *
 * @param {*} node         Current JSON value.
 * @param {string[]} out   Accumulator, mutated in place.
 * @returns {void}
 */
function walkJson(node, out) {
    if (node == null)
        return;
    if (typeof node === "string") {
        const usableLength = node.length >= 2 && node.length < 500;
        if (usableLength)
            out.push(node);
        return;
    }
    if (typeof node !== "object")
        return;
    const children = Array.isArray(node) ? node : Object.values(node);
    for (const child of children) {
        walkJson(child, out);
    }
}
1062
/**
 * Map a candidate string from a config file to a project-relative file.
 *
 * The value is tried (a) as a path under the project root and (b) relative
 * to the config file's own directory. For each interpretation:
 *   1. an exact match in `fileSet` wins;
 *   2. otherwise the value is treated as a DIRECTORY and the first
 *      well-known entry file found inside it (Dockerfile, action.yml,
 *      package.json, …) is returned. This covers Docker Compose
 *      `build: services/api`, GitHub composite actions
 *      `uses: ./.github/actions/build`, and similar conventions.
 * A directory without a canonical entry file yields no result — we would
 * rather skip than emit a noisy edge to some arbitrary file.
 *
 * @param {string} value     Candidate string taken from the config file.
 * @param {string} fromFile  Project-relative path of the config file.
 * @param {{has(path: string): boolean}} fileSet  Known project-relative paths.
 * @returns {string|null} The resolved file, or null when nothing matches.
 */
function resolveConfigPath(value, fromFile, fileSet) {
    if (!value || value.length < 2 || value.length > 500)
        return null;
    // Multi-line values are never paths.
    if (value.includes("\n"))
        return null;
    // Skip URLs (any lower-case scheme, which includes http and https).
    if (/^[a-z]+:\/\//.test(value))
        return null;
    const fromDir = dirname(fromFile);
    const normalise = (p) => p.replace(/\\/g, "/").replace(/\/+/g, "/").replace(/^\//, "");
    const candidates = [
        // as-is, relative to the project root
        value.replace(/\\/g, "/").replace(/^\.\//, "").replace(/\/$/, ""),
        // relative to the config file
        pathJoinNoCollapse(fromDir, value).replace(/\/$/, ""),
    ].map(normalise);
    // Direct file match — the typical case: the value names an exact file.
    for (const candidate of candidates) {
        if (fileSet.has(candidate))
            return candidate;
    }
    // Directory-shaped match. No separate "does this directory exist?" scan
    // over the file set is needed: an entry-file probe can only be present
    // if the directory contains a file, so probing the fixed list directly
    // yields the same answer in constant time per candidate.
    const ENTRY_FILE_CANDIDATES = [
        "Dockerfile",
        "action.yml",
        "action.yaml",
        "package.json",
        "Cargo.toml",
        "go.mod",
        "index.js",
        "index.ts",
        "main.py",
        "__init__.py",
        "mod.rs",
        "build.gradle",
        "build.gradle.kts",
        "main.tf",
    ];
    for (const dir of candidates) {
        if (!dir)
            continue;
        for (const entry of ENTRY_FILE_CANDIDATES) {
            const probe = `${dir}/${entry}`;
            if (fileSet.has(probe))
                return probe;
        }
    }
    return null;
}
1142
+ /* ── filename-class coupling ──────────────────────────────────────── */
1143
/**
 * Corroboration signal: does the file's basename plausibly correspond to
 * the identifier? E.g. `user_service.rb` ↔ `UserService`,
 * `MyUnit.pas` ↔ `MyUnit`.
 *
 * The basename (extension removed) is compared as-is and in its camelCase
 * and PascalCase forms; the comparison ignores letter case.
 *
 * @param {string} file        Path of the defining file.
 * @param {string} identifier  Identifier seen in another file.
 * @returns {boolean}
 */
function filenameCouples(file, identifier) {
    const stem = basename(file, extname(file));
    const shapes = [stem, toCamelCase(stem), toPascalCase(stem)];
    const wanted = identifier.toLowerCase();
    // An exact match is also a case-insensitive match, so one pass suffices.
    return shapes.some((shape) => shape.toLowerCase() === wanted);
}
1164
/**
 * Convert a snake_case or kebab-case string to camelCase by dropping each
 * `_` / `-` and upper-casing the word character that follows it.
 *
 * @param {string} s
 * @returns {string}
 */
function toCamelCase(s) {
    const upperAfterSeparator = (_match, following) => following.toUpperCase();
    return s.replace(/[_-](\w)/g, upperAfterSeparator);
}
1167
/**
 * Convert a snake_case or kebab-case string to PascalCase: camel-case it
 * (drop each `_` / `-`, upper-case the following word character), then
 * upper-case the first character.
 *
 * @param {string} s
 * @returns {string}
 */
function toPascalCase(s) {
    const camel = s.replace(/[_-](\w)/g, (_match, following) => following.toUpperCase());
    const head = camel.charAt(0);
    const tail = camel.slice(1);
    return `${head.toUpperCase()}${tail}`;
}
1171
+ /* ── edge bookkeeping ─────────────────────────────────────────────── */
1172
/**
 * Record one piece of evidence for the edge `src → tgt`.
 *
 * Edges are keyed by `src`, a NUL character and `tgt`. The `maxEdges` cap
 * limits how many DISTINCT edges the map may hold: once it is reached, new
 * edges are dropped, but evidence for an edge that is already recorded is
 * still accumulated. Adding evidence does not grow the map, and dropping
 * it could lose the strongest label for that edge.
 *
 * @param {Map<string, {src: string, tgt: string, evidences: Set<string>}>} edges
 *        Edge table, mutated in place.
 * @param {string} src       Referencing file.
 * @param {string} tgt       Referenced file.
 * @param {string} evidence  Evidence label to attach.
 * @param {number} maxEdges  Maximum number of distinct edges.
 * @returns {void}
 */
function addEdge(edges, src, tgt, evidence, maxEdges = MAX_EDGES_TOTAL) {
    const key = `${src}\x00${tgt}`;
    let edge = edges.get(key);
    if (!edge) {
        // The cap only applies to edges that would enlarge the table.
        if (edges.size >= maxEdges)
            return;
        edge = { src, tgt, evidences: new Set() };
        edges.set(key, edge);
    }
    edge.evidences.add(evidence);
}
1183
/**
 * Choose the single evidence label that best characterises an edge.
 *
 * Labels are ranked by confidence, filesystem-grounded signals first.
 * `"import-path"` (a path literal on an import line that resolved to a
 * real file) is ranked with the other filesystem-grounded labels, so it is
 * not outranked by the purely lexical `import+…` / `filename+identifier`
 * signals when an edge carries both.
 *
 * @param {Set<string>} evidences  All labels recorded for the edge.
 * @returns {string} The highest-ranked label present; if none of the
 *          ranked labels is present, the first label in the set; or
 *          "unknown" for an empty set.
 */
function pickPrimaryEvidence(evidences) {
    const byConfidence = [
        "config-path",
        "import-resolved",
        "import-path",
        "import+filename",
        "import+identifier",
        "filename+identifier",
    ];
    const best = byConfidence.find((label) => evidences.has(label));
    if (best !== undefined)
        return best;
    return Array.from(evidences)[0] ?? "unknown";
}
1198
/**
 * Build search tags from the basenames of an edge's two endpoints.
 *
 * Each basename (extension removed) is lower-cased and every run of
 * characters outside `a-z0-9` is replaced by a single "-".
 *
 * @param {string} src  Referencing file.
 * @param {string} tgt  Referenced file.
 * @returns {string[]} The source tag followed by the target tag, omitting
 *          empty tags and omitting the target tag when it equals the
 *          source tag.
 */
function basenameTagsForPair(src, tgt) {
    const slug = (p) => basename(p, extname(p)).toLowerCase().replace(/[^a-z0-9]+/g, "-");
    const srcTag = slug(src);
    const tgtTag = slug(tgt);
    const tags = srcTag ? [srcTag] : [];
    if (tgtTag && tgtTag !== srcTag)
        tags.push(tgtTag);
    return tags;
}