opencode-diane 0.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +180 -0
- package/LICENSE +21 -0
- package/README.md +206 -0
- package/WIKI.md +1430 -0
- package/dist/index.d.ts +28 -0
- package/dist/index.js +1632 -0
- package/dist/ingest/adaptive.d.ts +47 -0
- package/dist/ingest/adaptive.js +182 -0
- package/dist/ingest/code-health.d.ts +58 -0
- package/dist/ingest/code-health.js +202 -0
- package/dist/ingest/code-map.d.ts +71 -0
- package/dist/ingest/code-map.js +670 -0
- package/dist/ingest/cross-refs.d.ts +59 -0
- package/dist/ingest/cross-refs.js +1207 -0
- package/dist/ingest/docs.d.ts +49 -0
- package/dist/ingest/docs.js +325 -0
- package/dist/ingest/git.d.ts +77 -0
- package/dist/ingest/git.js +390 -0
- package/dist/ingest/live-session.d.ts +101 -0
- package/dist/ingest/live-session.js +173 -0
- package/dist/ingest/project-notes.d.ts +28 -0
- package/dist/ingest/project-notes.js +102 -0
- package/dist/ingest/project.d.ts +35 -0
- package/dist/ingest/project.js +430 -0
- package/dist/ingest/session-snapshot.d.ts +63 -0
- package/dist/ingest/session-snapshot.js +94 -0
- package/dist/ingest/sessions.d.ts +29 -0
- package/dist/ingest/sessions.js +164 -0
- package/dist/ingest/tables.d.ts +52 -0
- package/dist/ingest/tables.js +360 -0
- package/dist/mining/skill-miner.d.ts +53 -0
- package/dist/mining/skill-miner.js +234 -0
- package/dist/search/bm25.d.ts +81 -0
- package/dist/search/bm25.js +334 -0
- package/dist/search/e5-embedder.d.ts +30 -0
- package/dist/search/e5-embedder.js +91 -0
- package/dist/search/embed-pass.d.ts +26 -0
- package/dist/search/embed-pass.js +43 -0
- package/dist/search/embedder.d.ts +58 -0
- package/dist/search/embedder.js +85 -0
- package/dist/search/inverted-index.d.ts +51 -0
- package/dist/search/inverted-index.js +139 -0
- package/dist/search/ppr.d.ts +44 -0
- package/dist/search/ppr.js +118 -0
- package/dist/search/tokenize.d.ts +26 -0
- package/dist/search/tokenize.js +98 -0
- package/dist/store/eviction.d.ts +16 -0
- package/dist/store/eviction.js +37 -0
- package/dist/store/repository.d.ts +222 -0
- package/dist/store/repository.js +420 -0
- package/dist/store/sqlite-store.d.ts +89 -0
- package/dist/store/sqlite-store.js +252 -0
- package/dist/store/vector-store.d.ts +66 -0
- package/dist/store/vector-store.js +160 -0
- package/dist/types.d.ts +385 -0
- package/dist/types.js +9 -0
- package/dist/utils/file-log.d.ts +87 -0
- package/dist/utils/file-log.js +215 -0
- package/dist/utils/peer-detection.d.ts +45 -0
- package/dist/utils/peer-detection.js +90 -0
- package/dist/utils/shell.d.ts +43 -0
- package/dist/utils/shell.js +110 -0
- package/dist/utils/usage-skill.d.ts +42 -0
- package/dist/utils/usage-skill.js +129 -0
- package/dist/utils/xlsx.d.ts +36 -0
- package/dist/utils/xlsx.js +270 -0
- package/grammars/tree-sitter-c.wasm +0 -0
- package/grammars/tree-sitter-c_sharp.wasm +0 -0
- package/grammars/tree-sitter-cpp.wasm +0 -0
- package/grammars/tree-sitter-css.wasm +0 -0
- package/grammars/tree-sitter-go.wasm +0 -0
- package/grammars/tree-sitter-html.wasm +0 -0
- package/grammars/tree-sitter-java.wasm +0 -0
- package/grammars/tree-sitter-javascript.wasm +0 -0
- package/grammars/tree-sitter-json.wasm +0 -0
- package/grammars/tree-sitter-php.wasm +0 -0
- package/grammars/tree-sitter-python.wasm +0 -0
- package/grammars/tree-sitter-rust.wasm +0 -0
- package/grammars/tree-sitter-typescript.wasm +0 -0
- package/package.json +80 -0
|
@@ -0,0 +1,1207 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* cross-refs.ts — grammar-agnostic edge discovery between files.
|
|
3
|
+
*
|
|
4
|
+
* The hard part isn't finding connections; the hard part is finding
|
|
5
|
+
* them with FEW false positives. Single-signal heuristics (regex
|
|
6
|
+
* "import" detection, free-text identifier mentions) all have
|
|
7
|
+
* meaningful FP rates that mislead the agent's navigation. This
|
|
8
|
+
* ingester uses MULTI-SIGNAL CORROBORATION:
|
|
9
|
+
*
|
|
10
|
+
* - Filesystem-grounded signals (path-resolves-to-existing-file)
|
|
11
|
+
* emit edges alone — the disk grounds them.
|
|
12
|
+
* - Lexical signals (identifier mention) only emit edges when
|
|
13
|
+
* corroborated by a SECOND orthogonal signal (rarity + import-
|
|
14
|
+
* line context, OR rarity + filename-class coupling).
|
|
15
|
+
*
|
|
16
|
+
* Four passes:
|
|
17
|
+
*
|
|
18
|
+
* 1. Definition extraction (language-keyed regex). Builds a map
|
|
19
|
+
* `identifier → Set<defining-file>`.
|
|
20
|
+
* 2. Import-path resolution. For each file, find import-like lines,
|
|
21
|
+
* try to resolve the named module/path to an actual file under
|
|
22
|
+
* the project root. Existing → edge.
|
|
23
|
+
* 3. Config path-strings. For .json/.yaml/.toml, walk string values
|
|
24
|
+
* and try to resolve them as relative file paths. Existing → edge.
|
|
25
|
+
* 4. Corroborated identifier mentions. Tokenise each file line by
|
|
26
|
+
* line; for tokens that are defined elsewhere AND rarity-gated,
|
|
27
|
+
* emit an edge only when *also* corroborated by either an
|
|
28
|
+
* import-line context or filename-class coupling.
|
|
29
|
+
*
|
|
30
|
+
* Edges with the same (source, target) pair are merged; the evidence
|
|
31
|
+
* list grows but only one memory is written per edge.
|
|
32
|
+
*
|
|
33
|
+
* **What this catches that code-map doesn't:** any cross-file
|
|
34
|
+
* connection in a language tree-sitter doesn't have a grammar for
|
|
35
|
+
* (Ruby, Pascal, Perl, Lua, Elixir, Erlang, Tcl, Nim, Zig, Swift,
|
|
36
|
+
* Kotlin, Scala, Haskell, OCaml, F#, Clojure, VB, …), plus
|
|
37
|
+
* config-style cross-references in JSON/YAML/TOML files.
|
|
38
|
+
*
|
|
39
|
+
* **What it doesn't catch:** dynamic loads (`require(varName)`),
|
|
40
|
+
* reflection-based dispatch, anything where the connection only
|
|
41
|
+
* exists at runtime. Those are out of scope for any static technique.
|
|
42
|
+
*/
|
|
43
|
+
import { readdir, readFile, stat } from "node:fs/promises";
|
|
44
|
+
import { join, relative, sep, extname, dirname, basename } from "node:path";
|
|
45
|
+
const CATEGORY = "project-facts";
|
|
46
|
+
const SKIP_DIRS = new Set([
|
|
47
|
+
"node_modules",
|
|
48
|
+
".git",
|
|
49
|
+
"dist",
|
|
50
|
+
"build",
|
|
51
|
+
"out",
|
|
52
|
+
"target",
|
|
53
|
+
".next",
|
|
54
|
+
"coverage",
|
|
55
|
+
".cache",
|
|
56
|
+
"vendor",
|
|
57
|
+
"tmp",
|
|
58
|
+
"__pycache__",
|
|
59
|
+
]);
|
|
60
|
+
/* ── caps ──────────────────────────────────────────────────────────── */
|
|
61
|
+
const MAX_FILES = 2000;
|
|
62
|
+
const MAX_FILE_BYTES = 256 * 1024;
|
|
63
|
+
const MAX_DEFS_PER_FILE = 200;
|
|
64
|
+
const MAX_TOKENS_PER_FILE = 50_000; // tokenisation guard for huge files
|
|
65
|
+
const MAX_EDGES_TOTAL = 10_000;
|
|
66
|
+
/* ── language-keyed definition patterns ────────────────────────────── */
|
|
67
|
+
/*
|
|
68
|
+
* Each pattern uses /m (multiline ^/$) with one capture group: the
|
|
69
|
+
* identifier name. `g` so we can iterate matches with matchAll.
|
|
70
|
+
*
|
|
71
|
+
* Coverage notes per language. The list is curated by what we observe
|
|
72
|
+
* in real code, not what a language grammar would consider complete.
|
|
73
|
+
*/
|
|
74
|
+
const DEFINITION_PATTERNS = {
|
|
75
|
+
// Ruby — class, module, def. `self.` prefix for class methods kept
|
|
76
|
+
// out of the capture so `UserService.create` mentions match.
|
|
77
|
+
".rb": [
|
|
78
|
+
/^\s*class\s+([A-Z][\w:]*)\b/gm,
|
|
79
|
+
/^\s*module\s+([A-Z][\w:]*)\b/gm,
|
|
80
|
+
/^\s*def\s+(?:self\.)?([\w?!]+)/gm,
|
|
81
|
+
],
|
|
82
|
+
// Pascal / Object Pascal / Delphi. Case-insensitive — Pascal is
|
|
83
|
+
// historically case-insensitive and real code mixes `Procedure`,
|
|
84
|
+
// `procedure`, `PROCEDURE`.
|
|
85
|
+
".pas": [
|
|
86
|
+
/^\s*unit\s+(\w+)\s*;/gim,
|
|
87
|
+
/^\s*procedure\s+(\w+)/gim,
|
|
88
|
+
/^\s*function\s+(\w+)/gim,
|
|
89
|
+
/\btype\s+(\w+)\s*=\s*(?:class|record|interface|object)\b/gim,
|
|
90
|
+
],
|
|
91
|
+
".pp": [
|
|
92
|
+
/^\s*unit\s+(\w+)\s*;/gim,
|
|
93
|
+
/^\s*procedure\s+(\w+)/gim,
|
|
94
|
+
/^\s*function\s+(\w+)/gim,
|
|
95
|
+
/\btype\s+(\w+)\s*=\s*(?:class|record|interface|object)\b/gim,
|
|
96
|
+
],
|
|
97
|
+
".dpr": [/^\s*program\s+(\w+)\s*;/gim],
|
|
98
|
+
// Perl — package, sub. Package names can contain `::` (kept in capture).
|
|
99
|
+
".pl": [
|
|
100
|
+
/^\s*package\s+([\w:]+)\s*;/gm,
|
|
101
|
+
/^\s*sub\s+(\w+)/gm,
|
|
102
|
+
],
|
|
103
|
+
".pm": [
|
|
104
|
+
/^\s*package\s+([\w:]+)\s*;/gm,
|
|
105
|
+
/^\s*sub\s+(\w+)/gm,
|
|
106
|
+
],
|
|
107
|
+
// Lua — function, method, local function. `function Module.name` and
|
|
108
|
+
// `function Module:name` both expose `name`.
|
|
109
|
+
".lua": [
|
|
110
|
+
/^\s*(?:local\s+)?function\s+(\w+)/gm,
|
|
111
|
+
/^\s*function\s+\w+[.:](\w+)/gm,
|
|
112
|
+
/^\s*(\w+)\s*=\s*function\b/gm,
|
|
113
|
+
],
|
|
114
|
+
// Elixir — defmodule (dotted), def/defp/defmacro.
|
|
115
|
+
".ex": [
|
|
116
|
+
/^\s*defmodule\s+([\w.]+)/gm,
|
|
117
|
+
/^\s*defp?\s+(\w+)/gm,
|
|
118
|
+
/^\s*defmacrop?\s+(\w+)/gm,
|
|
119
|
+
],
|
|
120
|
+
".exs": [
|
|
121
|
+
/^\s*defmodule\s+([\w.]+)/gm,
|
|
122
|
+
/^\s*defp?\s+(\w+)/gm,
|
|
123
|
+
],
|
|
124
|
+
// Erlang — module declaration, function clause heads (exported or not).
|
|
125
|
+
".erl": [
|
|
126
|
+
/^-module\(([\w_]+)\)/gm,
|
|
127
|
+
/^([a-z]\w*)\s*\([^)]*\)\s*->/gm, // function clause
|
|
128
|
+
],
|
|
129
|
+
// Swift, Kotlin, Scala, Dart, Zig, Nim — class/func/struct + a few.
|
|
130
|
+
".swift": [
|
|
131
|
+
/^\s*(?:public\s+|private\s+|internal\s+|fileprivate\s+|open\s+)?(?:class|struct|enum|protocol)\s+([A-Z]\w*)\b/gm,
|
|
132
|
+
/^\s*(?:public\s+|private\s+|internal\s+)?func\s+(\w+)/gm,
|
|
133
|
+
],
|
|
134
|
+
".kt": [
|
|
135
|
+
/^\s*(?:public\s+|private\s+|internal\s+|protected\s+|open\s+|abstract\s+)*(?:class|interface|object|enum)\s+([A-Z]\w*)\b/gm,
|
|
136
|
+
/^\s*(?:public\s+|private\s+|internal\s+)?fun\s+(\w+)/gm,
|
|
137
|
+
],
|
|
138
|
+
".kts": [
|
|
139
|
+
/^\s*(?:public\s+|private\s+|internal\s+)*(?:class|interface|object)\s+([A-Z]\w*)\b/gm,
|
|
140
|
+
/^\s*(?:public\s+|private\s+|internal\s+)?fun\s+(\w+)/gm,
|
|
141
|
+
],
|
|
142
|
+
".scala": [
|
|
143
|
+
/^\s*(?:abstract\s+)?(?:class|trait|object|enum)\s+([A-Z]\w*)\b/gm,
|
|
144
|
+
/^\s*def\s+(\w+)/gm,
|
|
145
|
+
],
|
|
146
|
+
".dart": [
|
|
147
|
+
/^\s*(?:abstract\s+)?class\s+([A-Z]\w*)\b/gm,
|
|
148
|
+
/^\s*(?:[A-Z]\w*\s+)?(\w+)\s*\([^)]*\)\s*(?:async\s*)?(?:=>|\{)/gm,
|
|
149
|
+
],
|
|
150
|
+
".zig": [
|
|
151
|
+
/^\s*(?:pub\s+)?fn\s+(\w+)/gm,
|
|
152
|
+
/^\s*(?:pub\s+)?const\s+([A-Z]\w*)\s*=\s*struct\b/gm,
|
|
153
|
+
],
|
|
154
|
+
".nim": [
|
|
155
|
+
/^\s*proc\s+(\w+)/gm,
|
|
156
|
+
/^\s*type\s+(\w+)\b/gm,
|
|
157
|
+
],
|
|
158
|
+
// OCaml / F# / Haskell / Clojure — module-style declarations.
|
|
159
|
+
".ml": [
|
|
160
|
+
/^\s*module\s+([A-Z]\w*)\b/gm,
|
|
161
|
+
/^\s*let\s+(\w+)/gm,
|
|
162
|
+
],
|
|
163
|
+
".mli": [/^\s*module\s+([A-Z]\w*)\b/gm, /^\s*val\s+(\w+)/gm],
|
|
164
|
+
".fs": [
|
|
165
|
+
/^\s*module\s+([A-Z]\w*)\b/gm,
|
|
166
|
+
/^\s*let\s+(\w+)/gm,
|
|
167
|
+
/^\s*type\s+([A-Z]\w*)\b/gm,
|
|
168
|
+
],
|
|
169
|
+
".hs": [
|
|
170
|
+
/^\s*module\s+([A-Z][\w.]*)\s+(?:where|\()/gm,
|
|
171
|
+
/^([a-z]\w*)\s*::/gm, // top-level type signature
|
|
172
|
+
],
|
|
173
|
+
".clj": [/^\s*\(ns\s+([\w.-]+)/gm, /^\s*\(defn-?\s+([\w?!-]+)/gm],
|
|
174
|
+
// Visual Basic, Tcl, R — for completeness.
|
|
175
|
+
".vb": [
|
|
176
|
+
/^\s*(?:Public\s+|Private\s+|Friend\s+)?(?:Class|Module|Interface|Structure)\s+(\w+)/gim,
|
|
177
|
+
/^\s*(?:Public\s+|Private\s+)?(?:Sub|Function)\s+(\w+)/gim,
|
|
178
|
+
],
|
|
179
|
+
".tcl": [/^\s*proc\s+(\w+)/gm],
|
|
180
|
+
".r": [/^\s*(\w+)\s*<-\s*function\b/gm],
|
|
181
|
+
// Shell scripts — function definitions.
|
|
182
|
+
".sh": [
|
|
183
|
+
/^\s*(?:function\s+)?(\w+)\s*\(\s*\)\s*\{/gm,
|
|
184
|
+
],
|
|
185
|
+
".bash": [
|
|
186
|
+
/^\s*(?:function\s+)?(\w+)\s*\(\s*\)\s*\{/gm,
|
|
187
|
+
],
|
|
188
|
+
// ── Crystal — Ruby-family with explicit struct keyword ───────────
|
|
189
|
+
// Same surface as Ruby but adds `struct` and supports `def self.x`.
|
|
190
|
+
// We capture leaf names; the rarity gate filters out generic ones.
|
|
191
|
+
".cr": [
|
|
192
|
+
/^\s*class\s+([A-Z]\w*)/gm,
|
|
193
|
+
/^\s*struct\s+([A-Z]\w*)/gm,
|
|
194
|
+
/^\s*module\s+([A-Z]\w*)/gm,
|
|
195
|
+
/^\s*def\s+(?:self\.)?(\w+[!?=]?)/gm,
|
|
196
|
+
],
|
|
197
|
+
// ── Julia — function / module / struct (immutable + mutable) ─────
|
|
198
|
+
".jl": [
|
|
199
|
+
/^\s*function\s+(\w+)/gm,
|
|
200
|
+
/^\s*module\s+([A-Z]\w*)/gm,
|
|
201
|
+
/^\s*(?:mutable\s+)?struct\s+([A-Z]\w*)/gm,
|
|
202
|
+
/^\s*abstract\s+type\s+([A-Z]\w*)/gm,
|
|
203
|
+
],
|
|
204
|
+
// ── GraphQL — schema types, all the kinds the spec defines ───────
|
|
205
|
+
// SDL is line-oriented for declarations: each definition starts at
|
|
206
|
+
// column 0 with the kind keyword. We cover all six declaration
|
|
207
|
+
// forms. The DSL is a true definition language — types reference
|
|
208
|
+
// each other in field positions (`field: OtherType`), which our
|
|
209
|
+
// mention-based pass picks up downstream.
|
|
210
|
+
".graphql": [
|
|
211
|
+
/^\s*type\s+([A-Z]\w*)/gm,
|
|
212
|
+
/^\s*input\s+([A-Z]\w*)/gm,
|
|
213
|
+
/^\s*interface\s+([A-Z]\w*)/gm,
|
|
214
|
+
/^\s*enum\s+([A-Z]\w*)/gm,
|
|
215
|
+
/^\s*union\s+([A-Z]\w*)\s*=/gm,
|
|
216
|
+
/^\s*scalar\s+([A-Z]\w*)/gm,
|
|
217
|
+
],
|
|
218
|
+
".gql": [
|
|
219
|
+
/^\s*type\s+([A-Z]\w*)/gm,
|
|
220
|
+
/^\s*input\s+([A-Z]\w*)/gm,
|
|
221
|
+
/^\s*interface\s+([A-Z]\w*)/gm,
|
|
222
|
+
/^\s*enum\s+([A-Z]\w*)/gm,
|
|
223
|
+
/^\s*union\s+([A-Z]\w*)\s*=/gm,
|
|
224
|
+
/^\s*scalar\s+([A-Z]\w*)/gm,
|
|
225
|
+
],
|
|
226
|
+
// ── Protocol Buffers — message / service / enum + import paths ───
|
|
227
|
+
// Proto files have `import "other.proto";` lines that the path-
|
|
228
|
+
// resolved-string pass already picks up. Definitions here let the
|
|
229
|
+
// mention-based pass connect proto files that share types.
|
|
230
|
+
".proto": [
|
|
231
|
+
/^\s*message\s+([A-Z]\w*)/gm,
|
|
232
|
+
/^\s*service\s+([A-Z]\w*)/gm,
|
|
233
|
+
/^\s*enum\s+([A-Z]\w*)/gm,
|
|
234
|
+
],
|
|
235
|
+
// ── Thrift — struct / service / exception / enum ─────────────────
|
|
236
|
+
// Like proto but with exception types. Has `include "other.thrift"`
|
|
237
|
+
// import statements that the path pass picks up.
|
|
238
|
+
".thrift": [
|
|
239
|
+
/^\s*struct\s+([A-Z]\w*)/gm,
|
|
240
|
+
/^\s*service\s+([A-Z]\w*)/gm,
|
|
241
|
+
/^\s*enum\s+([A-Z]\w*)/gm,
|
|
242
|
+
/^\s*exception\s+([A-Z]\w*)/gm,
|
|
243
|
+
/^\s*union\s+([A-Z]\w*)/gm,
|
|
244
|
+
],
|
|
245
|
+
// ── Verilog / SystemVerilog ──────────────────────────────────────
|
|
246
|
+
// Verilog modules are lowercase by convention; can't reuse the
|
|
247
|
+
// capital-letter `module` pattern from Ruby/OCaml. Anchored by the
|
|
248
|
+
// trailing `#`, `(`, or `;` that follows a real module header.
|
|
249
|
+
".v": [
|
|
250
|
+
/^\s*module\s+(\w+)\s*[#(;]/gm,
|
|
251
|
+
],
|
|
252
|
+
".sv": [
|
|
253
|
+
/^\s*module\s+(\w+)\s*[#(;]/gm,
|
|
254
|
+
/^\s*(?:virtual\s+)?class\s+(\w+)\s*[#(;:]/gm,
|
|
255
|
+
/^\s*interface\s+(\w+)\s*[#(;]/gm,
|
|
256
|
+
/^\s*package\s+(\w+)\s*;/gm,
|
|
257
|
+
],
|
|
258
|
+
".vh": [/^\s*module\s+(\w+)\s*[#(;]/gm],
|
|
259
|
+
".svh": [
|
|
260
|
+
/^\s*(?:virtual\s+)?class\s+(\w+)\s*[#(;:]/gm,
|
|
261
|
+
/^\s*interface\s+(\w+)\s*[#(;]/gm,
|
|
262
|
+
/^\s*package\s+(\w+)\s*;/gm,
|
|
263
|
+
],
|
|
264
|
+
// ── VHDL ─────────────────────────────────────────────────────────
|
|
265
|
+
// VHDL is case-insensitive (canonically uppercase keywords). The
|
|
266
|
+
// `is`/`of` anchor at the end pins the definition shape and keeps
|
|
267
|
+
// these from matching Ruby/JS `entity` mentions etc.
|
|
268
|
+
".vhd": [
|
|
269
|
+
/^\s*entity\s+(\w+)\s+is\b/gim,
|
|
270
|
+
/^\s*architecture\s+(\w+)\s+of\s+\w+\s+is\b/gim,
|
|
271
|
+
/^\s*package\s+(\w+)\s+is\b/gim,
|
|
272
|
+
/^\s*configuration\s+(\w+)\s+of\b/gim,
|
|
273
|
+
],
|
|
274
|
+
".vhdl": [
|
|
275
|
+
/^\s*entity\s+(\w+)\s+is\b/gim,
|
|
276
|
+
/^\s*architecture\s+(\w+)\s+of\s+\w+\s+is\b/gim,
|
|
277
|
+
/^\s*package\s+(\w+)\s+is\b/gim,
|
|
278
|
+
/^\s*configuration\s+(\w+)\s+of\b/gim,
|
|
279
|
+
],
|
|
280
|
+
// ── COBOL ────────────────────────────────────────────────────────
|
|
281
|
+
// PROGRAM-ID is the canonical identifier of a COBOL program;
|
|
282
|
+
// section / paragraph names are too noisy to capture wholesale.
|
|
283
|
+
// The `\.` terminator after PROGRAM-ID is required by the
|
|
284
|
+
// grammar — keeps the pattern from matching prose.
|
|
285
|
+
".cob": [/^\s*PROGRAM-ID\.\s+([\w-]+)\s*\.?/gim],
|
|
286
|
+
".cbl": [/^\s*PROGRAM-ID\.\s+([\w-]+)\s*\.?/gim],
|
|
287
|
+
".cpy": [/^\s*PROGRAM-ID\.\s+([\w-]+)\s*\.?/gim],
|
|
288
|
+
// ── Fortran (modern: free-form .f90+) ────────────────────────────
|
|
289
|
+
// Fixed-form .f / .for is column-sensitive and not worth the
|
|
290
|
+
// complexity. Modern Fortran is line-anchored and well-served by
|
|
291
|
+
// these patterns.
|
|
292
|
+
".f90": [
|
|
293
|
+
/^\s*subroutine\s+(\w+)/gim,
|
|
294
|
+
/^\s*(?:[\w\s(),:*]+?\s+)?function\s+(\w+)\s*\(/gim,
|
|
295
|
+
/^\s*module\s+(\w+)/gim,
|
|
296
|
+
/^\s*program\s+(\w+)/gim,
|
|
297
|
+
],
|
|
298
|
+
".f95": [
|
|
299
|
+
/^\s*subroutine\s+(\w+)/gim,
|
|
300
|
+
/^\s*(?:[\w\s(),:*]+?\s+)?function\s+(\w+)\s*\(/gim,
|
|
301
|
+
/^\s*module\s+(\w+)/gim,
|
|
302
|
+
/^\s*program\s+(\w+)/gim,
|
|
303
|
+
],
|
|
304
|
+
".f03": [
|
|
305
|
+
/^\s*subroutine\s+(\w+)/gim,
|
|
306
|
+
/^\s*module\s+(\w+)/gim,
|
|
307
|
+
],
|
|
308
|
+
".f08": [
|
|
309
|
+
/^\s*subroutine\s+(\w+)/gim,
|
|
310
|
+
/^\s*module\s+(\w+)/gim,
|
|
311
|
+
],
|
|
312
|
+
// ── Solidity ─────────────────────────────────────────────────────
|
|
313
|
+
// `contract` / `interface` / `library` are the three top-level
|
|
314
|
+
// declaration kinds. Existing `import "..."` pattern already covers
|
|
315
|
+
// Solidity imports; no new import pattern needed.
|
|
316
|
+
".sol": [
|
|
317
|
+
/^\s*(?:abstract\s+)?contract\s+([A-Z]\w*)/gm,
|
|
318
|
+
/^\s*interface\s+([A-Z]\w*)/gm,
|
|
319
|
+
/^\s*library\s+([A-Z]\w*)/gm,
|
|
320
|
+
/^\s*function\s+(\w+)\s*\(/gm,
|
|
321
|
+
],
|
|
322
|
+
// ── Vim script ───────────────────────────────────────────────────
|
|
323
|
+
// `function!` is the redefining form, `function` the strict one;
|
|
324
|
+
// both name a function. Optional scope prefix (`s:`, `g:`, `b:`,
|
|
325
|
+
// `w:`, `t:`) is stripped from the capture so the name is the leaf
|
|
326
|
+
// — matches how a `:call MyFunc()` reference appears.
|
|
327
|
+
".vim": [/^\s*function!?\s+(?:[sgbwt]:)?(\w+)/gm],
|
|
328
|
+
// ── D ───────────────────────────────────────────────────────────
|
|
329
|
+
// D is C-family with a distinct `module` declaration. A dedicated
|
|
330
|
+
// entry replaces the GENERIC fallback, so the class/struct/interface/
|
|
331
|
+
// enum/template pattern lives here too; the module decl ensures `module pkg.thing;` files
|
|
332
|
+
// are picked up. D's `import` is covered by the existing JS-style
|
|
333
|
+
// import pattern.
|
|
334
|
+
".d": [
|
|
335
|
+
/^\s*module\s+([\w.]+)\s*;/gm,
|
|
336
|
+
/^\s*(?:public\s+|private\s+)?(?:class|struct|interface|enum|template)\s+([A-Z]\w*)/gm,
|
|
337
|
+
],
|
|
338
|
+
// ── Smalltalk ────────────────────────────────────────────────────
|
|
339
|
+
// The canonical "class A subclass: #B" form. Other Smalltalk
|
|
340
|
+
// dialects (Pharo class definitions across multiple lines) are
|
|
341
|
+
// out of scope for a regex pass — too easy to FP. This single
|
|
342
|
+
// pattern is high-precision.
|
|
343
|
+
".st": [/^[A-Z][\w]*\s+subclass:\s*#(\w+)/gm],
|
|
344
|
+
// ── Racket / Scheme / Common Lisp ────────────────────────────────
|
|
345
|
+
// Parenthesised forms; we anchor at start of an open paren on the
|
|
346
|
+
// line. Lisp identifiers allow many extra chars (`!`, `?`, `+`,
|
|
347
|
+
// `-`, `*`, `/`, `=`, `<`, `>`). Multi-line definitions are common
|
|
348
|
+
// but the form keyword + name is on the first line — that's what
|
|
349
|
+
// we capture.
|
|
350
|
+
".rkt": [
|
|
351
|
+
/^\s*\(define(?:-struct)?\s+\(?([\w!?+\-*/=<>]+)/gm,
|
|
352
|
+
/^\s*\(provide\s+([\w!?+\-*/=<>]+)/gm,
|
|
353
|
+
],
|
|
354
|
+
".scm": [/^\s*\(define\s+\(?([\w!?+\-*/=<>]+)/gm],
|
|
355
|
+
".ss": [/^\s*\(define\s+\(?([\w!?+\-*/=<>]+)/gm],
|
|
356
|
+
".lisp": [
|
|
357
|
+
/^\s*\(defun\s+([\w!?+\-*/=<>]+)/gm,
|
|
358
|
+
/^\s*\(defmacro\s+([\w!?+\-*/=<>]+)/gm,
|
|
359
|
+
/^\s*\(defclass\s+([\w!?+\-*/=<>]+)/gm,
|
|
360
|
+
/^\s*\(defstruct\s+([\w!?+\-*/=<>]+)/gm,
|
|
361
|
+
/^\s*\(defpackage\s+:?([\w!?+\-*/=<>]+)/gm,
|
|
362
|
+
],
|
|
363
|
+
".cl": [
|
|
364
|
+
/^\s*\(defun\s+([\w!?+\-*/=<>]+)/gm,
|
|
365
|
+
/^\s*\(defclass\s+([\w!?+\-*/=<>]+)/gm,
|
|
366
|
+
],
|
|
367
|
+
// ── Modula-2 ─────────────────────────────────────────────────────
|
|
368
|
+
// Modula-2 keywords are uppercase by convention. The MODULE
|
|
369
|
+
// declaration is the canonical identifier; PROCEDUREs inside are
|
|
370
|
+
// captured too. Niche but the user explicitly asked for languages
|
|
371
|
+
// without reliable tree-sitter grammars.
|
|
372
|
+
".mod": [
|
|
373
|
+
/^\s*MODULE\s+(\w+)\s*;/gm,
|
|
374
|
+
/^\s*PROCEDURE\s+(\w+)/gm,
|
|
375
|
+
],
|
|
376
|
+
".m2": [
|
|
377
|
+
/^\s*MODULE\s+(\w+)\s*;/gm,
|
|
378
|
+
/^\s*PROCEDURE\s+(\w+)/gm,
|
|
379
|
+
],
|
|
380
|
+
// ── Ada ──────────────────────────────────────────────────────────
|
|
381
|
+
// `package Foo is` and `procedure Foo is`. A dedicated `with Foo;`
|
|
382
|
+
// import line pattern would cover Ada imports if we added one;
|
|
383
|
+
// skipped for now — Ada is rare enough that demand will surface
|
|
384
|
+
// the need before we speculate.
|
|
385
|
+
".adb": [
|
|
386
|
+
/^\s*package(?:\s+body)?\s+([\w.]+)\s+is\b/gim,
|
|
387
|
+
/^\s*procedure\s+([\w.]+)/gim,
|
|
388
|
+
],
|
|
389
|
+
".ads": [
|
|
390
|
+
/^\s*package(?:\s+body)?\s+([\w.]+)\s+is\b/gim,
|
|
391
|
+
/^\s*procedure\s+([\w.]+)/gim,
|
|
392
|
+
/^\s*function\s+(\w+)/gim,
|
|
393
|
+
],
|
|
394
|
+
};
|
|
395
|
+
/** Languages we DON'T have a dedicated pattern set for: try a small
|
|
396
|
+
* generic set so we still get something. Conservative — only the
|
|
397
|
+
* most universally-marked forms. */
|
|
398
|
+
const GENERIC_DEFINITION_PATTERNS = [
|
|
399
|
+
/^\s*(?:public\s+|private\s+|protected\s+|abstract\s+|static\s+)*(?:class|interface|trait|enum|struct)\s+([A-Z]\w*)\b/gm,
|
|
400
|
+
/^\s*(?:def|func|fn|function|defn|defmodule)\s+(\w+)/gm,
|
|
401
|
+
];
|
|
402
|
+
/* ── import-line patterns ──────────────────────────────────────────── */
|
|
403
|
+
/*
|
|
404
|
+
* Each pattern captures a string that names a module or file path the
|
|
405
|
+
* importing file depends on. The resolver below tries to map the
|
|
406
|
+
* captured text to an actual file under the project root.
|
|
407
|
+
*/
|
|
408
|
+
const IMPORT_PATTERNS = [
|
|
409
|
+
// Python: `from foo.bar import X` (capture `foo.bar`)
|
|
410
|
+
/^\s*from\s+([\w.]+)\s+import\b/gm,
|
|
411
|
+
// Python / Java / Go / Kotlin: `import foo.bar`; JS/TS: `import x from "foo"`
|
|
412
|
+
/^\s*import\s+(?:[\w*{}\s,]+from\s+)?["']?([\w./@-]+)["']?/gm,
|
|
413
|
+
// Ruby: `require 'foo'` and `require_relative './foo'`
|
|
414
|
+
/^\s*require(?:_relative)?\s+["']([^"']+)["']/gm,
|
|
415
|
+
// Rust / Elixir / Perl / PHP: `use foo::Bar`
|
|
416
|
+
/^\s*use\s+([\w:.]+)/gm,
|
|
417
|
+
// C / C++: `#include "foo.h"` or `<foo.h>`
|
|
418
|
+
/^\s*#\s*include\s+[<"]([^>"]+)[>"]/gm,
|
|
419
|
+
// Pascal: `uses Math, SysUtils, MyUnit;` — multiple names in one line
|
|
420
|
+
/^\s*uses\s+([\w\s,]+);/gim,
|
|
421
|
+
// OCaml / F#: `open Foo`
|
|
422
|
+
/^\s*open\s+([\w.]+)/gm,
|
|
423
|
+
// Lua: `require 'foo'`
|
|
424
|
+
/^\s*(?:local\s+\w+\s*=\s*)?require\s*\(?\s*["']([^"']+)["']/gm,
|
|
425
|
+
// Elixir: `alias MyApp.Module` (`import MyApp.Module` is caught by the generic `import` pattern above)
|
|
426
|
+
/^\s*alias\s+([\w.]+)/gm,
|
|
427
|
+
// Shell: `source path/to/script.sh` and the `.` synonym.
|
|
428
|
+
// The trailing path can be quoted or bare.
|
|
429
|
+
/^\s*(?:source|\.)\s+["']?([^"'\s]+)["']?/gm,
|
|
430
|
+
// Verilog / SystemVerilog: `` `include "file.v" ``
|
|
431
|
+
// Backtick-prefix is unique to Verilog preprocessor directives —
|
|
432
|
+
// can't be confused with any other language. Captures the path.
|
|
433
|
+
/^\s*`include\s+["<]([^>"]+)[>"]/gm,
|
|
434
|
+
// COBOL: `COPY copybook.` or `COPY copybook.cpy.`
|
|
435
|
+
// The trailing period is COBOL's statement terminator — required
|
|
436
|
+
// by the grammar so this won't FP-match prose. Case-insensitive
|
|
437
|
+
// because real COBOL mixes cases despite convention.
|
|
438
|
+
/^\s*COPY\s+["']?([\w.-]+)["']?\s*\./gim,
|
|
439
|
+
// Vim script: `:runtime path/to/file.vim` (`!` is the bang variant)
|
|
440
|
+
// — the editor's analogue of `source` for files under the runtimepath.
|
|
441
|
+
/^\s*:?\s*runtime!?\s+(?:[\w/]+\s+)*([\w./~-]+\.vim)\b/gm,
|
|
442
|
+
];
|
|
443
|
+
/* ── path-resolvable string detection (config files) ──────────────── */
|
|
444
|
+
const CONFIG_EXTS = new Set([
|
|
445
|
+
".json", ".jsonc", ".json5",
|
|
446
|
+
".yml", ".yaml",
|
|
447
|
+
".toml",
|
|
448
|
+
// Terraform / OpenTofu — `source = "../modules/x"` is a path-string
|
|
449
|
+
// reference of exactly the kind this ingester catches. Treated as
|
|
450
|
+
// config (not source) since we don't extract HCL definitions.
|
|
451
|
+
".tf", ".tfvars",
|
|
452
|
+
// Older Unix-style config files — INI sections rarely reference
|
|
453
|
+
// paths but when they do (`include=…`) we want to catch it.
|
|
454
|
+
".ini", ".cfg", ".conf",
|
|
455
|
+
]);
|
|
456
|
+
export async function ingestCrossRefs(repo, root, opts = {}) {
|
|
457
|
+
const rarityThreshold = Math.max(1, Math.round(opts.rarityThreshold ?? 3));
|
|
458
|
+
const maxFilesLimit = Math.max(1, Math.round(opts.maxFiles ?? MAX_FILES));
|
|
459
|
+
const maxEdgesLimit = Math.max(1, Math.round(opts.maxEdges ?? MAX_EDGES_TOTAL));
|
|
460
|
+
// ── Walk: collect all candidate files ──────────────────────────────
|
|
461
|
+
const allFiles = await collectFiles(root, maxFilesLimit);
|
|
462
|
+
// ── Pass 1: definitions per file → identifier → defining-files map ─
|
|
463
|
+
const defs = new Map();
|
|
464
|
+
// identifier → list of (file, line) for filename-coupling resolution
|
|
465
|
+
// (we keep the file; the line isn't needed for the edge logic).
|
|
466
|
+
let totalDefs = 0;
|
|
467
|
+
for (const f of allFiles) {
|
|
468
|
+
const ext = extname(f.rel).toLowerCase();
|
|
469
|
+
const patterns = DEFINITION_PATTERNS[ext] ?? GENERIC_DEFINITION_PATTERNS;
|
|
470
|
+
let perFileDefs = 0;
|
|
471
|
+
for (const pat of patterns) {
|
|
472
|
+
for (const m of f.content.matchAll(pat)) {
|
|
473
|
+
const ident = m[1];
|
|
474
|
+
if (!ident || !isUsefulIdentifier(ident))
|
|
475
|
+
continue;
|
|
476
|
+
if (!defs.has(ident))
|
|
477
|
+
defs.set(ident, new Set());
|
|
478
|
+
const set = defs.get(ident);
|
|
479
|
+
if (!set.has(f.rel)) {
|
|
480
|
+
set.add(f.rel);
|
|
481
|
+
perFileDefs += 1;
|
|
482
|
+
totalDefs += 1;
|
|
483
|
+
}
|
|
484
|
+
// For dotted (Elixir, Python) or double-colon (Perl) namespaced
|
|
485
|
+
// identifiers, ALSO register the trailing segment as defined
|
|
486
|
+
// here. Real-world reference sites use the trailing segment
|
|
487
|
+
// alone after an `alias`/`use` line:
|
|
488
|
+
//
|
|
489
|
+
// defmodule MyApp.Auth do ... ← lib/auth.ex
|
|
490
|
+
// alias MyApp.Auth ← lib/router.ex
|
|
491
|
+
// Auth.verify(token) ← uses bare "Auth"
|
|
492
|
+
//
|
|
493
|
+
// Without this, `Auth` is never in `defs` and the corroboration
|
|
494
|
+
// pass can't connect router.ex → auth.ex via filename coupling.
|
|
495
|
+
// Filename coupling + rarity is what keeps FP low.
|
|
496
|
+
if (/[.:]/.test(ident)) {
|
|
497
|
+
const segments = ident.split(/[.:]+/).filter((s) => s.length > 0);
|
|
498
|
+
const last = segments[segments.length - 1];
|
|
499
|
+
if (last && last !== ident && isUsefulIdentifier(last)) {
|
|
500
|
+
if (!defs.has(last))
|
|
501
|
+
defs.set(last, new Set());
|
|
502
|
+
const set2 = defs.get(last);
|
|
503
|
+
if (!set2.has(f.rel)) {
|
|
504
|
+
set2.add(f.rel);
|
|
505
|
+
totalDefs += 1;
|
|
506
|
+
}
|
|
507
|
+
}
|
|
508
|
+
}
|
|
509
|
+
if (perFileDefs >= MAX_DEFS_PER_FILE)
|
|
510
|
+
break;
|
|
511
|
+
}
|
|
512
|
+
if (perFileDefs >= MAX_DEFS_PER_FILE)
|
|
513
|
+
break;
|
|
514
|
+
}
|
|
515
|
+
}
|
|
516
|
+
// ── Edge accumulator. Same (src, tgt) merges. ──────────────────────
|
|
517
|
+
const edges = new Map();
|
|
518
|
+
const fileSet = new Set(allFiles.map((f) => f.rel));
|
|
519
|
+
// ── Pass 2: import-line path resolution ────────────────────────────
|
|
520
|
+
// The captured module/path is normalised to candidate relative paths
|
|
521
|
+
// and checked against fileSet. Existence is the gate.
|
|
522
|
+
for (const f of allFiles) {
|
|
523
|
+
for (const pat of IMPORT_PATTERNS) {
|
|
524
|
+
for (const m of f.content.matchAll(pat)) {
|
|
525
|
+
const raw = m[1];
|
|
526
|
+
if (!raw)
|
|
527
|
+
continue;
|
|
528
|
+
// Pascal `uses` lists are comma-separated.
|
|
529
|
+
const names = raw.includes(",") ? raw.split(",").map((s) => s.trim()) : [raw.trim()];
|
|
530
|
+
for (const name of names) {
|
|
531
|
+
const candidates = resolveImportToFiles(name, f.rel, fileSet);
|
|
532
|
+
for (const tgt of candidates) {
|
|
533
|
+
if (tgt === f.rel)
|
|
534
|
+
continue;
|
|
535
|
+
addEdge(edges, f.rel, tgt, "import-resolved", maxEdgesLimit);
|
|
536
|
+
}
|
|
537
|
+
}
|
|
538
|
+
}
|
|
539
|
+
}
|
|
540
|
+
}
|
|
541
|
+
// ── Pass 3: config-path-strings ────────────────────────────────────
|
|
542
|
+
for (const f of allFiles) {
|
|
543
|
+
if (!CONFIG_EXTS.has(extname(f.rel).toLowerCase()))
|
|
544
|
+
continue;
|
|
545
|
+
// Best-effort JSON parse for .json / .jsonc / .json5. YAML/TOML
|
|
546
|
+
// get a regex-based string-value extractor — full parsers would
|
|
547
|
+
// add a multi-MB dep for a small precision gain; we already gate
|
|
548
|
+
// on filesystem existence so a mis-extracted string is silently
|
|
549
|
+
// dropped, not a FP.
|
|
550
|
+
const stringValues = extractStringsFromConfig(f.content, extname(f.rel).toLowerCase());
|
|
551
|
+
for (const sv of stringValues) {
|
|
552
|
+
const tgt = resolveConfigPath(sv, f.rel, fileSet);
|
|
553
|
+
if (tgt && tgt !== f.rel) {
|
|
554
|
+
addEdge(edges, f.rel, tgt, "config-path", maxEdgesLimit);
|
|
555
|
+
}
|
|
556
|
+
}
|
|
557
|
+
}
|
|
558
|
+
// ── Pass 4: corroborated identifier mentions ───────────────────────
|
|
559
|
+
// For each file F, tokenise line-by-line. For each token that is
|
|
560
|
+
// defined elsewhere AND is rarity-gated, check the corroboration
|
|
561
|
+
// signals before emitting.
|
|
562
|
+
//
|
|
563
|
+
// Performance: each file is tokenised once. Each token is a hashmap
|
|
564
|
+
// lookup against `defs`. O(N × tokens_per_file).
|
|
565
|
+
for (const f of allFiles) {
|
|
566
|
+
if (edges.size >= maxEdgesLimit)
|
|
567
|
+
break;
|
|
568
|
+
const lines = f.content.split("\n");
|
|
569
|
+
const ext = extname(f.rel).toLowerCase();
|
|
570
|
+
let tokensSeen = 0;
|
|
571
|
+
for (let i = 0; i < lines.length; i++) {
|
|
572
|
+
if (tokensSeen >= MAX_TOKENS_PER_FILE)
|
|
573
|
+
break;
|
|
574
|
+
const line = lines[i];
|
|
575
|
+
// Skip comment-only lines so identifier mentions inside
|
|
576
|
+
// comments don't fire the corroboration gate. The trimmed-line
|
|
577
|
+
// check means inline trailing comments are still kept — we only
|
|
578
|
+
// skip when the WHOLE line is a comment.
|
|
579
|
+
if (lineIsCommentOnly(line, ext))
|
|
580
|
+
continue;
|
|
581
|
+
const isImportLine = lineLooksLikeImport(line);
|
|
582
|
+
// When an import-like line contains a literal path string
|
|
583
|
+
// (e.g. shell `source ./lib.sh`, Ruby `load './script.rb'`,
|
|
584
|
+
// shell `. /etc/foo.sh`), the path is a stronger edge signal
|
|
585
|
+
// than identifier matching — it's filesystem-grounded.
|
|
586
|
+
// Extract any `./…`, `../…`, or `/…` substring from the line,
|
|
587
|
+
// resolve against fileSet, and emit an edge if it lands on a
|
|
588
|
+
// real file. This is INDEPENDENT of the identifier rarity gate
|
|
589
|
+
// because the existence-on-disk check is the gate.
|
|
590
|
+
if (isImportLine) {
|
|
591
|
+
// Two alternatives OR'd together:
|
|
592
|
+
// (1) UNQUOTED leading-slash paths — `source ./lib.sh`,
|
|
593
|
+
// `. /etc/init.d/foo.sh`. Must start with `./`, `../`,
|
|
594
|
+
// or `/` (else any identifier could match).
|
|
595
|
+
// (2) QUOTED paths — `source('lib/stats.R')`,
|
|
596
|
+
// `load "vendor/x.rb"`. Inside `'…'`/`"…"`/backticks,
|
|
597
|
+
// any non-whitespace sequence ending in `.<ext>`
|
|
598
|
+
// qualifies; the filesystem-existence gate grounds it.
|
|
599
|
+
const pathInImport = /(?:^|[\s,(=])(\.{0,2}\/[\w./-]+\.\w{1,8})(?=[\s,;)`'"]|$)|['"`]([^'"`\s\\]{2,200}\.\w{1,8})['"`]/g;
|
|
600
|
+
let pm;
|
|
601
|
+
while ((pm = pathInImport.exec(line)) !== null) {
|
|
602
|
+
const cand = pm[1] ?? pm[2];
|
|
603
|
+
if (!cand)
|
|
604
|
+
continue;
|
|
605
|
+
const resolved = resolveConfigPath(cand, f.rel, fileSet);
|
|
606
|
+
if (resolved && resolved !== f.rel) {
|
|
607
|
+
addEdge(edges, f.rel, resolved, "import-path", maxEdgesLimit);
|
|
608
|
+
}
|
|
609
|
+
}
|
|
610
|
+
}
|
|
611
|
+
// word-shape tokens; skip everything else.
|
|
612
|
+
// Pre-split saves repeated regex work versus matchAll on every iteration.
|
|
613
|
+
const tokens = line.split(/[^\w:]+/).filter((t) => t.length > 0);
|
|
614
|
+
for (const tok of tokens) {
|
|
615
|
+
tokensSeen += 1;
|
|
616
|
+
if (tokensSeen >= MAX_TOKENS_PER_FILE)
|
|
617
|
+
break;
|
|
618
|
+
const definingFiles = defs.get(tok);
|
|
619
|
+
if (!definingFiles || definingFiles.size === 0)
|
|
620
|
+
continue;
|
|
621
|
+
// RARITY GATE: identifier defined in too many files is noise.
|
|
622
|
+
if (definingFiles.size > rarityThreshold)
|
|
623
|
+
continue;
|
|
624
|
+
for (const definedIn of definingFiles) {
|
|
625
|
+
if (definedIn === f.rel)
|
|
626
|
+
continue;
|
|
627
|
+
// CORROBORATION: import-line context OR filename-coupling.
|
|
628
|
+
const couples = filenameCouples(definedIn, tok);
|
|
629
|
+
if (!isImportLine && !couples)
|
|
630
|
+
continue;
|
|
631
|
+
const evidence = isImportLine
|
|
632
|
+
? (couples ? "import+filename" : "import+identifier")
|
|
633
|
+
: "filename+identifier";
|
|
634
|
+
addEdge(edges, f.rel, definedIn, evidence, maxEdgesLimit);
|
|
635
|
+
}
|
|
636
|
+
}
|
|
637
|
+
}
|
|
638
|
+
}
|
|
639
|
+
// ── Emit edges as memories ─────────────────────────────────────────
|
|
640
|
+
const byEvidence = {};
|
|
641
|
+
let emitted = 0;
|
|
642
|
+
for (const [, ev] of edges) {
|
|
643
|
+
if (emitted >= maxEdgesLimit)
|
|
644
|
+
break;
|
|
645
|
+
const primary = pickPrimaryEvidence(ev.evidences);
|
|
646
|
+
byEvidence[primary] = (byEvidence[primary] ?? 0) + 1;
|
|
647
|
+
const evList = Array.from(ev.evidences).sort().join(" + ");
|
|
648
|
+
repo.insertIfMissing({
|
|
649
|
+
category: CATEGORY,
|
|
650
|
+
subject: `xref:${ev.src}->${ev.tgt}`,
|
|
651
|
+
content: `${ev.src} references ${ev.tgt} (evidence: ${evList}). ` +
|
|
652
|
+
`Cross-reference inferred without a language grammar; treat as a navigation hint.`,
|
|
653
|
+
tags: ["xref", "cross-reference", primary, ...basenameTagsForPair(ev.src, ev.tgt)],
|
|
654
|
+
source: "cross-refs-ingest",
|
|
655
|
+
});
|
|
656
|
+
emitted += 1;
|
|
657
|
+
}
|
|
658
|
+
return {
|
|
659
|
+
filesWalked: allFiles.length,
|
|
660
|
+
definitionsExtracted: totalDefs,
|
|
661
|
+
edgesEmitted: emitted,
|
|
662
|
+
byEvidence,
|
|
663
|
+
};
|
|
664
|
+
}
|
|
665
|
+
/**
 * Walk `root` depth-first and collect readable text files.
 *
 * Dot-entries are skipped unless their name starts with `.github` or
 * `.gitlab`; directories listed in `SKIP_DIRS` are not descended into; only
 * regular files accepted by `shouldWalkPath` are read. Empty files, files
 * larger than `MAX_FILE_BYTES`, and files containing a NUL character are
 * dropped. Unreadable directories and files are skipped silently
 * (best-effort walk).
 *
 * @param {string} root - directory to start from.
 * @param {number} [maxFiles=MAX_FILES] - stop once this many files are collected.
 * @returns {Promise<Array<{abs: string, rel: string, content: string}>>}
 *   `rel` is the path relative to `root`, always using `/` separators.
 */
async function collectFiles(root, maxFiles = MAX_FILES) {
    // Returns the file's text, or null when the file must be skipped.
    const readEligibleText = async (absPath) => {
        let info;
        try {
            info = await stat(absPath);
        }
        catch {
            return null;
        }
        if (!info.isFile() || info.size === 0 || info.size > MAX_FILE_BYTES) {
            return null;
        }
        let text;
        try {
            text = await readFile(absPath, "utf-8");
        }
        catch {
            return null;
        }
        // A NUL character marks the file as binary.
        return text.includes("\0") ? null : text;
    };
    const collected = [];
    const pendingDirs = [root];
    while (pendingDirs.length > 0 && collected.length < maxFiles) {
        const currentDir = pendingDirs.pop();
        let dirents;
        try {
            dirents = await readdir(currentDir, { withFileTypes: true });
        }
        catch {
            continue;
        }
        for (const dirent of dirents) {
            const { name } = dirent;
            const isAllowedDotEntry = name.startsWith(".github") || name.startsWith(".gitlab");
            if (name.startsWith(".") && !isAllowedDotEntry) {
                continue;
            }
            const abs = join(currentDir, name);
            if (dirent.isDirectory()) {
                if (!SKIP_DIRS.has(name)) {
                    pendingDirs.push(abs);
                }
                continue;
            }
            // Only walk files we actually do something with (see shouldWalkPath).
            if (!dirent.isFile() || !shouldWalkPath(name, extname(name).toLowerCase())) {
                continue;
            }
            const content = await readEligibleText(abs);
            if (content === null) {
                continue;
            }
            const rel = relative(root, abs).split(sep).join("/");
            collected.push({ abs, rel, content });
            if (collected.length >= maxFiles) {
                break;
            }
        }
    }
    return collected;
}
|
|
721
|
+
/**
 * Extensionless filenames that are still worth walking — Docker, Make and
 * Ruby-ecosystem files that other parts of a repo legitimately reference by
 * name. Recognising them lets the directory-resolution branch of
 * `resolveConfigPath` find them as targets (e.g. Docker Compose
 * `build: services/api` → `services/api/Dockerfile`).
 */
const NO_EXTENSION_BASENAMES = new Set([
    "Dockerfile", "Containerfile", "Makefile",
    "Rakefile", "Gemfile", "Vagrantfile",
    "Procfile", "Brewfile", "Justfile",
]);
/**
 * Decide whether a directory entry should be read during the walk.
 *
 * @param {string} basenameStr - the entry's file name (no directory part).
 * @param {string} ext - the entry's extension as passed by the caller
 *   (`collectFiles` passes the lowercased `extname`).
 * @returns {boolean} true when the extension is walkable or the exact
 *   (case-sensitive) name is one of `NO_EXTENSION_BASENAMES`.
 */
function shouldWalkPath(basenameStr, ext) {
    return shouldWalkExtension(ext) || NO_EXTENSION_BASENAMES.has(basenameStr);
}
|
|
744
|
+
/**
 * Extensions with no dedicated entry required in `DEFINITION_PATTERNS` but
 * for which the generic patterns still catch some definitions. Built once at
 * module load instead of on every call (the check runs per directory entry).
 */
const GENERIC_PATTERN_EXTS = new Set([
    ".java", ".kt", ".kts", ".scala", ".dart", ".cs", ".vb", ".php",
    ".js", ".jsx", ".ts", ".tsx", ".mjs", ".cjs",
    ".py", ".pyi", ".go", ".rs", ".c", ".h", ".cc", ".cpp", ".hpp",
]);
/**
 * Decide whether files with the given extension are worth walking.
 *
 * @param {string} ext - extension including the leading dot (callers in this
 *   file pass it lowercased).
 * @returns {boolean} true when the extension has a definition-pattern set,
 *   is a config extension, or is one of the generic-pattern extras.
 */
function shouldWalkExtension(ext) {
    if (DEFINITION_PATTERNS[ext]) {
        return true;
    }
    return CONFIG_EXTS.has(ext) || GENERIC_PATTERN_EXTS.has(ext);
}
|
|
758
|
+
/**
 * Very generic names that would produce massive false positives if treated
 * as cross-file identifiers. Built once at module load instead of on every
 * call. Entries shorter than three characters are never consulted because
 * `isUsefulIdentifier` rejects those by length first.
 */
const GENERIC_IDENTIFIERS = new Set([
    "self", "this", "new", "true", "false", "nil", "null", "none",
    "var", "let", "const", "def", "fn", "func", "function", "return",
    "end", "begin", "if", "else", "for", "while", "do", "in", "of",
    "and", "or", "not", "module", "class", "type", "value", "name",
    "data", "item", "list", "map", "set", "get", "put", "key", "val",
    "main", "init", "run", "test", "spec", "src", "lib", "app",
]);
/**
 * Decide whether an identifier is specific enough to be useful.
 *
 * @param {string} s - candidate identifier.
 * @returns {boolean} false when `s` is shorter than 3 or longer than 50
 *   characters, or when its lowercased form is a generic name.
 */
function isUsefulIdentifier(s) {
    if (s.length < 3 || s.length > 50) {
        return false;
    }
    return !GENERIC_IDENTIFIERS.has(s.toLowerCase());
}
|
|
772
|
+
/* NOTE: this comment describes `lineLooksLikeImport`, which is defined
 * further below (after `lineIsCommentOnly`). That function decides whether
 * a single source line looks like an import/require/use statement in ANY
 * language. It is conservative — false negatives are fine (the
 * filename-coupling signal picks up the slack); false positives there cost
 * us in the corroboration gate. */
|
|
776
|
+
/**
 * Line-comment prefixes per (lowercased) file extension, used to skip
 * comment-only lines during the mention scan. A line whose trimmed content
 * starts with one of these prefixes is not tokenised — this prevents the
 * false positive where a token like `alu` mentioned in a `; comment` in a
 * Lisp file would filename-couple to `alu.v` and emit a spurious edge.
 *
 * Conservative: only extensions where the prefix is unambiguously a comment
 * marker are listed. Because the check is made against the start of the
 * trimmed line, inline comments that follow code are NOT stripped; only
 * whole-line comments are skipped.
 *
 * The table is written as (prefixes, extensions) groups and expanded into a
 * plain `{ ext: prefixes[] }` object; every extension gets its own array.
 */
const LINE_COMMENT_PREFIXES = Object.fromEntries([
    // Lisp family — the false positive that motivated this table.
    [[";"], [".lisp", ".cl", ".lsp", ".rkt", ".scm", ".ss", ".clj", ".cljs"]],
    // `;` is also a comment marker in assembly.
    [[";"], [".asm", ".s"]],
    // `#` — Python, Ruby, Perl, shell, YAML, TOML, R, Tcl, Nim, Crystal, Elixir.
    [["#"], [
        ".py", ".pyi", ".rb", ".pl", ".pm", ".sh", ".bash", ".zsh",
        ".yml", ".yaml", ".toml", ".r", ".tcl", ".nim", ".cr", ".ex", ".exs",
    ]],
    // C family — only the `//` line form is listed.
    [["//"], [
        ".js", ".jsx", ".mjs", ".cjs", ".ts", ".tsx", ".mts", ".cts",
        ".c", ".h", ".cpp", ".cc", ".cxx", ".hpp", ".hxx",
        ".java", ".kt", ".kts", ".scala", ".swift", ".dart", ".cs",
        ".go", ".rs", ".sol", ".d", ".zig",
    ]],
    // PHP supports both `//` and `#`.
    [["//", "#"], [".php"]],
    // Verilog / SystemVerilog.
    [["//"], [".v", ".sv", ".vh", ".svh"]],
    // `--` in SQL, Haskell, Lua, Ada, VHDL.
    [["--"], [".sql", ".hs", ".lhs", ".lua", ".adb", ".ads", ".vhd", ".vhdl"]],
    // `%` in Erlang, MATLAB.
    [["%"], [".erl", ".hrl", ".m"]],
    // `'` in VB.
    [["'"], [".vb"]],
    // `"` in vim script.
    [['"'], [".vim"]],
    // `//` in modern Pascal dialects (FPC/Delphi).
    [["//"], [".pas", ".pp", ".dpr", ".lpr"]],
    // COBOL — `*>` in free-form, `*` in fixed-form; checking the trimmed
    // line start against both covers the two layouts.
    [["*>", "*"], [".cob", ".cbl", ".cpy"]],
    // Fortran — `!` for free-form .f90 and later.
    [["!"], [".f90", ".f95", ".f03", ".f08"]],
].flatMap(([prefixes, exts]) => exts.map((ext) => [ext, [...prefixes]])));
|
|
830
|
+
/**
 * Report whether a whole line is a line comment for the given extension.
 *
 * @param {string} line - one source line.
 * @param {string} ext - file extension used to look up
 *   `LINE_COMMENT_PREFIXES` (callers in this file pass it lowercased).
 * @returns {boolean} true only when the extension has known prefixes and
 *   the line, after leading whitespace is removed, starts with one of them.
 *   Blank lines and unknown extensions yield false.
 */
function lineIsCommentOnly(line, ext) {
    const markers = LINE_COMMENT_PREFIXES[ext];
    if (!markers) {
        return false;
    }
    const body = line.trimStart();
    return body.length > 0 && markers.some((marker) => body.startsWith(marker));
}
|
|
843
|
+
/**
 * Decide whether a source line starts with an import-style verb, in any of
 * the languages this module handles.
 *
 * The match is case-insensitive and anchored at the start of the line
 * (leading whitespace allowed). `alias` covers Elixir's `alias Foo.Bar`;
 * `source` covers shell's `source ./lib.sh` (and Tcl's `source`).
 *
 * @param {string} line - one source line.
 * @returns {boolean}
 */
function lineLooksLikeImport(line) {
    const importVerbAtLineStart = /^\s*(?:import|from|require|require_relative|use|using|include|#\s*include|uses|open|package|extends|implements|alias|source)\b/i;
    return importVerbAtLineStart.test(line);
}
|
|
849
|
+
/**
 * Map an imported name/path to zero or more relative file paths under the
 * project root. Several candidate shapes are generated per import; only
 * those present in `fileSet` are returned, in candidate order, without
 * duplicates.
 *
 * Examples of candidates tried (with `fromFile = lib/main.py`):
 *   "foo.bar"        → "foo/bar.py", "foo/bar/__init__.py", "lib/foo/bar.py", "lib/foo/bar/__init__.py", ...
 *   "./user_service" → "lib/user_service.rb", "lib/user_service.py", ...
 *   "../utils"       → "utils.rb", "utils.py", ...
 *   "Greeter"        → "Greeter.pm", "Greeter.pas", "lib/Greeter.pm", ...
 *
 * @param {string} rawName - the captured import name or path.
 * @param {string} fromFile - root-relative path of the importing file.
 * @param {Set<string>} fileSet - root-relative, `/`-separated known files.
 * @returns {string[]} matching entries of `fileSet`.
 */
function resolveImportToFiles(rawName, fromFile, fileSet) {
    if (!rawName) {
        return [];
    }
    const importName = rawName.trim();
    if (!importName) {
        return [];
    }
    // dirname() reports "." for a file at the project root. fileSet keys
    // carry no "./" prefix, so use "" there; otherwise every candidate built
    // through pathJoinNoCollapse would start with "./" and never match.
    const parentDir = dirname(fromFile);
    const fromDir = parentDir === "." ? "" : parentDir;
    const toPosix = (p) => p.replace(/\\/g, "/");
    // No stdlib/external-package filtering is attempted: relative paths
    // always attempt resolution, bare names attempt it too, and the
    // existence check against fileSet filters spurious matches.
    const candidates = [];
    const exts = [
        ".rb", ".pl", ".pm", ".pas", ".pp", ".lua", ".ex", ".exs", ".erl",
        ".swift", ".kt", ".kts", ".scala", ".dart", ".zig", ".nim",
        ".ml", ".mli", ".fs", ".hs", ".clj", ".vb", ".tcl", ".r",
        ".js", ".jsx", ".ts", ".tsx", ".mjs", ".cjs",
        ".py", ".pyi", ".go", ".rs", ".c", ".h", ".cpp", ".hpp", ".cc",
        ".cs", ".php", ".sh", ".bash",
        "", // shell scripts with no extension
    ];
    // Package/directory entry files (Python, JS/TS, Rust).
    const packageEntries = ["__init__.py", "index.js", "index.ts", "mod.rs", "lib.rs"];
    const tryWithExts = (base) => {
        for (const ext of exts) {
            candidates.push(`${base}${ext}`);
        }
        for (const entry of packageEntries) {
            candidates.push(`${base}/${entry}`);
        }
    };
    // Universal fallback — try the captured name verbatim as a filename,
    // both at the project root and as a sibling of the importing file.
    // Catches names whose dot is a file extension rather than a module
    // separator (COBOL `COPY customer.cpy`, Verilog `include "alu.v"`).
    // For dotted module names these candidates simply won't exist.
    candidates.push(importName);
    candidates.push(pathJoinNoCollapse(fromDir, importName));
    if (importName.startsWith("./") || importName.startsWith("../")) {
        // Explicitly relative import: resolve against the importing file.
        const resolved = pathJoinNoCollapse(fromDir, importName);
        tryWithExts(resolved);
        candidates.push(resolved);
    }
    else if (importName.includes("/") || importName.includes("\\")) {
        // Path-like but not explicitly relative — try from the project root.
        const rootRelative = toPosix(importName);
        tryWithExts(rootRelative);
        candidates.push(rootRelative);
    }
    else {
        // Dotted name (Python/Java/Elixir): foo.bar → foo/bar
        // Colon name (Perl): Foo::Bar → Foo/Bar
        const pathish = importName.replace(/[.:]+/g, "/");
        tryWithExts(pathish);
        // Sibling: lib/main.py importing `helpers` may mean lib/helpers.py.
        tryWithExts(toPosix(join(fromDir, pathish)));
        // Bare name kept verbatim (Perl `use Greeter` → Greeter.pm), tried
        // at the project root and next to the importing file only.
        const bareExts = [".pm", ".pas", ".pp", ".lua"];
        for (const ext of bareExts) {
            candidates.push(`${importName}${ext}`);
        }
        for (const ext of bareExts) {
            candidates.push(toPosix(join(fromDir, `${importName}${ext}`)));
        }
    }
    const found = new Set();
    for (const candidate of candidates) {
        const key = toPosix(candidate).replace(/\/+/g, "/");
        if (fileSet.has(key)) {
            found.add(key);
        }
    }
    return Array.from(found);
}
|
|
943
|
+
/**
 * Join a `/`-separated base directory and a relative path purely by
 * segment manipulation, without calling node:path.
 *
 * Backslashes in `relPath` are treated as `/`. Empty and `.` segments of
 * `relPath` are dropped; each `..` removes the last accumulated segment,
 * and is ignored when nothing is left to remove (the result never climbs
 * above the base). A base of `""` or `"."` (what `path.dirname` returns for
 * a top-level file), and any leading `./` on the base, contribute no
 * segments, so the result never starts with `./`.
 *
 * @param {string} from - base directory, `/`-separated.
 * @param {string} relPath - path to apply on top of `from`.
 * @returns {string} the joined path, `/`-separated.
 */
function pathJoinNoCollapse(from, relPath) {
    const base = from.replace(/^(?:\.\/)+/, "");
    const parts = base === "" || base === "." ? [] : base.split("/");
    const relParts = relPath.replace(/\\/g, "/").split("/");
    for (const segment of relParts) {
        if (segment === "" || segment === ".") {
            continue;
        }
        if (segment === "..") {
            // Silently clamp at the base rather than escaping it.
            if (parts.length > 0) {
                parts.pop();
            }
            continue;
        }
        parts.push(segment);
    }
    return parts.join("/");
}
|
|
963
|
+
/* ── config-string extraction ─────────────────────────────────────── */
|
|
964
|
+
/**
 * Extract candidate path strings from a config file's text.
 *
 * - `.json` / `.jsonc` / `.json5`: best-effort `JSON.parse` (for the latter
 *   two, block comments and whole-line `//` comments are stripped first);
 *   on success the string values collected by `walkJson` are returned.
 *   On parse failure the regex extraction below is used instead.
 * - Any extension: every double-quoted and single-quoted run of 1–200
 *   characters on one line.
 * - `.yml` / `.yaml` / `.toml`: additionally, unquoted `key: value` /
 *   `key = value` scalars and bare `- item` list entries, minus values that
 *   are obviously not paths (booleans/null markers, plain numbers, URLs).
 *
 * False positives are expected; callers filter by filesystem existence.
 *
 * @param {string} content - file text.
 * @param {string} ext - lowercased extension including the dot.
 * @returns {string[]} extracted strings, in extraction order.
 */
function extractStringsFromConfig(content, ext) {
    if (ext === ".json" || ext === ".jsonc" || ext === ".json5") {
        const jsonText = ext === ".json"
            ? content
            : content.replace(/\/\*[\s\S]*?\*\//g, "").replace(/^\s*\/\/.*$/gm, "");
        try {
            const collected = [];
            walkJson(JSON.parse(jsonText), collected);
            return collected;
        }
        catch {
            // Not parseable as JSON — fall through to the regex extractor.
        }
    }
    const out = [];
    for (const m of content.matchAll(/"([^"\n]{1,200})"/g)) {
        out.push(m[1]);
    }
    for (const m of content.matchAll(/'([^'\n]{1,200})'/g)) {
        out.push(m[1]);
    }
    if (ext !== ".yml" && ext !== ".yaml" && ext !== ".toml") {
        return out;
    }
    // Shared filter for unquoted values: reject pure booleans / null
    // markers, pure numbers, and URLs.
    const isCandidateScalar = (v) => Boolean(v)
        && !/^(true|false|null|yes|no|on|off|~)$/i.test(v)
        && !/^-?\d+(\.\d+)?$/.test(v)
        && !/^[a-z]+:\/\//i.test(v);
    const unquotedPatterns = [
        // `key: value` (YAML) / `key = value` (TOML) with an unquoted,
        // whitespace-free value, e.g. `run: ./scripts/build.sh`.
        /^[ \t]*-?[ \t]*[\w\-.]+[ \t]*[:=][ \t]*([^\s#'"`[{][^\s#]*?)[ \t]*(?:#.*)?$/gm,
        // YAML bare list items, e.g. `  - deployment.yaml`, which the
        // key/value pattern above does not match.
        /^[ \t]*-[ \t]+([^\s#'"`[{][^\s#]*?)[ \t]*(?:#.*)?$/gm,
    ];
    for (const pattern of unquotedPatterns) {
        for (const m of content.matchAll(pattern)) {
            if (isCandidateScalar(m[1])) {
                out.push(m[1]);
            }
        }
    }
    return out;
}
|
|
1039
|
+
/**
 * Recursively collect string values from a parsed JSON value.
 *
 * Strings of length 2–499 are appended to `out`; arrays and objects are
 * descended into (object keys are ignored — only values are visited);
 * every other value is skipped.
 *
 * @param {*} node - parsed JSON value.
 * @param {string[]} out - accumulator, mutated in place.
 * @returns {void}
 */
function walkJson(node, out) {
    if (typeof node === "string") {
        if (node.length >= 2 && node.length < 500) {
            out.push(node);
        }
        return;
    }
    if (node === null || typeof node !== "object") {
        return;
    }
    const children = Array.isArray(node) ? node : Object.values(node);
    for (const child of children) {
        walkJson(child, out);
    }
}
|
|
1062
|
+
/**
 * Well-known entry-point filenames probed, in this order, when a config
 * value points at a directory instead of a file.
 */
const CONFIG_DIR_ENTRY_FILES = [
    "Dockerfile",
    "action.yml",
    "action.yaml",
    "package.json",
    "Cargo.toml",
    "go.mod",
    "index.js",
    "index.ts",
    "main.py",
    "__init__.py",
    "mod.rs",
    "build.gradle",
    "build.gradle.kts",
    "main.tf",
];
/**
 * Map a candidate string from a config file to a relative path under the
 * project root.
 *
 * Two interpretations are tried, in order: (a) the value as a path from
 * the project root, (b) the value relative to the config file's directory.
 * A direct hit in `fileSet` wins. Otherwise the value is treated as a
 * directory (Docker Compose `build: services/api`, GitHub composite actions
 * `uses: ./.github/actions/build`, …) and the first well-known entry file
 * found inside it is returned. A directory without such an entry file
 * yields null — no edge is fabricated to an arbitrary file.
 *
 * @param {string} value - candidate string (2–500 chars, single line, not a URL).
 * @param {string} fromFile - root-relative path of the config file.
 * @param {Set<string>} fileSet - root-relative, `/`-separated known files.
 * @returns {string|null} the matching `fileSet` entry, or null.
 */
function resolveConfigPath(value, fromFile, fileSet) {
    if (!value || value.length < 2 || value.length > 500) {
        return null;
    }
    if (value.includes("\n")) {
        return null;
    }
    // Skip URLs (any lowercase `scheme://`, which includes http/https).
    if (/^[a-z]+:\/\//.test(value)) {
        return null;
    }
    // dirname() reports "." for a config at the project root; fileSet keys
    // carry no "./" prefix, so use an empty base there.
    const parentDir = dirname(fromFile);
    const fromDir = parentDir === "." ? "" : parentDir;
    const normalise = (p) => p.replace(/\\/g, "/").replace(/\/+/g, "/").replace(/^\//, "");
    const candidates = [
        // as-is, relative to the project root
        value.replace(/\\/g, "/").replace(/^\.\//, "").replace(/\/$/, ""),
        // relative to the config file
        pathJoinNoCollapse(fromDir, value).replace(/\/$/, ""),
    ].map(normalise);
    // Direct file match — the typical case.
    for (const candidate of candidates) {
        if (fileSet.has(candidate)) {
            return candidate;
        }
    }
    // Directory-shaped match. Probing the entry files directly is enough:
    // a hit proves the directory exists, and a directory without any entry
    // file produces no edge, so no scan over fileSet is needed.
    for (const dir of candidates) {
        if (!dir) {
            continue;
        }
        for (const entry of CONFIG_DIR_ENTRY_FILES) {
            const probe = `${dir}/${entry}`;
            if (fileSet.has(probe)) {
                return probe;
            }
        }
    }
    return null;
}
|
|
1142
|
+
/* ── filename-class coupling ──────────────────────────────────────── */
|
|
1143
|
+
/**
 * Report whether a file's basename (extension removed) plausibly
 * corresponds to an identifier — `user_service.rb` ↔ `UserService`,
 * `MyUnit.pas` ↔ `MyUnit`. The basename is compared as-is and in its
 * camelCase and PascalCase forms, ignoring letter case. This is a
 * corroboration signal, not the only signal.
 *
 * @param {string} file - file path.
 * @param {string} identifier - identifier seen in another file.
 * @returns {boolean}
 */
function filenameCouples(file, identifier) {
    const stem = basename(file, extname(file));
    const wanted = identifier.toLowerCase();
    // An exact match is also a case-insensitive match, so a single
    // lowercased comparison covers both.
    return [stem, toCamelCase(stem), toPascalCase(stem)]
        .some((form) => form.toLowerCase() === wanted);
}
|
|
1164
|
+
/**
 * Convert a snake_case or kebab-case string to camelCase: each `_` or `-`
 * that is followed by a word character is removed and that character is
 * upper-cased. The first character is left untouched.
 *
 * @param {string} s
 * @returns {string}
 */
function toCamelCase(s) {
    const upperCaseFollower = (_match, follower) => follower.toUpperCase();
    return s.replace(/[_-](\w)/g, upperCaseFollower);
}
|
|
1167
|
+
/**
 * Convert a snake_case or kebab-case string to PascalCase: the camelCase
 * form with its first character upper-cased.
 *
 * @param {string} s
 * @returns {string}
 */
function toPascalCase(s) {
    const camel = toCamelCase(s);
    return `${camel.slice(0, 1).toUpperCase()}${camel.slice(1)}`;
}
|
|
1171
|
+
/* ── edge bookkeeping ─────────────────────────────────────────────── */
|
|
1172
|
+
/**
 * Record one piece of evidence for the edge `src → tgt`.
 *
 * Edges are keyed by `src` and `tgt` joined with a NUL character. Once the
 * map already holds `maxEdges` entries the call is a no-op — this applies
 * to new edges and to extra evidence for existing edges alike.
 *
 * @param {Map<string, {src: string, tgt: string, evidences: Set<string>}>} edges
 *   edge accumulator, mutated in place.
 * @param {string} src - referencing file.
 * @param {string} tgt - referenced file.
 * @param {string} evidence - evidence label.
 * @param {number} [maxEdges=MAX_EDGES_TOTAL] - cap on the number of edges.
 * @returns {void}
 */
function addEdge(edges, src, tgt, evidence, maxEdges = MAX_EDGES_TOTAL) {
    if (edges.size >= maxEdges) {
        return;
    }
    const key = `${src}\x00${tgt}`;
    const existing = edges.get(key);
    if (existing) {
        existing.evidences.add(evidence);
        return;
    }
    edges.set(key, { src, tgt, evidences: new Set([evidence]) });
}
|
|
1183
|
+
/**
 * Pick the single most trustworthy evidence label for an edge.
 *
 * Labels are ranked by confidence, filesystem-grounded signals first.
 * "import-path" (a literal path on an import-like line that resolved to a
 * real file) is filesystem-grounded and therefore ranks with the other
 * path-based signals, above the identifier-based ones. Labels not in the
 * ranking are only chosen when no ranked label is present.
 *
 * @param {Set<string>} evidences - evidence labels collected for one edge.
 * @returns {string} the highest-ranked label present, else the first label
 *   in the set, else "unknown" for an empty set.
 */
function pickPrimaryEvidence(evidences) {
    const order = [
        "config-path",
        "import-resolved",
        "import-path",
        "import+filename",
        "import+identifier",
        "filename+identifier",
    ];
    const ranked = order.find((label) => evidences.has(label));
    if (ranked !== undefined) {
        return ranked;
    }
    return Array.from(evidences)[0] ?? "unknown";
}
|
|
1198
|
+
/**
 * Build search tags from the basenames of an edge's two endpoints.
 *
 * Each basename (extension removed) is lower-cased and every run of
 * characters outside `a-z0-9` becomes a single `-`. The source tag comes
 * first; the target tag is added only when it differs from the source tag.
 * Empty tags are omitted.
 *
 * @param {string} src - referencing file path.
 * @param {string} tgt - referenced file path.
 * @returns {string[]} zero, one or two tags.
 */
function basenameTagsForPair(src, tgt) {
    const toTag = (file) => basename(file, extname(file))
        .toLowerCase()
        .replace(/[^a-z0-9]+/g, "-");
    const srcTag = toTag(src);
    const tgtTag = toTag(tgt);
    const tags = [];
    if (srcTag) {
        tags.push(srcTag);
    }
    if (tgtTag && tgtTag !== srcTag) {
        tags.push(tgtTag);
    }
    return tags;
}
|