opencode-diane 0.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +180 -0
- package/LICENSE +21 -0
- package/README.md +206 -0
- package/WIKI.md +1430 -0
- package/dist/index.d.ts +28 -0
- package/dist/index.js +1632 -0
- package/dist/ingest/adaptive.d.ts +47 -0
- package/dist/ingest/adaptive.js +182 -0
- package/dist/ingest/code-health.d.ts +58 -0
- package/dist/ingest/code-health.js +202 -0
- package/dist/ingest/code-map.d.ts +71 -0
- package/dist/ingest/code-map.js +670 -0
- package/dist/ingest/cross-refs.d.ts +59 -0
- package/dist/ingest/cross-refs.js +1207 -0
- package/dist/ingest/docs.d.ts +49 -0
- package/dist/ingest/docs.js +325 -0
- package/dist/ingest/git.d.ts +77 -0
- package/dist/ingest/git.js +390 -0
- package/dist/ingest/live-session.d.ts +101 -0
- package/dist/ingest/live-session.js +173 -0
- package/dist/ingest/project-notes.d.ts +28 -0
- package/dist/ingest/project-notes.js +102 -0
- package/dist/ingest/project.d.ts +35 -0
- package/dist/ingest/project.js +430 -0
- package/dist/ingest/session-snapshot.d.ts +63 -0
- package/dist/ingest/session-snapshot.js +94 -0
- package/dist/ingest/sessions.d.ts +29 -0
- package/dist/ingest/sessions.js +164 -0
- package/dist/ingest/tables.d.ts +52 -0
- package/dist/ingest/tables.js +360 -0
- package/dist/mining/skill-miner.d.ts +53 -0
- package/dist/mining/skill-miner.js +234 -0
- package/dist/search/bm25.d.ts +81 -0
- package/dist/search/bm25.js +334 -0
- package/dist/search/e5-embedder.d.ts +30 -0
- package/dist/search/e5-embedder.js +91 -0
- package/dist/search/embed-pass.d.ts +26 -0
- package/dist/search/embed-pass.js +43 -0
- package/dist/search/embedder.d.ts +58 -0
- package/dist/search/embedder.js +85 -0
- package/dist/search/inverted-index.d.ts +51 -0
- package/dist/search/inverted-index.js +139 -0
- package/dist/search/ppr.d.ts +44 -0
- package/dist/search/ppr.js +118 -0
- package/dist/search/tokenize.d.ts +26 -0
- package/dist/search/tokenize.js +98 -0
- package/dist/store/eviction.d.ts +16 -0
- package/dist/store/eviction.js +37 -0
- package/dist/store/repository.d.ts +222 -0
- package/dist/store/repository.js +420 -0
- package/dist/store/sqlite-store.d.ts +89 -0
- package/dist/store/sqlite-store.js +252 -0
- package/dist/store/vector-store.d.ts +66 -0
- package/dist/store/vector-store.js +160 -0
- package/dist/types.d.ts +385 -0
- package/dist/types.js +9 -0
- package/dist/utils/file-log.d.ts +87 -0
- package/dist/utils/file-log.js +215 -0
- package/dist/utils/peer-detection.d.ts +45 -0
- package/dist/utils/peer-detection.js +90 -0
- package/dist/utils/shell.d.ts +43 -0
- package/dist/utils/shell.js +110 -0
- package/dist/utils/usage-skill.d.ts +42 -0
- package/dist/utils/usage-skill.js +129 -0
- package/dist/utils/xlsx.d.ts +36 -0
- package/dist/utils/xlsx.js +270 -0
- package/grammars/tree-sitter-c.wasm +0 -0
- package/grammars/tree-sitter-c_sharp.wasm +0 -0
- package/grammars/tree-sitter-cpp.wasm +0 -0
- package/grammars/tree-sitter-css.wasm +0 -0
- package/grammars/tree-sitter-go.wasm +0 -0
- package/grammars/tree-sitter-html.wasm +0 -0
- package/grammars/tree-sitter-java.wasm +0 -0
- package/grammars/tree-sitter-javascript.wasm +0 -0
- package/grammars/tree-sitter-json.wasm +0 -0
- package/grammars/tree-sitter-php.wasm +0 -0
- package/grammars/tree-sitter-python.wasm +0 -0
- package/grammars/tree-sitter-rust.wasm +0 -0
- package/grammars/tree-sitter-typescript.wasm +0 -0
- package/package.json +80 -0
|
@@ -0,0 +1,670 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Code-map ingestion — an Aider-style "repo map": for each source
|
|
3
|
+
* file, the *signatures* of its top-level definitions (functions,
|
|
4
|
+
* classes, methods, types) with the bodies stripped. The agent gets
|
|
5
|
+
* the shape of the codebase without reading every file.
|
|
6
|
+
*
|
|
7
|
+
* This is the one part of the plugin that is NOT convention-free or
|
|
8
|
+
* dependency-light, and that is a deliberate, opt-in trade:
|
|
9
|
+
*
|
|
10
|
+
* - It needs `web-tree-sitter` (~290 KB) plus a vendored `.wasm`
|
|
11
|
+
* grammar per supported language (~10.3 MB quoted for eleven grammars, though thirteen are listed below —
|
|
12
|
+
* C++ alone is 4.7 MB and TypeScript 2.3 MB). The rest of the
|
|
13
|
+
* plugin is a ~77 KB source drop with one tiny dependency; this
|
|
14
|
+
* feature is most of the install weight.
|
|
15
|
+
* - It is inherently language-aware: each grammar needs to know
|
|
16
|
+
* which node types are "definitions" (or selectors / keys /
|
|
17
|
+
* elements). That per-language table is the `LANG_SPECS` map
|
|
18
|
+
* below — contained, declarative, and the only place language
|
|
19
|
+
* knowledge lives.
|
|
20
|
+
*
|
|
21
|
+
* Because of that, code-map is gated behind `config.enableCodeMap`
|
|
22
|
+
* and defaults OFF. When disabled, none of this loads — `import()` of
|
|
23
|
+
* `web-tree-sitter` only happens inside `ingestCodeMap`. Languages we
|
|
24
|
+
* have no grammar for are simply skipped; the rest of the plugin is
|
|
25
|
+
* unaffected.
|
|
26
|
+
*
|
|
27
|
+
* Signatures are stored one `code-map` memory per file via
|
|
28
|
+
* `upsertBySubject`, so they're recallable, co-change-boosted, and
|
|
29
|
+
* token-budgeted like every other memory, and a re-scan replaces
|
|
30
|
+
* rather than accumulates.
|
|
31
|
+
*/
|
|
32
|
+
import { readdir, readFile, stat } from "node:fs/promises";
|
|
33
|
+
import { extname, join } from "node:path";
|
|
34
|
+
import { fileURLToPath } from "node:url";
|
|
35
|
+
/** Category label attached to every memory this module writes. */
const CATEGORY = "code-map";
/** Directory names the tree walk refuses to descend into. */
const SKIP_DIRS = new Set([
    ".git", "node_modules", ".venv", "venv", "__pycache__", ".tox", "dist",
    "build", "target", "vendor", ".next", "coverage", ".idea", ".vscode",
]);
|
|
53
|
+
/**
 * Per-language table: the grammar file stem to load, the node types
 * treated as "definitions", and (optionally) a named extractor that
 * replaces the default signature extraction.
 */
const LANG_SPECS = {
    javascript: {
        grammar: "tree-sitter-javascript",
        defNodes: new Set(["function_declaration", "generator_function_declaration", "class_declaration", "method_definition"]),
    },
    typescript: {
        grammar: "tree-sitter-typescript",
        defNodes: new Set([
            "function_declaration", "generator_function_declaration", "class_declaration",
            "abstract_class_declaration", "method_definition", "interface_declaration",
            "type_alias_declaration", "enum_declaration",
        ]),
    },
    python: {
        grammar: "tree-sitter-python",
        defNodes: new Set(["function_definition", "class_definition"]),
    },
    go: {
        grammar: "tree-sitter-go",
        defNodes: new Set(["function_declaration", "method_declaration", "type_declaration"]),
    },
    rust: {
        grammar: "tree-sitter-rust",
        defNodes: new Set([
            "function_item", "struct_item", "enum_item", "trait_item",
            "impl_item", "mod_item", "macro_definition", "type_item",
        ]),
    },
    java: {
        grammar: "tree-sitter-java",
        defNodes: new Set([
            "class_declaration", "interface_declaration", "enum_declaration", "record_declaration",
            "annotation_type_declaration", "method_declaration", "constructor_declaration",
        ]),
    },
    c: {
        grammar: "tree-sitter-c",
        defNodes: new Set(["function_definition", "struct_specifier", "union_specifier", "enum_specifier", "type_definition"]),
    },
    cpp: {
        grammar: "tree-sitter-cpp",
        defNodes: new Set([
            "function_definition", "class_specifier", "struct_specifier", "union_specifier",
            "enum_specifier", "namespace_definition", "template_declaration", "type_definition",
        ]),
    },
    // CSS: selector rules and at-rules count as definitions. Cutting the
    // text at the first `{` leaves the selector itself (`.nav > li`,
    // `@media (max-width: 600px)`), so CSS rides the default extractor.
    css: {
        grammar: "tree-sitter-css",
        defNodes: new Set(["rule_set", "media_statement", "keyframes_statement", "supports_statement"]),
    },
    // JSON: no definitions, so the "shape" is the set of top-level keys.
    // This overlaps with the project-facts ingester for recognised
    // manifests, but covers every .json file in the tree.
    json: {
        grammar: "tree-sitter-json",
        defNodes: new Set(["pair"]),
        extractor: "json-shape",
    },
    // HTML: no definitions either; the skeleton is the elements carrying
    // an `id` plus the major structural landmarks.
    html: {
        grammar: "tree-sitter-html",
        defNodes: new Set(["element"]),
        extractor: "html-skeleton",
    },
    // C#: types, namespace containers, constructors and methods.
    // Properties are left out on purpose: they dominate DTO classes while
    // adding little navigation value beyond the containing type.
    csharp: {
        grammar: "tree-sitter-c_sharp",
        defNodes: new Set([
            "class_declaration", "interface_declaration", "struct_declaration", "enum_declaration",
            "record_declaration", "delegate_declaration", "namespace_declaration",
            "file_scoped_namespace_declaration", "constructor_declaration", "method_declaration",
            "operator_declaration", "event_declaration",
        ]),
    },
    // PHP: functions, methods and every type-like construct. Namespaces
    // are included because they show where the file sits in the package tree.
    php: {
        grammar: "tree-sitter-php",
        defNodes: new Set([
            "function_definition", "method_declaration", "class_declaration", "interface_declaration",
            "trait_declaration", "enum_declaration", "namespace_definition",
        ]),
    },
};
|
|
197
|
+
/**
 * File extension (lower-case, with the dot) → `LANG_SPECS` key.
 *
 * C vs C++: a bare `.h` is ambiguous; it is mapped to C, while the
 * C++-specific header suffixes go to cpp. tree-sitter is error-tolerant,
 * so a C++ `.h` parsed as C still yields most of its signatures.
 */
const EXT_TO_LANG = Object.fromEntries([
    ["javascript", [".js", ".jsx", ".mjs", ".cjs"]],
    ["typescript", [".ts", ".tsx", ".mts", ".cts"]],
    ["python", [".py", ".pyi"]],
    ["go", [".go"]],
    ["rust", [".rs"]],
    ["java", [".java"]],
    ["c", [".c", ".h"]],
    ["cpp", [".cc", ".cpp", ".cxx", ".c++", ".hpp", ".hxx", ".hh", ".h++"]],
    ["css", [".css"]],
    ["json", [".json", ".jsonc"]],
    ["html", [".html", ".htm"]],
    ["csharp", [".cs"]],
    ["php", [".php", ".phtml"]],
].flatMap(([language, extensions]) => extensions.map((extension) => [extension, language])));
|
|
233
|
+
/** Files above this size (in bytes) are skipped: huge generated files are noise. */
const MAX_FILE_BYTES = 1024 * 400;
/** File-count ceiling for a full scan when the caller does not supply one. */
const DEFAULT_MAX_FILES = 4000;
/** Upper bound on signatures listed in a single file's memory, to keep it compact. */
const MAX_SIGS_PER_FILE = 40;
/** Promise for the shared engine; stays null until the engine is first requested. */
let enginePromise = null;
|
|
240
|
+
/**
 * Load `web-tree-sitter` and return the pieces the ingester needs.
 *
 * The import is dynamic so the dependency is only touched when code-map
 * actually runs.
 *
 * @param {string} [packageDir] forwarded to `resolveGrammarsDir`.
 * @returns {Promise<object>} either `{ ParserClass, getLanguage }`, or
 *   `{ unavailableReason }` when the module cannot be imported,
 *   initialised, or exposes no `Language.load`. `getLanguage(lang)`
 *   resolves to a loaded grammar, or null when the language has no spec
 *   or its `.wasm` fails to load; both outcomes are cached per language.
 */
async function buildEngine(packageDir) {
    let ParserClass;
    let LanguageClass;
    try {
        const treeSitter = await import("web-tree-sitter");
        // Different releases expose `Parser` / `Language` as named exports,
        // on the default export, or (older) `Language` hanging off `Parser`;
        // probe each shape so a version bump either way still works.
        ParserClass = treeSitter.Parser ?? treeSitter.default?.Parser ?? treeSitter.default ?? treeSitter;
        LanguageClass = treeSitter.Language ?? treeSitter.default?.Language ?? ParserClass?.Language;
        await ParserClass.init();
        const hasLoader = typeof LanguageClass?.load === "function";
        if (!hasLoader) {
            throw new Error("web-tree-sitter: no Language.load entry point");
        }
    }
    catch (err) {
        const detail = err instanceof Error ? err.message : String(err);
        return { unavailableReason: "web-tree-sitter unavailable: " + detail };
    }
    const grammarsDir = resolveGrammarsDir(packageDir);
    const loadedGrammars = new Map();
    const getLanguage = async (lang) => {
        if (loadedGrammars.has(lang)) {
            return loadedGrammars.get(lang) ?? null;
        }
        const spec = LANG_SPECS[lang];
        let grammar = null;
        if (spec) {
            try {
                grammar = await LanguageClass.load(join(grammarsDir, `${spec.grammar}.wasm`));
            }
            catch {
                // Grammar missing or incompatible: remember the miss and skip the language.
                grammar = null;
            }
        }
        loadedGrammars.set(lang, grammar);
        return grammar;
    };
    return { ParserClass, getLanguage };
}
|
|
290
|
+
/**
 * Return the shared tree-sitter engine, building it on first use.
 *
 * The in-flight promise is stored in `enginePromise`, so repeated calls
 * reuse one build. A result carrying `unavailableReason` clears that
 * slot again, so a later call starts a fresh build rather than reusing
 * the failure.
 */
async function getEngine(packageDir) {
    enginePromise = enginePromise || buildEngine(packageDir);
    const built = await enginePromise;
    const failed = "unavailableReason" in built;
    if (failed) {
        enginePromise = null;
    }
    return built;
}
|
|
300
|
+
/**
 * Parse one source file and upsert its `code-map` memory.
 *
 * Used by both the full walk and the single-file refresh, so the two
 * write the same subject and body shape and `upsertBySubject` can
 * replace a stale entry.
 *
 * Read and parse failures are swallowed (best-effort): the function
 * returns without touching the counters. Files that are not regular
 * files, exceed `MAX_FILE_BYTES`, or have no loadable grammar bump
 * `filesSkippedUnsupported`.
 *
 * The syntax tree is released once the items (plain strings) have been
 * extracted. web-tree-sitter trees live on the WASM heap and are not
 * reclaimed by the JS garbage collector, so without the explicit
 * `delete()` a full walk would leak one tree per file.
 *
 * @param repo        store exposing `upsertBySubject`.
 * @param root        scan root; stripped from `path` to form the subject.
 * @param path        path of the file to parse.
 * @param lang        key into `LANG_SPECS`.
 * @param parser      tree-sitter parser instance (reused across files).
 * @param getLanguage async loader returning a grammar or null.
 * @param result      counters object, mutated in place.
 */
async function parseAndStoreFile(repo, root, path, lang, parser, getLanguage, result) {
    let src;
    try {
        const info = await stat(path);
        if (!info.isFile() || info.size > MAX_FILE_BYTES) {
            result.filesSkippedUnsupported += 1;
            return;
        }
        src = await readFile(path, "utf-8");
    }
    catch {
        // Unreadable or vanished file: skip silently.
        return;
    }
    const L = await getLanguage(lang);
    if (!L) {
        result.filesSkippedUnsupported += 1;
        return;
    }
    const spec = LANG_SPECS[lang];
    let items;
    let tree = null;
    try {
        parser.setLanguage(L);
        tree = parser.parse(src);
        const extractor = spec.extractor ?? "signatures";
        if (extractor === "json-shape") {
            items = extractJsonShape(tree.rootNode, src);
        }
        else if (extractor === "html-skeleton") {
            items = extractHtmlSkeleton(tree.rootNode, src);
        }
        else {
            items = extractSignatures(tree.rootNode, src, spec.defNodes);
        }
    }
    catch {
        return;
    }
    finally {
        // Free the WASM-side tree; guarded so cleanup itself can never throw.
        try {
            tree?.delete?.();
        }
        catch {
            // Nothing useful to do if releasing the tree fails.
        }
    }
    if (!result.languagesSeen.includes(lang)) {
        result.languagesSeen.push(lang);
    }
    result.filesParsed += 1;
    result.signaturesExtracted += items.length;
    // The noun in the summary depends on what was extracted, so the
    // reader of the memory interprets the list correctly.
    const noun = lang === "css"
        ? "selector"
        : lang === "json"
            ? "top-level key"
            : lang === "html"
                ? "landmark element"
                : "definition";
    const rel = path.startsWith(root) ? path.slice(root.length).replace(/^\/+/, "") : path;
    const shown = items.slice(0, MAX_SIGS_PER_FILE);
    const body = shown.length === 0
        ? `${rel} (${lang}): no ${noun}s found.`
        : `${rel} (${lang}) — ${items.length} ${noun}${items.length === 1 ? "" : "s"}: ` +
            shown.join(" · ") +
            (items.length > shown.length ? ` … (+${items.length - shown.length} more)` : "");
    repo.upsertBySubject({
        category: CATEGORY,
        subject: rel,
        content: body,
        tags: ["code-map", lang, rel],
        source: "tree-sitter:code-map",
    });
}
|
|
375
|
+
/**
 * Re-index the code-map for a SINGLE file, replacing that file's
 * signature memory (via `upsertBySubject`) with one parsed from the
 * file as it is now. Reuses the cached engine, so only the one-file
 * parse is repeated.
 *
 * The parser created for the refresh is released before returning.
 * web-tree-sitter parsers hold WASM-side memory that the JS garbage
 * collector does not reclaim, so without the explicit `delete()` every
 * refresh would leak a parser.
 *
 * @param repo       store exposing `upsertBySubject`.
 * @param root       scan root used to derive the memory subject.
 * @param absPath    path of the file to re-index.
 * @param packageDir optional package directory, forwarded to the engine.
 * @returns {Promise<string>} "updated" when the file was parsed and stored;
 *   "unsupported" when the extension has no language mapping or nothing
 *   was parsed; "unavailable" when tree-sitter could not load; "error"
 *   when obtaining the engine or parsing threw.
 */
export async function ingestCodeMapForFile(repo, root, absPath, packageDir) {
    const lang = EXT_TO_LANG[extname(absPath).toLowerCase()];
    if (!lang) {
        return "unsupported";
    }
    let engine;
    try {
        engine = await getEngine(packageDir);
    }
    catch {
        return "error";
    }
    if ("unavailableReason" in engine) {
        return "unavailable";
    }
    const result = {
        filesParsed: 0,
        filesSkippedUnsupported: 0,
        signaturesExtracted: 0,
        languagesSeen: [],
    };
    let parser = null;
    try {
        parser = new engine.ParserClass();
        await parseAndStoreFile(repo, root, absPath, lang, parser, engine.getLanguage, result);
    }
    catch {
        return "error";
    }
    finally {
        // Release the parser's WASM memory; guarded so cleanup cannot throw.
        try {
            parser?.delete?.();
        }
        catch {
            // Nothing useful to do if releasing the parser fails.
        }
    }
    return result.filesParsed > 0 ? "updated" : "unsupported";
}
|
|
414
|
+
/**
 * Walk `root` and store one `code-map` memory per supported source file.
 *
 * Directories listed in `SKIP_DIRS`, and any directory whose name starts
 * with a dot, are not entered. Only files whose extension appears in
 * `EXT_TO_LANG` count towards `maxFiles`. Unreadable directories are
 * skipped silently.
 *
 * The single parser shared by the walk is released when the walk ends,
 * whether it finished or threw. web-tree-sitter parsers hold WASM-side
 * memory that the JS garbage collector does not reclaim.
 *
 * @param repo       store exposing `upsertBySubject` and `setIngestedAt`.
 * @param root       directory to scan.
 * @param packageDir optional package directory, forwarded to the engine.
 * @param maxFiles   cap on supported files visited (default `DEFAULT_MAX_FILES`).
 * @returns {Promise<object>} counters (`filesParsed`, `filesSkippedUnsupported`,
 *   `signaturesExtracted`, sorted `languagesSeen`), plus `unavailableReason`
 *   when tree-sitter could not be loaded (in which case nothing is scanned).
 */
export async function ingestCodeMap(repo, root, packageDir, maxFiles = DEFAULT_MAX_FILES) {
    const result = {
        filesParsed: 0,
        filesSkippedUnsupported: 0,
        signaturesExtracted: 0,
        languagesSeen: [],
    };
    const engine = await getEngine(packageDir);
    if ("unavailableReason" in engine) {
        result.unavailableReason = engine.unavailableReason;
        return result;
    }
    const parser = new engine.ParserClass();
    let filesVisited = 0;
    async function walk(dir) {
        if (filesVisited >= maxFiles) {
            return;
        }
        let entries;
        try {
            entries = await readdir(dir, { withFileTypes: true });
        }
        catch {
            // Unreadable directory: skip it, keep walking the rest.
            return;
        }
        for (const entry of entries) {
            if (filesVisited >= maxFiles) {
                return;
            }
            if (entry.isDirectory()) {
                if (SKIP_DIRS.has(entry.name) || entry.name.startsWith(".")) {
                    continue;
                }
                await walk(join(dir, entry.name));
            }
            else if (entry.isFile()) {
                const lang = EXT_TO_LANG[extname(entry.name).toLowerCase()];
                if (!lang) {
                    continue;
                }
                filesVisited += 1;
                await parseAndStoreFile(repo, root, join(dir, entry.name), lang, parser, engine.getLanguage, result);
            }
        }
    }
    try {
        await walk(root);
    }
    finally {
        // Release the parser's WASM memory; guarded so cleanup cannot mask
        // an error raised by the walk itself.
        try {
            parser.delete?.();
        }
        catch {
            // Nothing useful to do if releasing the parser fails.
        }
    }
    result.languagesSeen.sort();
    repo.setIngestedAt(CATEGORY, Date.now());
    return result;
}
|
|
461
|
+
/* ─── signature extraction ──────────────────────────────────────────── */
|
|
462
|
+
/**
 * Collect one signature line for each definition node in the tree, in
 * depth-first pre-order, dropping duplicates.
 *
 * The signature text comes from `signatureOf`: the node's source up to
 * its body. The result depends only on (tree, source), so it can be
 * unit-tested without a repo.
 *
 * @param rootNode tree-sitter root node (needs `type`, `childCount`, `child(i)`).
 * @param src      source text the tree was parsed from.
 * @param defNodes Set of node type names that count as definitions.
 * @returns {string[]} unique signatures in first-seen order.
 */
export function extractSignatures(rootNode, src, defNodes) {
    const signatures = new Set();
    // Explicit stack; children are pushed right-to-left so they are
    // popped (and therefore visited) left-to-right.
    const pending = [rootNode];
    while (pending.length > 0) {
        const current = pending.pop();
        if (defNodes.has(current.type)) {
            const line = signatureOf(current, src);
            if (line) {
                signatures.add(line);
            }
        }
        for (let idx = current.childCount - 1; idx >= 0; idx -= 1) {
            pending.push(current.child(idx));
        }
    }
    return [...signatures];
}
|
|
492
|
+
// Matches a line that is nothing but a C# attribute (`[Obsolete]`), a Rust
// attribute (`#[derive(...)]`), or a Java/Python annotation/decorator
// (`@Override`, `@staticmethod`, `@app.route("/x")`). The whole-line
// anchors (`^…$`) keep an inline form such as `[Foo] public void Bar()` or
// `@Foo(x) public void bar()` from being treated as metadata and skipped.
// The argument list accepts one level of nested parentheses, so
// `@app.route(url_for("x"))` is recognised as a decorator line too.
const ATTRIBUTE_LINE_RE = /^(#?\[.*\]|@[\w.]+(\s*\((?:[^()]|\([^()]*\))*\))?)$/;
/**
 * Produce the one-line signature for a definition node.
 *
 * Leading blank lines and attribute/decorator-only lines are dropped
 * (the last line of the node is never dropped). The remainder is cut at
 * the first `{` or the first newline, whichever comes first, trimmed,
 * whitespace-collapsed, and capped at 137 characters plus an ellipsis
 * when longer than 140.
 *
 * @param node object with `startIndex` / `endIndex` offsets into `src`.
 * @param src  source text.
 * @returns {string|null} the signature, or null when nothing is left.
 */
function signatureOf(node, src) {
    const full = src.slice(node.startIndex, node.endIndex);
    const lines = full.split("\n");
    let start = 0;
    while (start < lines.length - 1) {
        const candidate = lines[start].trim();
        if (candidate.length !== 0 && !ATTRIBUTE_LINE_RE.test(candidate)) {
            break;
        }
        start += 1;
    }
    const rest = lines.slice(start).join("\n");
    // Cut at the body: the earliest of `{`, newline, or end of text.
    const cutPoints = [rest.indexOf("{"), rest.indexOf("\n")].filter((pos) => pos >= 0);
    const cut = Math.min(rest.length, ...cutPoints);
    // A trailing `:` (Python defs) is kept; inner whitespace is collapsed.
    let sig = rest.slice(0, cut).trim().replace(/\s+/g, " ");
    if (sig.length > 140) {
        sig = sig.slice(0, 137) + "…";
    }
    return sig.length > 0 ? sig : null;
}
|
|
530
|
+
/* ─── structural extractors for non-code formats ────────────────────── */
|
|
531
|
+
/**
 * JSON "shape": the TOP-LEVEL keys of the document, or a one-element
 * marker when the root value is an array or anything else that is not
 * an object. Only one object level is inspected, so nested keys never
 * appear. Depends only on (tree, source), so it can be unit-tested
 * without a repo.
 *
 * @param rootNode tree-sitter root node (may be null/undefined).
 * @param src      source text the tree was parsed from.
 * @returns {string[]} keys in document order, a single marker string, or [].
 */
export function extractJsonShape(rootNode, src) {
    // Lazily yields a node's direct children, in order.
    function* childrenOf(node) {
        for (let idx = 0; idx < node.childCount; idx += 1) {
            yield node.child(idx);
        }
    }
    // tree-sitter-json wraps the value in a `document` node; unwrap it to
    // the first child that is not a comment.
    let value = rootNode;
    if (value && value.type === "document" && value.childCount > 0) {
        for (const candidate of childrenOf(value)) {
            if (candidate && candidate.type !== "comment") {
                value = candidate;
                break;
            }
        }
    }
    if (!value) {
        return [];
    }
    if (value.type === "array") {
        return ["[root is a JSON array]"];
    }
    if (value.type !== "object") {
        return [`[root is a JSON ${value.type}]`];
    }
    const topLevelKeys = [];
    for (const member of childrenOf(value)) {
        if (!member || member.type !== "pair") {
            continue;
        }
        // Within `key : value`, the key is the first `string` child.
        for (const part of childrenOf(member)) {
            if (part && part.type === "string") {
                const keyText = src.slice(part.startIndex, part.endIndex).replace(/^["']|["']$/g, "");
                if (keyText) {
                    topLevelKeys.push(keyText);
                }
                break;
            }
        }
    }
    return topLevelKeys;
}
|
|
577
|
+
/** Tag names reported even when the element has no `id`. */
const HTML_LANDMARK_TAGS = new Set([
    "header", "footer", "main", "nav", "aside", "section", "article",
    "form", "table", "dialog", "template",
]);
/**
 * HTML "skeleton": every element with a non-empty `id` (rendered
 * `tag#id`) plus the structural landmark tags in `HTML_LANDMARK_TAGS`
 * (rendered `<tag>`), in document order with duplicates removed.
 * Depends only on (tree, source).
 *
 * @param rootNode tree-sitter root node (needs `type`, `childCount`, `child(i)`).
 * @param src      source text the tree was parsed from.
 * @returns {string[]} unique skeleton entries in first-seen order.
 */
export function extractHtmlSkeleton(rootNode, src) {
    const entries = new Set();
    const textOf = (node) => src.slice(node.startIndex, node.endIndex);
    // Lazily yields a node's direct children, in order.
    function* childrenOf(node) {
        for (let idx = 0; idx < node.childCount; idx += 1) {
            yield node.child(idx);
        }
    }
    // attribute = attribute_name [= (quoted_)attribute_value]
    const readAttribute = (attribute) => {
        let name = "";
        let value = "";
        for (const part of childrenOf(attribute)) {
            if (!part) {
                continue;
            }
            if (part.type === "attribute_name") {
                name = textOf(part);
            }
            else if (part.type === "quoted_attribute_value" || part.type === "attribute_value") {
                value = textOf(part).replace(/^["']|["']$/g, "");
            }
        }
        return { name, value };
    };
    // Turn a start tag into `tag#id`, `<tag>` (landmarks only), or null.
    const describeTag = (tag) => {
        let tagName = "";
        let idValue = null;
        for (const part of childrenOf(tag)) {
            if (!part) {
                continue;
            }
            if (part.type === "tag_name") {
                tagName = textOf(part);
            }
            else if (part.type === "attribute") {
                const { name, value } = readAttribute(part);
                if (name.toLowerCase() === "id" && value) {
                    idValue = value;
                }
            }
        }
        if (!tagName) {
            return null;
        }
        if (idValue) {
            return `${tagName}#${idValue}`;
        }
        return HTML_LANDMARK_TAGS.has(tagName.toLowerCase()) ? `<${tagName}>` : null;
    };
    const isElementLike = (node) => node.type === "element" || node.type === "script_element" || node.type === "style_element";
    const visit = (node) => {
        if (isElementLike(node)) {
            // The start (or self-closing) tag holds the tag name and attributes.
            let startTag = null;
            for (const kid of childrenOf(node)) {
                if (kid && (kid.type === "start_tag" || kid.type === "self_closing_tag")) {
                    startTag = kid;
                    break;
                }
            }
            const entry = startTag ? describeTag(startTag) : null;
            if (entry) {
                entries.add(entry);
            }
        }
        for (const kid of childrenOf(node)) {
            visit(kid);
        }
    };
    visit(rootNode);
    return [...entries];
}
|
|
654
|
+
/* ─── grammar path resolution ───────────────────────────────────────── */
|
|
655
|
+
/**
 * Work out where the vendored `grammars/` directory lives.
 *
 * An explicit `packageDir` wins. Otherwise the location is derived from
 * this module's own path by stepping three segments up from the file
 * (`dist/ingest/code-map.js` → package root) and appending `grammars`.
 * If that derivation throws, the current working directory is used.
 *
 * @param {string} [packageDir] package root, when the caller knows it.
 * @returns {string} path of the grammars directory.
 */
function resolveGrammarsDir(packageDir) {
    if (!packageDir) {
        try {
            const modulePath = fileURLToPath(import.meta.url); // .../dist/ingest/code-map.js
            return join(modulePath, "..", "..", "..", "grammars");
        }
        catch {
            return join(process.cwd(), "grammars");
        }
    }
    return join(packageDir, "grammars");
}
|