opencode-diane 0.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +180 -0
- package/LICENSE +21 -0
- package/README.md +206 -0
- package/WIKI.md +1430 -0
- package/dist/index.d.ts +28 -0
- package/dist/index.js +1632 -0
- package/dist/ingest/adaptive.d.ts +47 -0
- package/dist/ingest/adaptive.js +182 -0
- package/dist/ingest/code-health.d.ts +58 -0
- package/dist/ingest/code-health.js +202 -0
- package/dist/ingest/code-map.d.ts +71 -0
- package/dist/ingest/code-map.js +670 -0
- package/dist/ingest/cross-refs.d.ts +59 -0
- package/dist/ingest/cross-refs.js +1207 -0
- package/dist/ingest/docs.d.ts +49 -0
- package/dist/ingest/docs.js +325 -0
- package/dist/ingest/git.d.ts +77 -0
- package/dist/ingest/git.js +390 -0
- package/dist/ingest/live-session.d.ts +101 -0
- package/dist/ingest/live-session.js +173 -0
- package/dist/ingest/project-notes.d.ts +28 -0
- package/dist/ingest/project-notes.js +102 -0
- package/dist/ingest/project.d.ts +35 -0
- package/dist/ingest/project.js +430 -0
- package/dist/ingest/session-snapshot.d.ts +63 -0
- package/dist/ingest/session-snapshot.js +94 -0
- package/dist/ingest/sessions.d.ts +29 -0
- package/dist/ingest/sessions.js +164 -0
- package/dist/ingest/tables.d.ts +52 -0
- package/dist/ingest/tables.js +360 -0
- package/dist/mining/skill-miner.d.ts +53 -0
- package/dist/mining/skill-miner.js +234 -0
- package/dist/search/bm25.d.ts +81 -0
- package/dist/search/bm25.js +334 -0
- package/dist/search/e5-embedder.d.ts +30 -0
- package/dist/search/e5-embedder.js +91 -0
- package/dist/search/embed-pass.d.ts +26 -0
- package/dist/search/embed-pass.js +43 -0
- package/dist/search/embedder.d.ts +58 -0
- package/dist/search/embedder.js +85 -0
- package/dist/search/inverted-index.d.ts +51 -0
- package/dist/search/inverted-index.js +139 -0
- package/dist/search/ppr.d.ts +44 -0
- package/dist/search/ppr.js +118 -0
- package/dist/search/tokenize.d.ts +26 -0
- package/dist/search/tokenize.js +98 -0
- package/dist/store/eviction.d.ts +16 -0
- package/dist/store/eviction.js +37 -0
- package/dist/store/repository.d.ts +222 -0
- package/dist/store/repository.js +420 -0
- package/dist/store/sqlite-store.d.ts +89 -0
- package/dist/store/sqlite-store.js +252 -0
- package/dist/store/vector-store.d.ts +66 -0
- package/dist/store/vector-store.js +160 -0
- package/dist/types.d.ts +385 -0
- package/dist/types.js +9 -0
- package/dist/utils/file-log.d.ts +87 -0
- package/dist/utils/file-log.js +215 -0
- package/dist/utils/peer-detection.d.ts +45 -0
- package/dist/utils/peer-detection.js +90 -0
- package/dist/utils/shell.d.ts +43 -0
- package/dist/utils/shell.js +110 -0
- package/dist/utils/usage-skill.d.ts +42 -0
- package/dist/utils/usage-skill.js +129 -0
- package/dist/utils/xlsx.d.ts +36 -0
- package/dist/utils/xlsx.js +270 -0
- package/grammars/tree-sitter-c.wasm +0 -0
- package/grammars/tree-sitter-c_sharp.wasm +0 -0
- package/grammars/tree-sitter-cpp.wasm +0 -0
- package/grammars/tree-sitter-css.wasm +0 -0
- package/grammars/tree-sitter-go.wasm +0 -0
- package/grammars/tree-sitter-html.wasm +0 -0
- package/grammars/tree-sitter-java.wasm +0 -0
- package/grammars/tree-sitter-javascript.wasm +0 -0
- package/grammars/tree-sitter-json.wasm +0 -0
- package/grammars/tree-sitter-php.wasm +0 -0
- package/grammars/tree-sitter-python.wasm +0 -0
- package/grammars/tree-sitter-rust.wasm +0 -0
- package/grammars/tree-sitter-typescript.wasm +0 -0
- package/package.json +80 -0
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* docs.ts — ingest project documentation as recallable section
|
|
3
|
+
* pointers.
|
|
4
|
+
*
|
|
5
|
+
* The premise: long-form docs are something the agent can already
|
|
6
|
+
* `read`; what it can't do is *find* the right section to read in a
|
|
7
|
+
* 30-file `docs/` tree without grepping the whole thing first. This
|
|
8
|
+
* ingester emits one memory per heading (H1/H2/H3) with the heading
|
|
9
|
+
* text, the first paragraph of body, and `path:line` so a
|
|
10
|
+
* `memory_recall { query: "installation" }` returns
|
|
11
|
+
*
|
|
12
|
+
* docs/install.md:15 ## Installation
|
|
13
|
+
* This project uses bun. Run `bun install` …
|
|
14
|
+
*
|
|
15
|
+
* — a direct pointer the agent can act on without any directory walk.
|
|
16
|
+
*
|
|
17
|
+
* **Scope** (deliberately conservative — see the design notes at the
|
|
18
|
+
* bottom for what was considered and rejected):
|
|
19
|
+
*
|
|
20
|
+
* - Walks `<root>/docs/` recursively for `*.md` and `*.markdown`.
|
|
21
|
+
* - Adds a fixed set of conventional root-level docs files (the
|
|
22
|
+
* `ROOT_DOCS` list — CHANGELOG, CONTRIBUTING, ARCHITECTURE, …).
|
|
23
|
+
* - Skips README.md — that's handled by the project ingester for
|
|
24
|
+
* the headline paragraph, and full-file ingestion of READMEs
|
|
25
|
+
* would create duplicate entries for the same content.
|
|
26
|
+
* - Skips dotfiles, `node_modules`, `.git`, and the standard
|
|
27
|
+
* SKIP_DIRS used by every other ingester.
|
|
28
|
+
* - Caps headings per file to prevent a runaway TOC, and caps
|
|
29
|
+
* files walked to prevent a runaway monorepo. Both caps are
|
|
30
|
+
* intentionally generous; the goal is "doesn't blow up on a 500-
|
|
31
|
+
* file vendored docs tree" not "hits a tight budget."
|
|
32
|
+
*
|
|
33
|
+
* **Granularity choice — one memory per heading, not per file.**
|
|
34
|
+
* Per-file would mean recall returns "docs/api.md mentions install"
|
|
35
|
+
* — true but useless, the agent still has to read the file. Per-
|
|
36
|
+
* heading returns the SECTION, which is the actionable unit. Costs
|
|
37
|
+
* a small memory-table inflation that's bounded by the cap.
|
|
38
|
+
*/
|
|
39
|
+
import type { MemoryRepository } from "../store/repository.js";
|
|
40
|
+
/** Optional limits for `ingestDocs`. Each value is rounded and clamped by the implementation. */
export interface DocsIngestOptions {
|
|
41
|
+
/** Cap on markdown files ingested in one pass; rounded, minimum 1, default 200. */
maxFiles?: number;
|
|
42
|
+
/** Approximate number of body characters kept after each heading; rounded, minimum 40, default 240. */
bodyChars?: number;
|
|
43
|
+
/** Deepest heading level indexed (1 = H1 only); rounded and clamped to 1..6, default 3. */
maxHeadingLevel?: number;
|
|
44
|
+
}
|
|
45
|
+
/** Counters returned by `ingestDocs`. */
export interface DocsIngestResult {
|
|
46
|
+
/** Markdown files counted during the `docs/` walk plus the root-level docs that exist. */
filesWalked: number;
|
|
47
|
+
/** Heading memories submitted to the repository; counted per submission, not per newly created row. */
headingsIndexed: number;
|
|
48
|
+
}
|
|
49
|
+
/**
 * Walk `<root>/docs/` and a fixed list of root-level markdown files, and submit
 * one memory per heading to `repo`.
 *
 * @param repo Memory store that receives the heading memories.
 * @param root Project root directory.
 * @param opts Optional limits; see `DocsIngestOptions`.
 * @returns Counts of files walked and heading memories submitted.
 */
export declare function ingestDocs(repo: MemoryRepository, root: string, opts?: DocsIngestOptions): Promise<DocsIngestResult>;
|
|
@@ -0,0 +1,325 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* docs.ts — ingest project documentation as recallable section
|
|
3
|
+
* pointers.
|
|
4
|
+
*
|
|
5
|
+
* The premise: long-form docs are something the agent can already
|
|
6
|
+
* `read`; what it can't do is *find* the right section to read in a
|
|
7
|
+
* 30-file `docs/` tree without grepping the whole thing first. This
|
|
8
|
+
* ingester emits one memory per heading (H1–H3 by default) with the heading
|
|
9
|
+
* text, the first paragraph of body, and `path:line` so a
|
|
10
|
+
* `memory_recall { query: "installation" }` returns
|
|
11
|
+
*
|
|
12
|
+
* docs/install.md:15 ## Installation
|
|
13
|
+
* This project uses bun. Run `bun install` …
|
|
14
|
+
*
|
|
15
|
+
* — a direct pointer the agent can act on without any directory walk.
|
|
16
|
+
*
|
|
17
|
+
* **Scope** (deliberately conservative — see the design notes at the
|
|
18
|
+
* bottom for what was considered and rejected):
|
|
19
|
+
*
|
|
20
|
+
* - Walks `<root>/docs/` recursively for `*.md` and `*.markdown`.
|
|
21
|
+
* - Adds a fixed set of conventional root-level docs files (the
|
|
22
|
+
* `ROOT_DOCS` list — CHANGELOG, CONTRIBUTING, ARCHITECTURE, …).
|
|
23
|
+
* - Skips README.md — that's handled by the project ingester for
|
|
24
|
+
* the headline paragraph, and full-file ingestion of READMEs
|
|
25
|
+
* would create duplicate entries for the same content.
|
|
26
|
+
* - Skips dotfiles, `node_modules`, `.git`, and the standard
|
|
27
|
+
* SKIP_DIRS used by every other ingester.
|
|
28
|
+
* - Caps headings per file to prevent a runaway TOC, and caps
|
|
29
|
+
* files walked to prevent a runaway monorepo. Both caps are
|
|
30
|
+
* intentionally generous; the goal is "doesn't blow up on a 500-
|
|
31
|
+
* file vendored docs tree" not "hits a tight budget."
|
|
32
|
+
*
|
|
33
|
+
* **Granularity choice — one memory per heading, not per file.**
|
|
34
|
+
* Per-file would mean recall returns "docs/api.md mentions install"
|
|
35
|
+
* — true but useless, the agent still has to read the file. Per-
|
|
36
|
+
* heading returns the SECTION, which is the actionable unit. Costs
|
|
37
|
+
* a small memory-table inflation that's bounded by the cap.
|
|
38
|
+
*/
|
|
39
|
+
import { readdir, readFile, stat } from "node:fs/promises";
|
|
40
|
+
import { join, relative, sep } from "node:path";
|
|
41
|
+
/** Categories existing memories already use; reusing `project-facts`
|
|
42
|
+
* keeps the docs entries discoverable by the same `category` filter
|
|
43
|
+
* the agent reaches for when it wants "facts about this repo." A new
|
|
44
|
+
* category would just split a single mental model into two. */
|
|
45
|
+
const CATEGORY = "project-facts";
|
|
46
|
+
/** Directories the project ingester also skips. Mirrored here so
|
|
47
|
+
* walking a repo's `docs/` doesn't accidentally descend into a
|
|
48
|
+
* vendored copy under `docs/node_modules` etc. */
|
|
49
|
+
const SKIP_DIRS = new Set([
|
|
50
|
+
"node_modules",
|
|
51
|
+
".git",
|
|
52
|
+
"dist",
|
|
53
|
+
"build",
|
|
54
|
+
"out",
|
|
55
|
+
"target",
|
|
56
|
+
".next",
|
|
57
|
+
"coverage",
|
|
58
|
+
".cache",
|
|
59
|
+
"vendor",
|
|
60
|
+
]);
|
|
61
|
+
/** Conventional root-level markdown files that aren't the README and
|
|
62
|
+
* aren't agent-instruction files (those go to project-notes.ts).
|
|
63
|
+
* Each one is checked at the root only — no recursion. */
|
|
64
|
+
const ROOT_DOCS = [
|
|
65
|
+
"CHANGELOG.md",
|
|
66
|
+
"CONTRIBUTING.md",
|
|
67
|
+
"ARCHITECTURE.md",
|
|
68
|
+
"ROADMAP.md",
|
|
69
|
+
"TODO.md",
|
|
70
|
+
"HISTORY.md",
|
|
71
|
+
"NOTES.md",
|
|
72
|
+
"SECURITY.md",
|
|
73
|
+
"GOVERNANCE.md",
|
|
74
|
+
"MAINTAINERS.md",
|
|
75
|
+
"AUTHORS.md",
|
|
76
|
+
"CODE_OF_CONDUCT.md",
|
|
77
|
+
];
|
|
78
|
+
/** Headings deeper than this are typically internal section markers
|
|
79
|
+
* inside a long doc — useful inside the file, but rarely worth a
|
|
80
|
+
* top-level recall pointer. H1/H2/H3 only. */
|
|
81
|
+
const MAX_HEADING_LEVEL = 3;
|
|
82
|
+
/** Per-file cap — most docs have under 30 H1-H3 headings; a file
|
|
83
|
+
* with more is almost certainly auto-generated. */
|
|
84
|
+
const MAX_HEADINGS_PER_FILE = 50;
|
|
85
|
+
/** Hard cap on files walked in one ingest pass. Bounded walk so a
|
|
86
|
+
* 500-file vendored docs tree can't stall startup. */
|
|
87
|
+
const MAX_FILES = 200;
|
|
88
|
+
/** Skip files larger than this — typically auto-generated catalogs
|
|
89
|
+
* with no manually-authored structure worth recalling. */
|
|
90
|
+
const MAX_FILE_BYTES = 256 * 1024;
|
|
91
|
+
/** Characters of body following a heading to capture as context — enough
|
|
92
|
+
* for one decent paragraph, not so much that 50 headings turn into
|
|
93
|
+
* a wall of duplicated prose. */
|
|
94
|
+
const BODY_CHARS = 240;
|
|
95
|
+
/**
 * Index project documentation under `root` as one memory per Markdown heading.
 *
 * Two sources are scanned, in order:
 *   1. `<root>/docs/`, walked iteratively with an explicit stack. Dot-entries
 *      and directories named in `SKIP_DIRS` are not entered.
 *   2. The fixed `ROOT_DOCS` file names, looked up directly in `root`.
 *
 * @param repo Memory store; only `insertIfMissing` is called on it.
 * @param root Project root directory.
 * @param opts Optional limits (`maxFiles`, `bodyChars`, `maxHeadingLevel`).
 *             Values are rounded and clamped; a NaN value falls back to the
 *             built-in default.
 * @returns `filesWalked` — files handed to the per-file ingester (a file that
 *          is later skipped as too large or unreadable still counts);
 *          `headingsIndexed` — heading memories submitted to `repo`.
 */
export async function ingestDocs(repo, root, opts = {}) {
    // Round the option; NaN (e.g. from a bad config parse) would otherwise
    // poison every `<` / `>=` comparison against the cap below.
    const roundedOr = (value, fallback) => {
        const rounded = Math.round(value ?? fallback);
        return Number.isNaN(rounded) ? fallback : rounded;
    };
    const maxFilesLimit = Math.max(1, roundedOr(opts.maxFiles, MAX_FILES));
    const bodyCharsLimit = Math.max(40, roundedOr(opts.bodyChars, BODY_CHARS));
    const maxLevel = Math.min(6, Math.max(1, roundedOr(opts.maxHeadingLevel, MAX_HEADING_LEVEL)));
    let filesWalked = 0;
    let headingsIndexed = 0;
    const add = (subject, content, tags) => {
        repo.insertIfMissing({
            category: CATEGORY,
            subject,
            content,
            tags,
            source: "docs-ingest",
        });
        // Counts submissions; whether a row was actually created is up to the repository.
        headingsIndexed += 1;
    };
    const seen = new Set();
    // ── 1. <root>/docs/ recursive walk ────────────────────────────────
    const docsDir = join(root, "docs");
    if (await isDirectory(docsDir)) {
        const stack = [docsDir];
        while (stack.length > 0 && filesWalked < maxFilesLimit) {
            const dir = stack.pop();
            let entries;
            try {
                entries = await readdir(dir, { withFileTypes: true });
            }
            catch {
                // Unreadable directory: best-effort ingest, move on.
                continue;
            }
            for (const e of entries) {
                if (e.name.startsWith("."))
                    continue;
                const abs = join(dir, e.name);
                if (e.isDirectory()) {
                    if (!SKIP_DIRS.has(e.name))
                        stack.push(abs);
                    continue;
                }
                if (!e.isFile() || !isMarkdown(e.name) || seen.has(abs))
                    continue;
                // Check the cap BEFORE counting, so `filesWalked` never
                // reports a file that was not handed to the ingester.
                if (filesWalked >= maxFilesLimit)
                    break;
                seen.add(abs);
                filesWalked += 1;
                await ingestOneFile(abs, root, add, MAX_HEADINGS_PER_FILE, bodyCharsLimit, maxLevel);
            }
        }
    }
    // ── 2. Root-level conventional docs ───────────────────────────────
    for (const name of ROOT_DOCS) {
        if (filesWalked >= maxFilesLimit)
            break;
        const abs = join(root, name);
        if (seen.has(abs))
            continue;
        if (!(await isFile(abs)))
            continue;
        seen.add(abs);
        filesWalked += 1;
        await ingestOneFile(abs, root, add, MAX_HEADINGS_PER_FILE, bodyCharsLimit, maxLevel);
    }
    return { filesWalked, headingsIndexed };
}
|
|
163
|
+
/**
 * Read one markdown file and emit one memory per heading, up to `maxHeadings`.
 *
 * Files larger than `MAX_FILE_BYTES`, unreadable files and files without any
 * heading at or above `maxLevel` are skipped silently (best-effort ingest).
 *
 * @param abs         Absolute path of the file.
 * @param root        Project root; used to build the repo-relative path.
 * @param add         Callback `(subject, content, tags)` that stores one memory.
 * @param maxHeadings Maximum number of memories emitted for this file.
 * @param bodyChars   Approximate number of body characters kept per heading.
 * @param maxLevel    Deepest heading level to index.
 */
async function ingestOneFile(abs, root, add, maxHeadings, bodyChars, maxLevel) {
    let raw;
    try {
        const s = await stat(abs);
        if (s.size > MAX_FILE_BYTES)
            return;
        raw = await readFile(abs, "utf-8");
    }
    catch {
        return;
    }
    // A UTF-8 BOM survives `readFile(..., "utf-8")` and would stop a heading
    // on the very first line from matching `^#`.
    if (raw.charCodeAt(0) === 0xfeff)
        raw = raw.slice(1);
    // Forward slashes regardless of platform, so subjects are portable.
    const rel = relative(root, abs).split(sep).join("/");
    const lines = raw.split("\n");
    const headings = extractHeadings(lines, maxLevel);
    if (headings.length === 0)
        return;
    // Relative path without its extension, collapsed to a dash-separated tag
    // (e.g. `docs/install.md` → `docs-install`).
    const fileTag = rel.replace(/\.md$|\.markdown$/i, "").replace(/[^a-zA-Z0-9]+/g, "-").toLowerCase();
    // Slugs already used in this file. Repeated headings (e.g. `### Fixed` in
    // every changelog release) would otherwise share one subject, and all but
    // the first would collide on it.
    // NOTE(review): assumes `insertIfMissing` dedupes on subject — confirm in repository.js.
    const usedSlugs = new Set();
    let emitted = 0;
    for (const h of headings) {
        if (emitted >= maxHeadings)
            break;
        const body = readFollowingBody(lines, h.lineIdx, headings, bodyChars);
        const baseSlug = toSlug(h.text) || `line-${h.line}`;
        // First occurrence keeps the plain slug; later ones get -2, -3, …
        let slug = baseSlug;
        for (let n = 2; usedSlugs.has(slug); n++)
            slug = `${baseSlug}-${n}`;
        usedSlugs.add(slug);
        const subject = `docs:${rel}#${slug}`;
        const headingTags = h.text
            .toLowerCase()
            .replace(/[^a-z0-9 ]/g, " ")
            .split(/\s+/)
            .filter((w) => w.length > 2)
            .slice(0, 5);
        add(subject, 
        // `path:line` so the agent has a precise pointer it can pass
        // to OpenCode's `read` tool, plus the heading and first paragraph
        // so a recall snippet alone often answers the question.
        `${rel}:${h.line} ${"#".repeat(h.level)} ${h.text}` +
            (body ? `\n${body}` : ""), ["docs", "section", fileTag, ...headingTags]);
        emitted += 1;
    }
}
|
|
204
|
+
/**
 * Parse ATX headings (`# `, `## `, …) up to `maxHeadingLevel`.
 *
 * Lines inside fenced code blocks are ignored. Both backtick and tilde fences
 * are tracked; a fence is closed only by a line using the same character, at
 * least as many of them, and nothing else on the line. An optional closing run
 * of `#` is removed only when whitespace precedes it, so `# Using C#` keeps
 * its text intact.
 *
 * Setext-style underline headings (`Foo\n===`) are not parsed — rare in modern
 * projects and supporting them isn't worth the parser complexity.
 *
 * @param lines           File content split on "\n" (a trailing "\r" is tolerated).
 * @param maxHeadingLevel Deepest level to return.
 * @returns Array of `{ level, text, line, lineIdx }`; `line` is 1-based, `lineIdx` 0-based.
 */
function extractHeadings(lines, maxHeadingLevel = MAX_HEADING_LEVEL) {
    const out = [];
    // Marker of the currently open fence; "" means we are not inside one.
    let fenceChar = "";
    let fenceLen = 0;
    for (let i = 0; i < lines.length; i++) {
        const line = lines[i];
        const fenceMatch = /^\s*(`{3,}|~{3,})/.exec(line);
        if (fenceMatch) {
            const marker = fenceMatch[1];
            const rest = line.slice(fenceMatch[0].length);
            if (fenceChar !== "") {
                // Inside a fence: this line is either the closer or plain content.
                if (marker[0] === fenceChar && marker.length >= fenceLen && rest.trim() === "") {
                    fenceChar = "";
                    fenceLen = 0;
                }
                continue;
            }
            // A backtick fence's info string may not contain backticks;
            // "```x``` prose" is inline code, not an opener.
            if (marker[0] !== "`" || !rest.includes("`")) {
                fenceChar = marker[0];
                fenceLen = marker.length;
                continue;
            }
        }
        if (fenceChar !== "")
            continue;
        const m = /^(#{1,6})\s+(.+?)(?:\s+#+)?\s*$/.exec(line);
        if (!m)
            continue;
        const level = m[1].length;
        if (level > maxHeadingLevel)
            continue;
        const text = m[2].trim();
        // `## ##` is an empty heading with a closing sequence — nothing to index.
        if (text.length === 0 || /^#+$/.test(text))
            continue;
        out.push({ level, text, line: i + 1, lineIdx: i });
    }
    return out;
}
|
|
233
|
+
/**
 * Capture roughly the first `bodyChars` characters of prose following a
 * heading: the first paragraph found before the next indexed heading.
 *
 * Fenced code blocks (backtick or tilde) are skipped — their contents add
 * noise to BM25 without helping the recall signal. A fence is closed only by
 * a line with the same character, at least as many of them, and nothing else.
 *
 * @param lines      File content split on "\n".
 * @param headingIdx 0-based line index of the heading whose body is wanted.
 * @param headings   Indexed headings of the file (objects with `lineIdx`), in file order.
 * @param bodyChars  Maximum length of the returned string.
 * @returns The paragraph with its lines joined by single spaces, truncated
 *          with "…" when too long; "" if no body is present.
 */
function readFollowingBody(lines, headingIdx, headings, bodyChars = BODY_CHARS) {
    const nextHeadingIdx = headings.find((h) => h.lineIdx > headingIdx)?.lineIdx ?? lines.length;
    const buf = [];
    let chars = 0;
    // Marker of the currently open fence; "" means we are not inside one.
    let fenceChar = "";
    let fenceLen = 0;
    for (let i = headingIdx + 1; i < nextHeadingIdx; i++) {
        const line = lines[i];
        const fenceMatch = /^\s*(`{3,}|~{3,})/.exec(line);
        if (fenceMatch) {
            const marker = fenceMatch[1];
            const rest = line.slice(fenceMatch[0].length);
            if (fenceChar !== "") {
                if (marker[0] === fenceChar && marker.length >= fenceLen && rest.trim() === "") {
                    fenceChar = "";
                    fenceLen = 0;
                }
                continue;
            }
            // "```x``` prose" is inline code, not a fence opener.
            if (marker[0] !== "`" || !rest.includes("`")) {
                fenceChar = marker[0];
                fenceLen = marker.length;
                continue;
            }
        }
        if (fenceChar !== "")
            continue;
        const trimmed = line.trim();
        if (trimmed.length === 0) {
            if (buf.length > 0)
                break; // we have one paragraph, stop
            continue;
        }
        buf.push(trimmed);
        chars += trimmed.length + 1; // +1 for the joining space
        if (chars >= bodyChars)
            break;
    }
    const joined = buf.join(" ");
    if (joined.length <= bodyChars)
        return joined;
    let cut = joined.slice(0, bodyChars - 1);
    // Don't leave half of a surrogate pair (e.g. a split emoji) at the end.
    const last = cut.charCodeAt(cut.length - 1);
    if (last >= 0xd800 && last <= 0xdbff)
        cut = cut.slice(0, -1);
    return cut + "…";
}
|
|
264
|
+
/**
 * Turn heading text into the slug used in a memory subject.
 *
 * The text is lower-cased, every run of characters outside `a-z0-9` becomes a
 * single dash, leading and trailing dashes are dropped, and the result is cut
 * to 40 characters. The same heading therefore always yields the same subject
 * across re-runs (a heading edit creates a new subject; the old one decays via
 * the existing pruning rules).
 */
function toSlug(s) {
    const lowered = s.toLowerCase();
    const dashed = lowered.replace(/[^a-z0-9]+/g, "-");
    const trimmed = dashed.replace(/^-+|-+$/g, "");
    return trimmed.slice(0, 40);
}
|
|
274
|
+
/**
 * True when `name` ends in `.md` or `.markdown` (case-insensitive).
 *
 * This is a pure extension check. The root README is excluded elsewhere: the
 * recursive walk only descends `<root>/docs/`, never the project root, and the
 * `ROOT_DOCS` list doesn't name README. A README inside docs/ — like
 * `docs/README.md`, the typical docs-index file — is fine to walk.
 */
function isMarkdown(name) {
    const extensionMatch = /\.(md|markdown)$/i.exec(name);
    return extensionMatch !== null;
}
|
|
282
|
+
/**
 * Resolve to true when `p` exists and `stat` reports a directory.
 * Any `stat` failure (missing path, permission error, …) resolves to false.
 */
async function isDirectory(p) {
    let info;
    try {
        info = await stat(p);
    }
    catch {
        return false;
    }
    return info.isDirectory();
}
|
|
291
|
+
/**
 * Resolve to true when `p` exists and `stat` reports a regular file.
 * Any `stat` failure (missing path, permission error, …) resolves to false.
 */
async function isFile(p) {
    let info;
    try {
        info = await stat(p);
    }
    catch {
        return false;
    }
    return info.isFile();
}
|
|
300
|
+
/* ── Design notes (kept here on purpose) ─────────────────────────────
|
|
301
|
+
*
|
|
302
|
+
* Things considered and rejected:
|
|
303
|
+
*
|
|
304
|
+
* - Walking *all* .md files under `<root>/`: too noisy in repos
|
|
305
|
+
* that vendor docs (e.g. translations of foreign-language READMEs,
|
|
306
|
+
* license texts under packages/). The convention is docs/, and
|
|
307
|
+
* ROOT_DOCS handles the rest.
|
|
308
|
+
*
|
|
309
|
+
* - Indexing the FULL prose of each section: blows up the memory
|
|
310
|
+
* store, and recall already has the path:line pointer for the
|
|
311
|
+
* agent to read the rest on demand. First paragraph is the right
|
|
312
|
+
* balance between "snippet alone often answers" and "doesn't
|
|
313
|
+
* duplicate the whole file."
|
|
314
|
+
*
|
|
315
|
+
* - Tracking heading hierarchy (e.g. "H2 Install > H3 Linux"):
|
|
316
|
+
* would be valuable but doubles the snippet length and changes
|
|
317
|
+
* the recall ranking in ways I don't want to assess without
|
|
318
|
+
* measurement. Punted.
|
|
319
|
+
*
|
|
320
|
+
* - Honouring Markdown links / TOCs: a real outline parser is
|
|
321
|
+
* ~200 lines and the cost-benefit is poor — the heading itself
|
|
322
|
+
* is the actionable token; the link target the agent can grep
|
|
323
|
+
* for if needed.
|
|
324
|
+
* ────────────────────────────────────────────────────────────────────
|
|
325
|
+
*/
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Git history ingestion — fully convention-agnostic.
|
|
3
|
+
*
|
|
4
|
+
* The earlier version classified commits by parsing the *subject
|
|
5
|
+
* line* (conventional commits, bracket tags, gitmoji, English
|
|
6
|
+
* keywords). That is unreliable on real repositories, many of which
|
|
7
|
+
* have no commit-message culture at all ("wip", "fix", ".", "update",
|
|
8
|
+
* non-English text, empty subjects). Message-derived "flavor" was
|
|
9
|
+
* noise dressed up as signal.
|
|
10
|
+
*
|
|
11
|
+
* This version derives everything from STRUCTURE — facts about what
|
|
12
|
+
* the commit physically did, which are true regardless of how (or
|
|
13
|
+
* whether) the author described it:
|
|
14
|
+
*
|
|
15
|
+
* - diff shape : files touched, lines +/-, files created/deleted,
|
|
16
|
+
* net direction. From `git log --numstat --summary`.
|
|
17
|
+
* - co-change : pairs of files modified in the same commit,
|
|
18
|
+
* counted across history (mechanical/huge commits
|
|
19
|
+
* skipped).
|
|
20
|
+
* - churn : how often each file changes — a stability signal.
|
|
21
|
+
* - recency : which files were touched in the most recent
|
|
22
|
+
* commits.
|
|
23
|
+
*
|
|
24
|
+
* The commit subject is still STORED, verbatim, inside the memory
|
|
25
|
+
* content — it is text the agent may legitimately search for — but it
|
|
26
|
+
* never drives tags or categorisation. It is data, not signal.
|
|
27
|
+
*
|
|
28
|
+
* Output is hard-capped by `gitHistoryDepth`. Re-running ingest is
|
|
29
|
+
* idempotent thanks to insertIfMissing on the repository.
|
|
30
|
+
*/
|
|
31
|
+
import type { MemoryRepository } from "../store/repository.js";
|
|
32
|
+
type FileStatus = "created" | "deleted" | "modified";
|
|
33
|
+
/** One file touched by a commit, as consumed by `isBalancedChurnCommit` and `deriveShapeTags`. */
interface FileChange {
|
|
34
|
+
/** Path of the file — presumably repo-relative as printed by git; confirm in git.js. */
path: string;
|
|
35
|
+
/** Added line count — presumably the numstat "added" column; confirm in git.js. */
added: number;
|
|
36
|
+
/** Deleted line count — presumably the numstat "deleted" column; confirm in git.js. */
deleted: number;
|
|
37
|
+
/** Whether the commit created, deleted or modified the file. */
status: FileStatus;
|
|
38
|
+
}
|
|
39
|
+
/** Counters returned by `ingestGitHistory`. */
export interface GitIngestResult {
|
|
40
|
+
/** Presumably the number of commits examined — confirm in git.js. */
scanned: number;
|
|
41
|
+
/** Presumably the number of per-commit memories submitted — confirm in git.js. */
commitMemories: number;
|
|
42
|
+
/** Presumably the number of co-change (file pair) memories submitted — confirm in git.js. */
coChangeMemories: number;
|
|
43
|
+
/** Presumably the number of churn (change frequency) memories submitted — confirm in git.js. */
churnMemories: number;
|
|
44
|
+
/** Presumably the number of recency memories submitted — confirm in git.js. */
recencyMemories: number;
|
|
45
|
+
/**
|
|
46
|
+
* Commits skipped for their own per-commit memory because they are
|
|
47
|
+
* balanced churn (additions ≈ deletions — content moved/reformatted,
|
|
48
|
+
* not created). They still feed co-change and churn signals.
|
|
49
|
+
*/
|
|
50
|
+
balancedChurnSkipped: number;
|
|
51
|
+
/** Distribution of structural shape tags across commit memories. */
|
|
52
|
+
shapeTagCounts: Record<string, number>;
|
|
53
|
+
}
|
|
54
|
+
/**
|
|
55
|
+
* True when a commit is "balanced churn": substantial, and its added
|
|
56
|
+
* line count is within ~8 % of its deleted count. That near-equality
|
|
57
|
+
* is the convention-free fingerprint of content being *moved or
|
|
58
|
+
* reformatted* rather than written — a file rename (with `--no-renames`
|
|
59
|
+
* a rename shows as +N to the new path, -N from the old), a `.rst`→
|
|
60
|
+
* `.md` doc migration, a reformat. Such commits flood keyword recall
|
|
61
|
+
* (they touch keyword-named files) while carrying no logic signal, so
|
|
62
|
+
* they get no per-commit memory — exactly as merge commits don't.
|
|
63
|
+
*
|
|
64
|
+
* Deliberately conservative: the ≥ 25-line floor spares small commits,
|
|
65
|
+
* and 92 % balance is tight enough that a genuine logic change (which
|
|
66
|
+
* almost never lands added ≈ deleted to within 8 %) is not caught.
|
|
67
|
+
* Pure arithmetic on the diff stat — no message parsing, no language
|
|
68
|
+
* or commit-convention assumptions.
|
|
69
|
+
*/
|
|
70
|
+
export declare function isBalancedChurnCommit(files: FileChange[]): boolean;
|
|
71
|
+
/**
 * Ingest git history for the repository at `root` into `repo`.
 *
 * @param repo  Memory store that receives the derived memories.
 * @param root  Repository root directory.
 * @param depth Presumably the history-depth cap the file header calls `gitHistoryDepth` — confirm in git.js.
 * @param coChangeMaxCommits     NOTE(review): looks like a bound on commits considered for co-change — confirm in git.js.
 * @param coChangeMinOccurrences NOTE(review): looks like the minimum pair count before a co-change memory is emitted — confirm in git.js.
 * @returns Counters describing what was scanned and emitted.
 */
export declare function ingestGitHistory(repo: MemoryRepository, root: string, depth: number, coChangeMaxCommits?: number, coChangeMinOccurrences?: number): Promise<GitIngestResult>;
|
|
72
|
+
/**
|
|
73
|
+
* Derive tags purely from what the commit physically did. Every tag
|
|
74
|
+
* here is a fact about the diff, not an interpretation of intent.
|
|
75
|
+
*/
|
|
76
|
+
export declare function deriveShapeTags(files: FileChange[]): string[];
|
|
77
|
+
export {};
|