opencode-diane 0.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +180 -0
- package/LICENSE +21 -0
- package/README.md +206 -0
- package/WIKI.md +1430 -0
- package/dist/index.d.ts +28 -0
- package/dist/index.js +1632 -0
- package/dist/ingest/adaptive.d.ts +47 -0
- package/dist/ingest/adaptive.js +182 -0
- package/dist/ingest/code-health.d.ts +58 -0
- package/dist/ingest/code-health.js +202 -0
- package/dist/ingest/code-map.d.ts +71 -0
- package/dist/ingest/code-map.js +670 -0
- package/dist/ingest/cross-refs.d.ts +59 -0
- package/dist/ingest/cross-refs.js +1207 -0
- package/dist/ingest/docs.d.ts +49 -0
- package/dist/ingest/docs.js +325 -0
- package/dist/ingest/git.d.ts +77 -0
- package/dist/ingest/git.js +390 -0
- package/dist/ingest/live-session.d.ts +101 -0
- package/dist/ingest/live-session.js +173 -0
- package/dist/ingest/project-notes.d.ts +28 -0
- package/dist/ingest/project-notes.js +102 -0
- package/dist/ingest/project.d.ts +35 -0
- package/dist/ingest/project.js +430 -0
- package/dist/ingest/session-snapshot.d.ts +63 -0
- package/dist/ingest/session-snapshot.js +94 -0
- package/dist/ingest/sessions.d.ts +29 -0
- package/dist/ingest/sessions.js +164 -0
- package/dist/ingest/tables.d.ts +52 -0
- package/dist/ingest/tables.js +360 -0
- package/dist/mining/skill-miner.d.ts +53 -0
- package/dist/mining/skill-miner.js +234 -0
- package/dist/search/bm25.d.ts +81 -0
- package/dist/search/bm25.js +334 -0
- package/dist/search/e5-embedder.d.ts +30 -0
- package/dist/search/e5-embedder.js +91 -0
- package/dist/search/embed-pass.d.ts +26 -0
- package/dist/search/embed-pass.js +43 -0
- package/dist/search/embedder.d.ts +58 -0
- package/dist/search/embedder.js +85 -0
- package/dist/search/inverted-index.d.ts +51 -0
- package/dist/search/inverted-index.js +139 -0
- package/dist/search/ppr.d.ts +44 -0
- package/dist/search/ppr.js +118 -0
- package/dist/search/tokenize.d.ts +26 -0
- package/dist/search/tokenize.js +98 -0
- package/dist/store/eviction.d.ts +16 -0
- package/dist/store/eviction.js +37 -0
- package/dist/store/repository.d.ts +222 -0
- package/dist/store/repository.js +420 -0
- package/dist/store/sqlite-store.d.ts +89 -0
- package/dist/store/sqlite-store.js +252 -0
- package/dist/store/vector-store.d.ts +66 -0
- package/dist/store/vector-store.js +160 -0
- package/dist/types.d.ts +385 -0
- package/dist/types.js +9 -0
- package/dist/utils/file-log.d.ts +87 -0
- package/dist/utils/file-log.js +215 -0
- package/dist/utils/peer-detection.d.ts +45 -0
- package/dist/utils/peer-detection.js +90 -0
- package/dist/utils/shell.d.ts +43 -0
- package/dist/utils/shell.js +110 -0
- package/dist/utils/usage-skill.d.ts +42 -0
- package/dist/utils/usage-skill.js +129 -0
- package/dist/utils/xlsx.d.ts +36 -0
- package/dist/utils/xlsx.js +270 -0
- package/grammars/tree-sitter-c.wasm +0 -0
- package/grammars/tree-sitter-c_sharp.wasm +0 -0
- package/grammars/tree-sitter-cpp.wasm +0 -0
- package/grammars/tree-sitter-css.wasm +0 -0
- package/grammars/tree-sitter-go.wasm +0 -0
- package/grammars/tree-sitter-html.wasm +0 -0
- package/grammars/tree-sitter-java.wasm +0 -0
- package/grammars/tree-sitter-javascript.wasm +0 -0
- package/grammars/tree-sitter-json.wasm +0 -0
- package/grammars/tree-sitter-php.wasm +0 -0
- package/grammars/tree-sitter-python.wasm +0 -0
- package/grammars/tree-sitter-rust.wasm +0 -0
- package/grammars/tree-sitter-typescript.wasm +0 -0
- package/package.json +80 -0
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Session snapshots — branchable, versioned "understanding" carried
|
|
3
|
+
* across sessions.
|
|
4
|
+
*
|
|
5
|
+
* The `session-trace` category already records what a *past* session
|
|
6
|
+
* physically did (files edited, commands run). A snapshot records
|
|
7
|
+
* something different and harder-won: the *understanding* a session
|
|
8
|
+
* built up — the mental model, the decisions made, the conventions
|
|
9
|
+
* learned — the stuff that is normally lost when a context window
|
|
10
|
+
* fills and compacts.
|
|
11
|
+
*
|
|
12
|
+
* This is the harness-side, no-model translation of the
|
|
13
|
+
* "contextual memory virtualisation" idea: instead of a DAG data
|
|
14
|
+
* structure, each snapshot is one pinned memory, and the parent link
|
|
15
|
+
* is just a `parent:<id>` tag. The set of snapshots and their parent
|
|
16
|
+
* tags *is* the DAG — readable, hand-editable, no new storage shape.
|
|
17
|
+
*
|
|
18
|
+
* - A later session resumes from the most recent snapshot.
|
|
19
|
+
* - A parallel session reads the same shared store, so it forks
|
|
20
|
+
* from the same point automatically.
|
|
21
|
+
* - Recording a new snapshot that tags an older one as `parent`
|
|
22
|
+
* is a branch.
|
|
23
|
+
*
|
|
24
|
+
* Snapshots are pinned, so the LFU disk-budget eviction never drops
|
|
25
|
+
* them — accumulated understanding outlives transient facts.
|
|
26
|
+
*/
|
|
27
|
+
const CATEGORY = "session-snapshot";
|
|
28
|
+
/**
 * Record (or replace) the snapshot for `sessionId`.
 *
 * The snapshot is written through `repo.upsertBySubject` under the subject
 * `snapshot:<sessionId>`, so a session's understanding is updated in place
 * rather than duplicated. Before writing, the newest snapshot belonging to
 * any *other* session is looked up; when one exists it is linked with a
 * `parent:<id>` tag — that link is what makes the snapshot set a
 * branchable history.
 *
 * @param repo       Memory repository (must provide `upsertBySubject`).
 * @param sessionId  Session that owns the snapshot.
 * @param input      `{ summary, decisions?, conventions? }`; entries are
 *                   trimmed and blank entries are ignored.
 * @returns `{ id, parentId }` — the stored memory's id and the id of the
 *          parent snapshot (or `null` when there is none).
 */
export function writeSnapshot(repo, sessionId, input) {
    const parentId = latestSnapshotId(repo, sessionId);
    // Trim and drop blanks *before* deciding whether a section exists, so an
    // all-blank list never leaves a dangling "Decisions: " label behind.
    const clean = (items) => Array.isArray(items)
        ? items.map((item) => String(item ?? "").trim()).filter(Boolean)
        : [];
    const decisions = clean(input?.decisions);
    const conventions = clean(input?.conventions);
    // A missing summary is tolerated (treated as empty) instead of throwing.
    const lines = [String(input?.summary ?? "").trim()];
    if (decisions.length > 0) {
        lines.push("Decisions: " + decisions.join(" | "));
    }
    if (conventions.length > 0) {
        lines.push("Conventions: " + conventions.join(" | "));
    }
    // Empty segments are skipped so the content never starts with ". ".
    const content = `Session understanding (${sessionId}): ` + lines.filter(Boolean).join(". ");
    const tags = ["session-snapshot", `session:${sessionId}`];
    if (parentId) {
        tags.push(`parent:${parentId}`);
    }
    // upsertBySubject → one snapshot per session, replace-in-place.
    const mem = repo.upsertBySubject({
        category: CATEGORY,
        subject: `snapshot:${sessionId}`,
        content,
        tags,
        source: `session:${sessionId}`,
        pinned: true, // accumulated understanding must outlive eviction
    });
    return { id: mem.id, parentId };
}
|
|
59
|
+
/**
 * Find the snapshot a session should resume from.
 *
 * Scans every memory in the repository and keeps the snapshot-category
 * entry with the greatest `createdAt`. The snapshot owned by
 * `excludeSessionId` (subject `snapshot:<excludeSessionId>`) is ignored so
 * a session never resumes from itself. On equal timestamps the entry seen
 * first wins.
 *
 * @returns the newest eligible snapshot memory, or `null` when none exists.
 */
export function latestSnapshot(repo, excludeSessionId) {
    const ownSubject = excludeSessionId ? `snapshot:${excludeSessionId}` : null;
    let newest = null;
    for (const memory of repo.allMemories()) {
        const eligible = memory.category === CATEGORY &&
            (ownSubject === null || memory.subject !== ownSubject);
        if (eligible && (newest === null || memory.createdAt > newest.createdAt)) {
            newest = memory;
        }
    }
    return newest;
}
|
|
76
|
+
/** Id of the snapshot `latestSnapshot` selects, or `null` when there is none. */
function latestSnapshotId(repo, excludeSessionId) {
    const snapshot = latestSnapshot(repo, excludeSessionId);
    return snapshot?.id ?? null;
}
|
|
79
|
+
/**
 * Compact snapshot statistics for status output and logs.
 *
 * @returns `{ count, latestAt }` — how many snapshot-category memories the
 *          repository holds and the largest `createdAt` among them
 *          (`null` when there are no snapshots).
 */
export function snapshotSummary(repo) {
    const tally = { count: 0, latestAt: null };
    for (const { category, createdAt } of repo.allMemories()) {
        if (category === CATEGORY) {
            tally.count += 1;
            if (tally.latestAt === null || createdAt > tally.latestAt) {
                tally.latestAt = createdAt;
            }
        }
    }
    return tally;
}
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Past-session ingestion.
|
|
3
|
+
*
|
|
4
|
+
* Pulls user-task + tool-trace summaries from previous OpenCode
|
|
5
|
+
* sessions in the same project, via the SDK client that the plugin
|
|
6
|
+
* receives in its context. Sessions live in OpenCode's own SQLite
|
|
7
|
+
* store; we read them through the documented client API rather than
|
|
8
|
+
* touching the DB file.
|
|
9
|
+
*
|
|
10
|
+
* Without an LLM, we extract two kinds of facts per session:
|
|
11
|
+
* 1) The user's first message ("the task").
|
|
12
|
+
* 2) The set of distinct file paths the agent edited/wrote and
|
|
13
|
+
* bash commands it ran ("the trace").
|
|
14
|
+
*
|
|
15
|
+
* One memory per (sessionId, kind) tuple. Re-ingesting the same
|
|
16
|
+
* session is idempotent thanks to insertIfMissing.
|
|
17
|
+
*
|
|
18
|
+
* Defensive: every SDK call is wrapped — different OpenCode versions
|
|
19
|
+
* expose slightly different methods (session.list / session.messages)
|
|
20
|
+
* and the plugin must keep working when one is absent.
|
|
21
|
+
*/
|
|
22
|
+
import type { MemoryRepository } from "../store/repository.js";
|
|
23
|
+
/** Counters and diagnostics reported by one `ingestSessions` pass. */
export interface SessionIngestResult {
    /** Number of past sessions visited during the pass. */
    sessions: number;
    /** Number of "task" memories (first user message) handed to the repository. */
    taskMemories: number;
    /** Number of "trace" memories (edited files / bash commands) handed to the repository. */
    traceMemories: number;
    /** Human-readable problems encountered, e.g. an unavailable SDK method. */
    errors: string[];
}
|
|
29
|
+
/**
 * Ingest task and trace memories from past sessions into `repo`.
 *
 * @param repo             Repository that receives the memories.
 * @param client           SDK client from the plugin context; typed `unknown`
 *                         because its shape is probed at runtime.
 * @param currentSessionId Optional id of the running session.
 * @returns Counters and errors for the pass.
 */
export declare function ingestSessions(repo: MemoryRepository, client: unknown, currentSessionId?: string): Promise<SessionIngestResult>;
|
|
@@ -0,0 +1,164 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Past-session ingestion.
|
|
3
|
+
*
|
|
4
|
+
* Pulls user-task + tool-trace summaries from previous OpenCode
|
|
5
|
+
* sessions in the same project, via the SDK client that the plugin
|
|
6
|
+
* receives in its context. Sessions live in OpenCode's own SQLite
|
|
7
|
+
* store; we read them through the documented client API rather than
|
|
8
|
+
* touching the DB file.
|
|
9
|
+
*
|
|
10
|
+
* Without an LLM, we extract two kinds of facts per session:
|
|
11
|
+
* 1) The user's first message ("the task").
|
|
12
|
+
* 2) The set of distinct file paths the agent edited/wrote and
|
|
13
|
+
* bash commands it ran ("the trace").
|
|
14
|
+
*
|
|
15
|
+
* One memory per (sessionId, kind) tuple. Re-ingesting the same
|
|
16
|
+
* session is idempotent thanks to insertIfMissing.
|
|
17
|
+
*
|
|
18
|
+
* Defensive: every SDK call is wrapped — different OpenCode versions
|
|
19
|
+
* expose slightly different methods (session.list / session.messages)
|
|
20
|
+
* and the plugin must keep working when one is absent.
|
|
21
|
+
*/
|
|
22
|
+
const CATEGORY = "session-trace";
|
|
23
|
+
/**
 * Ingest "task" and "trace" memories from past sessions.
 *
 * For every session returned by the SDK (except `currentSessionId` and
 * entries without an id) this stores, via `repo.insertIfMissing`:
 *   - `task:<id>`  — the text of the first user message, truncated;
 *   - `trace:<id>` — the summary produced by `summarizeTrace`, if any.
 * Finally the ingest timestamp for the category is recorded.
 *
 * @param repo             Repository providing `insertIfMissing` and `setIngestedAt`.
 * @param client           SDK client; probed defensively by the `safe*` helpers.
 * @param currentSessionId Session to skip (the one currently running).
 * @returns `{ sessions, taskMemories, traceMemories, errors }`. The memory
 *          counters count `insertIfMissing` calls; that call's return value
 *          is not consulted here.
 */
export async function ingestSessions(repo, client, currentSessionId) {
    const result = {
        sessions: 0,
        taskMemories: 0,
        traceMemories: 0,
        errors: [],
    };
    const sessions = await safeSessionList(client);
    if (!sessions) {
        result.errors.push("SDK session.list unavailable");
        return result;
    }
    // Messages may be flat (`{ role, ... }`) or wrapped as `{ info, parts }`
    // with the role under `info` — accept both so the task is not lost.
    const roleOf = (message) => message?.role ?? message?.info?.role;
    for (const s of sessions) {
        if (!s?.id || s.id === currentSessionId) {
            continue;
        }
        result.sessions += 1;
        const messages = await safeSessionMessages(client, s.id);
        if (!messages) {
            continue;
        }
        const firstUser = messages.find((m) => roleOf(m) === "user");
        if (firstUser) {
            const taskText = extractText(firstUser);
            if (taskText) {
                repo.insertIfMissing({
                    category: CATEGORY,
                    subject: `task:${s.id}`,
                    content: `Task in past session "${s.title ?? s.id}": ${truncate(taskText, 320)}`,
                    tags: ["task", `session:${s.id}`],
                    source: `session:${s.id}`,
                });
                result.taskMemories += 1;
            }
        }
        const trace = summarizeTrace(messages);
        if (trace) {
            repo.insertIfMissing({
                category: CATEGORY,
                subject: `trace:${s.id}`,
                content: trace,
                tags: ["trace", `session:${s.id}`],
                source: `session:${s.id}`,
            });
            result.traceMemories += 1;
        }
    }
    repo.setIngestedAt(CATEGORY, Date.now());
    return result;
}
|
|
71
|
+
/**
 * List sessions through the SDK without ever throwing.
 *
 * Accepts either a bare array response or an object carrying the array in
 * `data`. Returns `null` when `client.session.list` is absent, the call
 * fails, or the response has neither shape.
 */
async function safeSessionList(client) {
    const listFn = client?.session?.list;
    if (!listFn) {
        return null;
    }
    try {
        const payload = await client.session.list({});
        const rows = Array.isArray(payload) ? payload : payload?.data;
        return Array.isArray(rows) ? rows : null;
    }
    catch {
        return null;
    }
}
|
|
88
|
+
/**
 * Fetch one session's messages through the SDK without ever throwing.
 *
 * The session id is passed as `{ path: { id } }`. Accepts either a bare
 * array response or an object carrying the array in `data`. Returns `null`
 * when `client.session.messages` is absent, the call fails, or the
 * response has neither shape.
 */
async function safeSessionMessages(client, sessionId) {
    const messagesFn = client?.session?.messages;
    if (!messagesFn) {
        return null;
    }
    try {
        const payload = await client.session.messages({ path: { id: sessionId } });
        const rows = Array.isArray(payload) ? payload : payload?.data;
        return Array.isArray(rows) ? rows : null;
    }
    catch {
        return null;
    }
}
|
|
105
|
+
/* ─── trace extraction ─────────────────────────────────────────────── */
|
|
106
|
+
/**
 * Plain text of a message.
 *
 * A string `content` wins. Otherwise the string entries of `parts` and the
 * string `text` fields of object entries are joined with single spaces.
 * Returns "" when the message has neither.
 */
function extractText(m) {
    const { content, parts } = m;
    if (typeof content === "string") {
        return content;
    }
    if (!Array.isArray(parts)) {
        return "";
    }
    const pieces = parts.flatMap((part) => {
        if (typeof part === "string") {
            return [part];
        }
        const hasText = Boolean(part) && typeof part === "object" && typeof part.text === "string";
        return hasText ? [part.text] : [];
    });
    return pieces.join(" ");
}
|
|
123
|
+
/**
 * Summarise what a session physically did, from its message parts.
 *
 * Collects the distinct file paths passed to `edit` / `write` /
 * `multiedit` / `create` tool parts and the commands passed to `bash`
 * tool parts, then renders up to 12 files and 6 commands as one line.
 *
 * Tool arguments are looked up in `args`, then `input`, then
 * `state.input`, so both flat tool parts and parts that nest their input
 * under a state object are understood. Non-string paths and commands are
 * ignored rather than crashing the summary.
 *
 * @param messages Array of messages; entries without a `parts` array are skipped.
 * @returns the summary line, or `null` when nothing was edited or run.
 */
function summarizeTrace(messages) {
    const EDIT_TOOLS = new Set(["edit", "write", "multiedit", "create"]);
    const files = new Set();
    const bashCmds = [];
    for (const m of messages) {
        if (!Array.isArray(m?.parts)) {
            continue;
        }
        for (const p of m.parts) {
            if (!p || typeof p !== "object") {
                continue;
            }
            const toolName = p.tool ?? p.metadata?.tool ?? "";
            const args = p.args ?? p.input ?? p.state?.input ?? {};
            if (EDIT_TOOLS.has(toolName)) {
                const fp = args.filePath ?? args.path ?? args.file_path;
                // Only real, non-empty path strings belong in the summary.
                if (typeof fp === "string" && fp) {
                    files.add(fp);
                }
            }
            else if (toolName === "bash") {
                const cmd = args.command;
                // `truncate` needs a string; skip anything else.
                if (typeof cmd === "string" && cmd) {
                    bashCmds.push(truncate(cmd, 80));
                }
            }
        }
    }
    if (files.size === 0 && bashCmds.length === 0) {
        return null;
    }
    const parts = [];
    if (files.size > 0) {
        parts.push(`edited files: ${Array.from(files).slice(0, 12).join(", ")}`);
    }
    if (bashCmds.length > 0) {
        parts.push(`bash commands: ${bashCmds.slice(0, 6).join(" | ")}`);
    }
    return parts.join(". ");
}
|
|
162
|
+
/** Shorten `s` to at most `n` characters, ending with "…" when it was cut. */
function truncate(s, n) {
    if (s.length > n) {
        return `${s.slice(0, n - 1)}…`;
    }
    return s;
}
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* tables.ts — ingest the column headers of tabular files.
|
|
3
|
+
*
|
|
4
|
+
* The premise: data files in a repo have structural value (the column
|
|
5
|
+
* names tell the agent what's in the table — "id, email, signup_date,
|
|
6
|
+
* plan_tier" is enough to know `users.csv` is the user table) without
|
|
7
|
+
* the row data being useful for recall (a million rows of values
|
|
8
|
+
* would just bloat the BM25 index).
|
|
9
|
+
*
|
|
10
|
+
* Header-only ingestion is the right slice: high signal, bounded
|
|
11
|
+
* cost, and the agent can always read the full file on demand via
|
|
12
|
+
* OpenCode's `read` tool when it actually needs row data.
|
|
13
|
+
*
|
|
14
|
+
* **Scope:**
|
|
15
|
+
* - `.csv` and `.tsv` — first-line parse, no dependency, never loads
|
|
16
|
+
* more than the first 64 KB of the file.
|
|
17
|
+
* - `.xlsx`, `.xls`, `.xlsm` — handled via SheetJS (the `xlsx` npm
|
|
18
|
+
* package), **lazily imported** only when a spreadsheet is
|
|
19
|
+
* actually encountered, so repos with no spreadsheets never pay
|
|
20
|
+
* the ~5 MB module-load cost. Each sheet becomes its own memory.
|
|
21
|
+
* - Walks the project tree with a generous file cap and the same
|
|
22
|
+
* SKIP_DIRS the other ingesters use.
|
|
23
|
+
*
|
|
24
|
+
* **CSV parsing.** A small inline parser handles quoted fields,
|
|
25
|
+
* embedded commas, escaped quotes, and CRLF line endings. Pulling in
|
|
26
|
+
* a CSV dep for this is not justified.
|
|
27
|
+
*
|
|
28
|
+
* **XLSX safety.** SheetJS is invoked with macros, formulas, and
|
|
29
|
+
* styles disabled — we only need cell values from row 1 of each
|
|
30
|
+
* sheet, nothing else. This significantly reduces the surface a
|
|
31
|
+
* hostile workbook could present.
|
|
32
|
+
*/
|
|
33
|
+
import type { MemoryRepository } from "../store/repository.js";
|
|
34
|
+
/** Optional limits for `ingestTableHeaders`; each falls back to a built-in default. */
export interface TablesIngestOptions {
    /** Maximum number of tabular files to ingest in one pass. */
    maxFiles?: number;
    /** Largest spreadsheet, in megabytes, that will be opened. */
    maxXlsxMB?: number;
    /** Maximum number of column names listed per table before the rest are summarised. */
    maxColumns?: number;
}
|
|
39
|
+
/** Outcome of one `ingestTableHeaders` pass. */
export interface TablesIngestResult {
    /** Number of tabular files whose headers were ingested. */
    filesFound: number;
    /**
     * Formats this pass actually covered. CSV/TSV are always listed.
     * XLSX/XLS are listed only when at least one spreadsheet was found
     * AND SheetJS loaded successfully; when the dependency cannot be
     * loaded, spreadsheets are skipped and `xlsxUnavailableReason` is set
     * instead.
     */
    formatsSupported: ReadonlyArray<string>;
    /** Present when a spreadsheet was found but SheetJS could not be loaded. */
    xlsxUnavailableReason?: string;
}
|
|
52
|
+
/**
 * Walk the tree under `root` and store one memory per table header found
 * (one per sheet for spreadsheets).
 *
 * @param repo Repository that receives the memories.
 * @param root Directory to walk.
 * @param opts Optional limits; see `TablesIngestOptions`.
 * @returns File count, formats covered, and the SheetJS load failure reason if any.
 */
export declare function ingestTableHeaders(repo: MemoryRepository, root: string, opts?: TablesIngestOptions): Promise<TablesIngestResult>;
|
|
@@ -0,0 +1,360 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* tables.ts — ingest the column headers of tabular files.
|
|
3
|
+
*
|
|
4
|
+
* The premise: data files in a repo have structural value (the column
|
|
5
|
+
* names tell the agent what's in the table — "id, email, signup_date,
|
|
6
|
+
* plan_tier" is enough to know `users.csv` is the user table) without
|
|
7
|
+
* the row data being useful for recall (a million rows of values
|
|
8
|
+
* would just bloat the BM25 index).
|
|
9
|
+
*
|
|
10
|
+
* Header-only ingestion is the right slice: high signal, bounded
|
|
11
|
+
* cost, and the agent can always read the full file on demand via
|
|
12
|
+
* OpenCode's `read` tool when it actually needs row data.
|
|
13
|
+
*
|
|
14
|
+
* **Scope:**
|
|
15
|
+
* - `.csv` and `.tsv` — first-line parse, no dependency, never loads
|
|
16
|
+
* more than the first 64 KB of the file.
|
|
17
|
+
* - `.xlsx`, `.xls`, `.xlsm` — handled via SheetJS (the `xlsx` npm
|
|
18
|
+
* package), **lazily imported** only when a spreadsheet is
|
|
19
|
+
* actually encountered, so repos with no spreadsheets never pay
|
|
20
|
+
* the ~5 MB module-load cost. Each sheet becomes its own memory.
|
|
21
|
+
* - Walks the project tree with a generous file cap and the same
|
|
22
|
+
* SKIP_DIRS the other ingesters use.
|
|
23
|
+
*
|
|
24
|
+
* **CSV parsing.** A small inline parser handles quoted fields,
|
|
25
|
+
* embedded commas, escaped quotes, and CRLF line endings. Pulling in
|
|
26
|
+
* a CSV dep for this is not justified.
|
|
27
|
+
*
|
|
28
|
+
* **XLSX safety.** SheetJS is invoked with macros, formulas, and
|
|
29
|
+
* styles disabled — we only need cell values from row 1 of each
|
|
30
|
+
* sheet, nothing else. This significantly reduces the surface a
|
|
31
|
+
* hostile workbook could present.
|
|
32
|
+
*/
|
|
33
|
+
import { readdir, open } from "node:fs/promises";
|
|
34
|
+
import { join, relative, sep, extname, basename } from "node:path";
|
|
35
|
+
const CATEGORY = "project-facts";
|
|
36
|
+
const SKIP_DIRS = new Set([
|
|
37
|
+
"node_modules",
|
|
38
|
+
".git",
|
|
39
|
+
"dist",
|
|
40
|
+
"build",
|
|
41
|
+
"out",
|
|
42
|
+
"target",
|
|
43
|
+
".next",
|
|
44
|
+
"coverage",
|
|
45
|
+
".cache",
|
|
46
|
+
"vendor",
|
|
47
|
+
]);
|
|
48
|
+
const MAX_FILES = 200;
|
|
49
|
+
const FIRST_LINE_READ_BYTES = 64 * 1024;
|
|
50
|
+
const MAX_XLSX_BYTES = 50 * 1024 * 1024;
|
|
51
|
+
const MAX_COLUMNS_TO_LIST = 40;
|
|
52
|
+
const MAX_SHEETS_PER_WORKBOOK = 20;
|
|
53
|
+
const CSV_EXTS = new Set([".csv", ".tsv"]);
|
|
54
|
+
const XLSX_EXTS = new Set([".xlsx", ".xls", ".xlsm"]);
|
|
55
|
+
/**
 * Walk the tree under `root` and ingest the header row of every tabular
 * file found, up to the file cap.
 *
 * - `.csv` / `.tsv`: the first line is parsed and one memory is emitted.
 * - `.xlsx` / `.xls` / `.xlsm`: SheetJS is loaded lazily on the first
 *   spreadsheet (the promise is cached for the rest of the pass) and one
 *   memory is emitted per sheet. If SheetJS cannot be loaded, every
 *   spreadsheet is skipped and the reason is reported once.
 *
 * Dot-entries and the directories in `SKIP_DIRS` are not visited.
 *
 * @param repo Repository that receives the memories.
 * @param root Directory to walk.
 * @param opts Optional `{ maxFiles, maxXlsxMB, maxColumns }`. Values that
 *             are missing or not a number fall back to the defaults.
 * @returns `{ filesFound, formatsSupported, xlsxUnavailableReason? }`.
 */
export async function ingestTableHeaders(repo, root, opts = {}) {
    const options = opts ?? {};
    // A NaN limit (e.g. from a failed parseInt upstream) would otherwise
    // poison the comparisons below: NaN maxFiles stops the walk before it
    // starts, NaN maxXlsxMB disables the size guard. Fall back instead.
    const numberOr = (value, fallback) => {
        const n = Number(value ?? fallback);
        return Number.isNaN(n) ? fallback : n;
    };
    const maxFilesLimit = Math.max(1, Math.round(numberOr(options.maxFiles, MAX_FILES)));
    const maxXlsxBytes = Math.max(0, numberOr(options.maxXlsxMB, MAX_XLSX_BYTES / (1024 * 1024))) * 1024 * 1024;
    const maxColumnsLimit = Math.max(1, Math.round(numberOr(options.maxColumns, MAX_COLUMNS_TO_LIST)));
    let filesFound = 0;
    let sawSpreadsheet = false;
    let xlsxLoader = null;
    const formats = new Set(["csv", "tsv"]);
    let xlsxUnavailableReason;
    const stack = [root];
    while (stack.length > 0 && filesFound < maxFilesLimit) {
        const dir = stack.pop();
        let entries;
        try {
            entries = await readdir(dir, { withFileTypes: true });
        }
        catch {
            // Unreadable directory: skip it, keep walking.
            continue;
        }
        for (const e of entries) {
            if (e.name.startsWith(".")) {
                continue;
            }
            if (e.isDirectory()) {
                if (!SKIP_DIRS.has(e.name)) {
                    stack.push(join(dir, e.name));
                }
                continue;
            }
            if (!e.isFile()) {
                continue;
            }
            const ext = extname(e.name).toLowerCase();
            const abs = join(dir, e.name);
            // Memories always use forward slashes, whatever the platform.
            const rel = relative(root, abs).split(sep).join("/");
            if (CSV_EXTS.has(ext)) {
                const columns = await readHeaderColumns(abs, ext === ".tsv" ? "\t" : ",");
                if (columns === null) {
                    continue;
                }
                filesFound += 1;
                emit(repo, rel, ext.slice(1).toUpperCase(), null, columns, maxColumnsLimit);
            }
            else if (XLSX_EXTS.has(ext)) {
                sawSpreadsheet = true;
                // First spreadsheet seen: lazy-import SheetJS. The promise is
                // cached so subsequent files reuse the loaded module.
                if (!xlsxLoader) {
                    xlsxLoader = loadXlsx();
                }
                const xlsx = await xlsxLoader;
                if ("error" in xlsx) {
                    // SheetJS missing or failed to load — skip ALL
                    // spreadsheets for this pass and surface the reason once.
                    if (!xlsxUnavailableReason) {
                        xlsxUnavailableReason = xlsx.error;
                    }
                    continue;
                }
                const sheets = await readXlsxSheets(xlsx, abs, maxXlsxBytes);
                if (sheets === null) {
                    continue;
                }
                filesFound += 1;
                for (const s of sheets) {
                    emit(repo, rel, ext.slice(1).toUpperCase(), s.sheetName, s.columns, maxColumnsLimit);
                }
            }
            else {
                continue;
            }
            if (filesFound >= maxFilesLimit) {
                break;
            }
        }
    }
    if (sawSpreadsheet && !xlsxUnavailableReason) {
        formats.add("xlsx");
        formats.add("xls");
    }
    return {
        filesFound,
        formatsSupported: Array.from(formats),
        ...(xlsxUnavailableReason ? { xlsxUnavailableReason } : {}),
    };
}
|
|
135
|
+
/**
 * Store one table-schema memory through `repo.insertIfMissing`.
 *
 * @param repo       Repository that receives the memory.
 * @param rel        File path relative to the walk root.
 * @param format     Upper-case format label, e.g. "CSV" or "XLSX".
 * @param sheetName  Sheet name for spreadsheets, `null` for CSV/TSV.
 * @param columns    Header column names.
 * @param maxColumns How many column names to list before summarising the
 *                   remainder as "… (N more)".
 */
function emit(repo, rel, format, sheetName, columns, maxColumns) {
    const shown = columns.length > maxColumns
        ? columns.slice(0, maxColumns).join(", ") + `, … (${columns.length - maxColumns} more)`
        : columns.join(", ");
    // Slug of the file's base name. Edge dashes are stripped (as
    // slugifySheet does) and a name with no ASCII letters or digits yields
    // no tag at all, instead of a "-" or empty-string tag.
    const fileTag = basename(rel, extname(rel))
        .toLowerCase()
        .replace(/[^a-z0-9]+/g, "-")
        .replace(/^-+|-+$/g, "");
    // Single-table files (CSV/TSV) → `table:<path>`. Spreadsheets →
    // `table:<path>#<sheet>` so multi-sheet workbooks become multiple
    // memories with distinct subjects.
    const subject = sheetName ? `table:${rel}#${slugifySheet(sheetName)}` : `table:${rel}`;
    const sheetSuffix = sheetName ? ` sheet "${sheetName}"` : "";
    repo.insertIfMissing({
        category: CATEGORY,
        subject,
        content: `${rel}${sheetSuffix} (${format}, ${columns.length} columns): ${shown}. ` +
            `Read the file directly with OpenCode's read tool for row data.`,
        tags: [
            "table",
            "schema",
            format.toLowerCase(),
            ...(fileTag ? [fileTag] : []),
            ...(sheetName ? ["sheet"] : []),
        ],
        source: "tables-ingest",
    });
}
|
|
154
|
+
/**
 * Turn a sheet name into a subject-safe slug: lower-case, runs of
 * non-alphanumerics collapsed to "-", edge dashes stripped, capped at 40
 * characters. Falls back to "sheet" when nothing is left.
 */
function slugifySheet(name) {
    const dashed = name.toLowerCase().replace(/[^a-z0-9]+/g, "-");
    const stripped = dashed.replace(/^-+|-+$/g, "");
    const capped = stripped.slice(0, 40);
    return capped || "sheet";
}
|
|
157
|
+
/* ─── CSV / TSV path ────────────────────────────────────────────────── */
|
|
158
|
+
/**
 * Read the header row of a delimited text file.
 *
 * Only the first `FIRST_LINE_READ_BYTES` bytes are read. The file is
 * rejected (returns `null`) when it cannot be opened, is empty, contains a
 * NUL character in the bytes read (treated as binary), has an empty first
 * line, has any raw column longer than 200 characters, or has fewer than
 * two non-blank column names.
 *
 * @param abs       Absolute path of the file.
 * @param delimiter Field delimiter ("," or "\t").
 * @returns trimmed, non-blank column names, or `null` when the file does
 *          not look like a table.
 */
async function readHeaderColumns(abs, delimiter) {
    let handle;
    try {
        handle = await open(abs, "r");
    }
    catch {
        return null;
    }
    try {
        const s = await handle.stat();
        if (!s.isFile() || s.size === 0) {
            return null;
        }
        const bytesToRead = Math.min(s.size, FIRST_LINE_READ_BYTES);
        const buf = Buffer.alloc(bytesToRead);
        const { bytesRead } = await handle.read(buf, 0, bytesToRead, 0);
        if (bytesRead === 0) {
            return null;
        }
        const text = buf.subarray(0, bytesRead).toString("utf-8");
        // A NUL character means this is not a text file.
        if (text.includes("\0")) {
            return null;
        }
        const firstLineEnd = findLineTerminator(text);
        const firstLine = firstLineEnd === -1 ? text : text.slice(0, firstLineEnd);
        if (firstLine.length === 0) {
            return null;
        }
        const cols = parseDelimitedLine(firstLine, delimiter);
        // Very long "column names" indicate prose or data, not a header.
        if (cols.some((c) => c.length > 200)) {
            return null;
        }
        // Count columns AFTER dropping blanks, so a header such as ",,"
        // is rejected instead of being emitted as a zero-column table.
        const named = cols.map((c) => c.trim()).filter((c) => c.length > 0);
        return named.length < 2 ? null : named;
    }
    finally {
        await handle.close();
    }
}
|
|
193
|
+
/** Index of the first LF or CR in `s`, or -1 when there is none. */
function findLineTerminator(s) {
    return s.search(/[\n\r]/);
}
|
|
201
|
+
/**
 * Split one line of delimited text into fields.
 *
 * A double quote switches into quoted mode, where the delimiter is
 * literal and `""` stands for one quote character; the next lone quote
 * switches back. An unterminated quote runs to the end of the line.
 * Always returns at least one field.
 */
function parseDelimitedLine(line, delimiter) {
    const fields = [];
    let field = "";
    let quoted = false;
    for (let pos = 0; pos < line.length; pos++) {
        const ch = line[pos];
        if (quoted) {
            if (ch !== '"') {
                field += ch;
            }
            else if (line[pos + 1] === '"') {
                // Escaped quote: emit one and skip its twin.
                field += '"';
                pos++;
            }
            else {
                quoted = false;
            }
        }
        else if (ch === '"') {
            quoted = true;
        }
        else if (ch === delimiter) {
            fields.push(field);
            field = "";
        }
        else {
            field += ch;
        }
    }
    fields.push(field);
    return fields;
}
|
|
240
|
+
/**
 * Lazily import SheetJS (the `xlsx` package).
 *
 * The dynamic import keeps the module-load cost away from repos that
 * contain no spreadsheets; the caller caches the returned promise so the
 * import happens at most once per pass.
 *
 * Never rejects: when the package is missing or fails to load, the result
 * is an object with an `error` message and the caller skips spreadsheets
 * instead of failing the whole ingest.
 */
async function loadXlsx() {
    try {
        // eslint-disable-next-line @typescript-eslint/no-explicit-any
        const loaded = await import("xlsx");
        // Prefer the default export when the module provides one.
        return loaded.default ?? loaded;
    }
    catch (cause) {
        const detail = cause instanceof Error ? cause.message : String(cause);
        return { error: `xlsx (SheetJS) could not be loaded: ${detail}` };
    }
}
|
|
260
|
+
/**
 * Open a workbook and pull the header row of every sheet.
 *
 * The file is read once, through the same handle that was used to check
 * its size, and handed to SheetJS as a buffer (`xlsx.read`). This avoids
 * a stat-then-reopen race, avoids synchronous file I/O, and does not
 * depend on SheetJS having file-system access wired in. If the supplied
 * object has no `read` function, `xlsx.readFile(abs, …)` is used instead.
 *
 * Read options disable formulas, HTML, number formats, styles, VBA and
 * document properties — only cell values are needed.
 *
 * At most `MAX_SHEETS_PER_WORKBOOK` sheets are inspected; sheets with
 * fewer than two header columns are dropped.
 *
 * @param xlsx     Loaded SheetJS module.
 * @param abs      Absolute path of the workbook.
 * @param maxBytes Largest file size that will be read.
 * @returns `[{ sheetName, columns }]`, or `null` on any failure (missing
 *          file, oversized, parse error, no usable sheet) — a problematic
 *          file is skipped, never fatal.
 */
async function readXlsxSheets(xlsx, abs, maxBytes = MAX_XLSX_BYTES) {
    let handle;
    try {
        handle = await open(abs, "r");
    }
    catch {
        return null;
    }
    // Minimum-surface read: we only want cell values from the header row.
    const readOptions = {
        cellFormula: false,
        cellHTML: false,
        cellNF: false,
        cellStyles: false,
        cellText: false,
        cellDates: false,
        bookVBA: false,
        bookFiles: false,
        bookProps: false,
        bookSheets: false,
    };
    let wb;
    try {
        const s = await handle.stat();
        if (!s.isFile() || s.size === 0 || s.size > maxBytes) {
            return null;
        }
        if (typeof xlsx.read === "function") {
            const data = await handle.readFile();
            // The file may have grown since stat(); re-check the bytes read.
            if (data.length === 0 || data.length > maxBytes) {
                return null;
            }
            wb = xlsx.read(data, { ...readOptions, type: "buffer" });
        }
        else {
            wb = xlsx.readFile(abs, readOptions);
        }
    }
    catch {
        return null;
    }
    finally {
        await handle.close();
    }
    const sheets = [];
    const names = Array.isArray(wb?.SheetNames) ? wb.SheetNames.slice(0, MAX_SHEETS_PER_WORKBOOK) : [];
    for (const name of names) {
        const ws = wb.Sheets?.[name];
        if (!ws) {
            continue;
        }
        const cols = extractHeaderRowFromSheet(xlsx, ws);
        if (cols.length < 2) {
            continue;
        }
        sheets.push({ sheetName: String(name), columns: cols });
    }
    return sheets.length > 0 ? sheets : null;
}
|
|
319
|
+
/**
 * Header cells of one worksheet.
 *
 * Walks the first row of the sheet's declared range (`!ref`) and reads
 * each cell directly by address instead of converting the sheet, so rows
 * below the header are never touched. At most 200 cells are inspected.
 * Values are stringified and trimmed; blank cells are omitted.
 *
 * @returns the non-blank header texts, or `[]` when the sheet has no
 *          usable range.
 */
function extractHeaderRowFromSheet(xlsx, ws) {
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    const sheet = ws;
    const ref = sheet["!ref"];
    if (typeof ref !== "string" || ref === "") {
        return [];
    }
    let range;
    try {
        range = xlsx.utils.decode_range(ref);
    }
    catch {
        return [];
    }
    if (!(range && range.s && range.e)) {
        return [];
    }
    const headerRow = range.s.r;
    // Inspect no more than 200 cells, however wide the declared range is.
    const lastCol = Math.min(range.e.c, range.s.c + 199);
    const headers = [];
    for (let col = range.s.c; col <= lastCol; col++) {
        const value = sheet[xlsx.utils.encode_cell({ r: headerRow, c: col })]?.v;
        const text = value === undefined || value === null ? "" : String(value).trim();
        if (text.length > 0) {
            headers.push(text);
        }
    }
    return headers;
}
|