opencode-diane 0.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80) hide show
  1. package/CHANGELOG.md +180 -0
  2. package/LICENSE +21 -0
  3. package/README.md +206 -0
  4. package/WIKI.md +1430 -0
  5. package/dist/index.d.ts +28 -0
  6. package/dist/index.js +1632 -0
  7. package/dist/ingest/adaptive.d.ts +47 -0
  8. package/dist/ingest/adaptive.js +182 -0
  9. package/dist/ingest/code-health.d.ts +58 -0
  10. package/dist/ingest/code-health.js +202 -0
  11. package/dist/ingest/code-map.d.ts +71 -0
  12. package/dist/ingest/code-map.js +670 -0
  13. package/dist/ingest/cross-refs.d.ts +59 -0
  14. package/dist/ingest/cross-refs.js +1207 -0
  15. package/dist/ingest/docs.d.ts +49 -0
  16. package/dist/ingest/docs.js +325 -0
  17. package/dist/ingest/git.d.ts +77 -0
  18. package/dist/ingest/git.js +390 -0
  19. package/dist/ingest/live-session.d.ts +101 -0
  20. package/dist/ingest/live-session.js +173 -0
  21. package/dist/ingest/project-notes.d.ts +28 -0
  22. package/dist/ingest/project-notes.js +102 -0
  23. package/dist/ingest/project.d.ts +35 -0
  24. package/dist/ingest/project.js +430 -0
  25. package/dist/ingest/session-snapshot.d.ts +63 -0
  26. package/dist/ingest/session-snapshot.js +94 -0
  27. package/dist/ingest/sessions.d.ts +29 -0
  28. package/dist/ingest/sessions.js +164 -0
  29. package/dist/ingest/tables.d.ts +52 -0
  30. package/dist/ingest/tables.js +360 -0
  31. package/dist/mining/skill-miner.d.ts +53 -0
  32. package/dist/mining/skill-miner.js +234 -0
  33. package/dist/search/bm25.d.ts +81 -0
  34. package/dist/search/bm25.js +334 -0
  35. package/dist/search/e5-embedder.d.ts +30 -0
  36. package/dist/search/e5-embedder.js +91 -0
  37. package/dist/search/embed-pass.d.ts +26 -0
  38. package/dist/search/embed-pass.js +43 -0
  39. package/dist/search/embedder.d.ts +58 -0
  40. package/dist/search/embedder.js +85 -0
  41. package/dist/search/inverted-index.d.ts +51 -0
  42. package/dist/search/inverted-index.js +139 -0
  43. package/dist/search/ppr.d.ts +44 -0
  44. package/dist/search/ppr.js +118 -0
  45. package/dist/search/tokenize.d.ts +26 -0
  46. package/dist/search/tokenize.js +98 -0
  47. package/dist/store/eviction.d.ts +16 -0
  48. package/dist/store/eviction.js +37 -0
  49. package/dist/store/repository.d.ts +222 -0
  50. package/dist/store/repository.js +420 -0
  51. package/dist/store/sqlite-store.d.ts +89 -0
  52. package/dist/store/sqlite-store.js +252 -0
  53. package/dist/store/vector-store.d.ts +66 -0
  54. package/dist/store/vector-store.js +160 -0
  55. package/dist/types.d.ts +385 -0
  56. package/dist/types.js +9 -0
  57. package/dist/utils/file-log.d.ts +87 -0
  58. package/dist/utils/file-log.js +215 -0
  59. package/dist/utils/peer-detection.d.ts +45 -0
  60. package/dist/utils/peer-detection.js +90 -0
  61. package/dist/utils/shell.d.ts +43 -0
  62. package/dist/utils/shell.js +110 -0
  63. package/dist/utils/usage-skill.d.ts +42 -0
  64. package/dist/utils/usage-skill.js +129 -0
  65. package/dist/utils/xlsx.d.ts +36 -0
  66. package/dist/utils/xlsx.js +270 -0
  67. package/grammars/tree-sitter-c.wasm +0 -0
  68. package/grammars/tree-sitter-c_sharp.wasm +0 -0
  69. package/grammars/tree-sitter-cpp.wasm +0 -0
  70. package/grammars/tree-sitter-css.wasm +0 -0
  71. package/grammars/tree-sitter-go.wasm +0 -0
  72. package/grammars/tree-sitter-html.wasm +0 -0
  73. package/grammars/tree-sitter-java.wasm +0 -0
  74. package/grammars/tree-sitter-javascript.wasm +0 -0
  75. package/grammars/tree-sitter-json.wasm +0 -0
  76. package/grammars/tree-sitter-php.wasm +0 -0
  77. package/grammars/tree-sitter-python.wasm +0 -0
  78. package/grammars/tree-sitter-rust.wasm +0 -0
  79. package/grammars/tree-sitter-typescript.wasm +0 -0
  80. package/package.json +80 -0
@@ -0,0 +1,94 @@
1
+ /**
2
+ * Session snapshots — branchable, versioned "understanding" carried
3
+ * across sessions.
4
+ *
5
+ * The `session-trace` category already records what a *past* session
6
+ * physically did (files edited, commands run). A snapshot records
7
+ * something different and harder-won: the *understanding* a session
8
+ * built up — the mental model, the decisions made, the conventions
9
+ * learned — the stuff that is normally lost when a context window
10
+ * fills and compacts.
11
+ *
12
+ * This is the harness-side, no-model translation of the
13
+ * "contextual memory virtualisation" idea: instead of a DAG data
14
+ * structure, each snapshot is one pinned memory, and the parent link
15
+ * is just a `parent:<id>` tag. The set of snapshots and their parent
16
+ * tags *is* the DAG — readable, hand-editable, no new storage shape.
17
+ *
18
+ * - A later session resumes from the most recent snapshot.
19
+ * - A parallel session reads the same shared store, so it forks
20
+ * from the same point automatically.
21
+ * - Recording a new snapshot that tags an older one as `parent`
22
+ * is a branch.
23
+ *
24
+ * Snapshots are pinned, so the LFU disk-budget eviction never drops
25
+ * them — accumulated understanding outlives transient facts.
26
+ */
27
+ const CATEGORY = "session-snapshot"; // memory category used for every snapshot written/read in this module
28
/**
 * Record (or update in place) the snapshot for `sessionId`.
 *
 * The snapshot is stored as one pinned memory whose subject is
 * `snapshot:<sessionId>`, so writing again for the same session
 * replaces the earlier text instead of adding a second memory.
 *
 * Parent link: the first time a session is recorded, the newest
 * snapshot belonging to any *other* session becomes its `parent:<id>`
 * tag. When the session already has a snapshot, the parent it was
 * given originally is kept. Re-deriving the parent on every update
 * would let two sessions that alternate writes end up as each other's
 * parent, which breaks the "snapshots + parent tags form a DAG"
 * property described at the top of this file.
 *
 * @param repo       Memory repository; `allMemories()` and
 *                   `upsertBySubject()` are the only methods used.
 * @param sessionId  Id of the session being recorded.
 * @param input      `{ summary, decisions?, conventions? }`. Blank
 *                   entries are dropped; a label is only emitted when
 *                   at least one non-blank entry remains.
 * @returns `{ id, parentId }` — id of the stored memory and the id of
 *          its parent snapshot (null for a root snapshot).
 */
export function writeSnapshot(repo, sessionId, input) {
    const subject = `snapshot:${sessionId}`;
    const parentId = snapshotParentFor(repo, sessionId, subject);
    const lines = [];
    const summary = typeof input.summary === "string" ? input.summary.trim() : "";
    if (summary)
        lines.push(summary);
    const decisions = cleanEntries(input.decisions);
    if (decisions.length > 0)
        lines.push("Decisions: " + decisions.join(" | "));
    const conventions = cleanEntries(input.conventions);
    if (conventions.length > 0)
        lines.push("Conventions: " + conventions.join(" | "));
    const content = `Session understanding (${sessionId}): ` + lines.join(". ");
    const tags = ["session-snapshot", `session:${sessionId}`];
    if (parentId)
        tags.push(`parent:${parentId}`);
    // upsertBySubject → one snapshot per session, replace-in-place.
    const mem = repo.upsertBySubject({
        category: CATEGORY,
        subject,
        content,
        tags,
        source: `session:${sessionId}`,
        pinned: true, // accumulated understanding must outlive eviction
    });
    return { id: mem.id, parentId };
}
/** Trim every string entry of `list` and drop the blank / non-string ones. */
function cleanEntries(list) {
    if (!Array.isArray(list))
        return [];
    return list.map((entry) => (typeof entry === "string" ? entry.trim() : "")).filter(Boolean);
}
/**
 * Decide the parent of the snapshot about to be written: keep the parent
 * already recorded on this session's snapshot, otherwise use the newest
 * snapshot of any other session.
 */
function snapshotParentFor(repo, sessionId, subject) {
    const all = Array.from(repo.allMemories());
    const existing = all.find((m) => m.category === CATEGORY && m.subject === subject);
    // No earlier snapshot (or its tags are not readable as a list):
    // fall back to "newest snapshot of another session".
    if (!existing || !Array.isArray(existing.tags))
        return latestSnapshotId(repo, sessionId);
    const parentTag = existing.tags.find((t) => typeof t === "string" && t.startsWith("parent:"));
    if (!parentTag)
        return null; // it was written as a root; keep it a root
    const rawId = parentTag.slice("parent:".length);
    // Prefer the parent memory's own id so the returned value keeps the
    // repository's id type; fall back to the tag text if it is gone.
    const parent = all.find((m) => String(m.id) === rawId);
    return parent ? parent.id : rawId;
}
59
/**
 * Pick the snapshot a session should resume from.
 *
 * Scans every memory, keeps only snapshot-category entries, ignores
 * the one whose subject is `snapshot:<excludeSessionId>` (a session
 * never resumes from itself) and returns the entry with the greatest
 * `createdAt`. Returns null when nothing qualifies.
 */
export function latestSnapshot(repo, excludeSessionId) {
    const ownSubject = excludeSessionId ? `snapshot:${excludeSessionId}` : null;
    let newest = null;
    for (const memory of repo.allMemories()) {
        const isSnapshot = memory.category === CATEGORY;
        const isOwn = ownSubject !== null && memory.subject === ownSubject;
        if (!isSnapshot || isOwn) {
            continue;
        }
        if (newest === null || memory.createdAt > newest.createdAt) {
            newest = memory;
        }
    }
    return newest;
}
76
/** Id of the snapshot `latestSnapshot` would return, or null when there is none. */
function latestSnapshotId(repo, excludeSessionId) {
    const snapshot = latestSnapshot(repo, excludeSessionId);
    if (snapshot === null || snapshot === undefined) {
        return null;
    }
    return snapshot.id ?? null;
}
79
/**
 * Compact lineage figures for `memory_status` / logs.
 *
 * @returns `{ count, latestAt }` — number of snapshot memories and the
 *          greatest `createdAt` among them (null when there are none).
 */
export function snapshotSummary(repo) {
    const summary = { count: 0, latestAt: null };
    for (const memory of repo.allMemories()) {
        if (memory.category === CATEGORY) {
            summary.count += 1;
            const isNewer = summary.latestAt === null || memory.createdAt > summary.latestAt;
            if (isNewer) {
                summary.latestAt = memory.createdAt;
            }
        }
    }
    return summary;
}
@@ -0,0 +1,29 @@
1
+ /**
2
+ * Past-session ingestion.
3
+ *
4
+ * Pulls user-task + tool-trace summaries from previous OpenCode
5
+ * sessions in the same project, via the SDK client that the plugin
6
+ * receives in its context. Sessions live in OpenCode's own SQLite
7
+ * store; we read them through the documented client API rather than
8
+ * touching the DB file.
9
+ *
10
+ * Without an LLM, we extract two kinds of facts per session:
11
+ * 1) The user's first message ("the task").
12
+ * 2) The set of distinct file paths the agent edited/wrote and
13
+ * bash commands it ran ("the trace").
14
+ *
15
+ * One memory per (sessionId, kind) tuple. Re-ingesting the same
16
+ * session is idempotent thanks to insertIfMissing.
17
+ *
18
+ * Defensive: every SDK call is wrapped — different OpenCode versions
19
+ * expose slightly different methods (session.list / session.messages)
20
+ * and the plugin must keep working when one is absent.
21
+ */
22
+ import type { MemoryRepository } from "../store/repository.js";
23
+ export interface SessionIngestResult { // counters returned by ingestSessions
24
+ sessions: number; // listed sessions that had an id and were not the current session
25
+ taskMemories: number; // "task:<sessionId>" memories submitted via insertIfMissing
26
+ traceMemories: number; // "trace:<sessionId>" memories submitted via insertIfMissing
27
+ errors: string[]; // human-readable problems, e.g. "SDK session.list unavailable"
28
+ }
29
+ export declare function ingestSessions(repo: MemoryRepository, client: unknown, currentSessionId?: string): Promise<SessionIngestResult>; // client is probed defensively for session.list / session.messages; currentSessionId is skipped
@@ -0,0 +1,164 @@
1
+ /**
2
+ * Past-session ingestion.
3
+ *
4
+ * Pulls user-task + tool-trace summaries from previous OpenCode
5
+ * sessions in the same project, via the SDK client that the plugin
6
+ * receives in its context. Sessions live in OpenCode's own SQLite
7
+ * store; we read them through the documented client API rather than
8
+ * touching the DB file.
9
+ *
10
+ * Without an LLM, we extract two kinds of facts per session:
11
+ * 1) The user's first message ("the task").
12
+ * 2) The set of distinct file paths the agent edited/wrote and
13
+ * bash commands it ran ("the trace").
14
+ *
15
+ * One memory per (sessionId, kind) tuple. Re-ingesting the same
16
+ * session is idempotent thanks to insertIfMissing.
17
+ *
18
+ * Defensive: every SDK call is wrapped — different OpenCode versions
19
+ * expose slightly different methods (session.list / session.messages)
20
+ * and the plugin must keep working when one is absent.
21
+ */
22
+ const CATEGORY = "session-trace"; // memory category for both the task and the trace memories written below
23
/**
 * Ingest task + trace summaries from past sessions.
 *
 * For every listed session other than `currentSessionId`:
 *   - the first user message becomes a `task:<sessionId>` memory
 *     (text truncated to 320 characters);
 *   - the edited files / bash commands become a `trace:<sessionId>`
 *     memory when `summarizeTrace` finds any.
 * Both go through `insertIfMissing`, so re-ingesting is idempotent.
 *
 * Message entries are accepted in two shapes: a flat `{ role, … }`
 * object, or the SDK envelope `{ info: { role, … }, parts: [...] }`.
 * Looking only at the top-level `role` would never find the user
 * message in the envelope shape.
 *
 * @param repo              Memory repository (`insertIfMissing`, `setIngestedAt`).
 * @param client            SDK client; probed defensively, may lack methods.
 * @param currentSessionId  Session to leave out (the live one).
 * @returns Counters plus a list of non-fatal errors. When the session
 *          list cannot be obtained the ingest timestamp is NOT updated.
 */
export async function ingestSessions(repo, client, currentSessionId) {
    const result = {
        sessions: 0,
        taskMemories: 0,
        traceMemories: 0,
        errors: [],
    };
    const sessions = await safeSessionList(client);
    if (!sessions) {
        result.errors.push("SDK session.list unavailable");
        return result;
    }
    for (const s of sessions) {
        // `s?.id` also skips null/undefined entries instead of throwing.
        if (!s?.id || s.id === currentSessionId)
            continue;
        result.sessions += 1;
        const messages = await safeSessionMessages(client, s.id);
        if (!messages)
            continue;
        // Role lives at `m.role` (flat shape) or `m.info.role` (SDK envelope).
        const firstUser = messages.find((m) => (m?.role ?? m?.info?.role) === "user");
        if (firstUser) {
            const taskText = extractText(firstUser);
            if (taskText) {
                repo.insertIfMissing({
                    category: CATEGORY,
                    subject: `task:${s.id}`,
                    content: `Task in past session "${s.title ?? s.id}": ${truncate(taskText, 320)}`,
                    tags: ["task", `session:${s.id}`],
                    source: `session:${s.id}`,
                });
                result.taskMemories += 1;
            }
        }
        const trace = summarizeTrace(messages);
        if (trace) {
            repo.insertIfMissing({
                category: CATEGORY,
                subject: `trace:${s.id}`,
                content: trace,
                tags: ["trace", `session:${s.id}`],
                source: `session:${s.id}`,
            });
            result.traceMemories += 1;
        }
    }
    repo.setIngestedAt(CATEGORY, Date.now());
    return result;
}
71
/**
 * Best-effort wrapper around `client.session.list({})`.
 *
 * Accepts either a bare array or an object with an array `data`
 * field. Returns null when the method is absent, the call fails, or
 * the response has neither shape.
 */
async function safeSessionList(client) {
    const api = client?.session;
    if (!api?.list) {
        return null;
    }
    try {
        const payload = await api.list({});
        if (Array.isArray(payload)) {
            return payload;
        }
        const wrapped = payload ? payload.data : undefined;
        return Array.isArray(wrapped) ? wrapped : null;
    }
    catch {
        return null;
    }
}
88
/**
 * Best-effort wrapper around
 * `client.session.messages({ path: { id: sessionId } })`.
 *
 * Accepts either a bare array or an object with an array `data`
 * field. Returns null when the method is absent, the call fails, or
 * the response has neither shape.
 */
async function safeSessionMessages(client, sessionId) {
    const api = client?.session;
    if (!api?.messages) {
        return null;
    }
    try {
        const payload = await api.messages({ path: { id: sessionId } });
        if (Array.isArray(payload)) {
            return payload;
        }
        const wrapped = payload ? payload.data : undefined;
        return Array.isArray(wrapped) ? wrapped : null;
    }
    catch {
        return null;
    }
}
105
+ /* ─── trace extraction ─────────────────────────────────────────────── */
106
/**
 * Plain text of one message.
 *
 * A string `content` wins outright. Otherwise the `parts` array is
 * read: string parts are taken as-is, object parts contribute their
 * string `text` field, everything else is ignored; the pieces are
 * joined with single spaces. Returns "" when neither field is usable.
 */
function extractText(m) {
    const { content, parts } = m;
    if (typeof content === "string") {
        return content;
    }
    if (!Array.isArray(parts)) {
        return "";
    }
    const pieces = parts.flatMap((part) => {
        if (typeof part === "string") {
            return [part];
        }
        const hasText = Boolean(part) && typeof part === "object" && typeof part.text === "string";
        return hasText ? [part.text] : [];
    });
    return pieces.join(" ");
}
123
/**
 * Condense a session's tool calls into one line of text.
 *
 * Walks every object part of every message and looks for:
 *   - edit / write / multiedit / create calls → the target file path
 *     (`filePath`, `path` or `file_path`), de-duplicated;
 *   - bash calls → the command, truncated to 80 characters.
 * The tool name is read from `part.tool` or `part.metadata.tool`; the
 * arguments from `part.args`, `part.input` or `part.state.input`
 * (the last is where the SDK's tool parts keep them).
 *
 * Only string paths / commands are recorded: a malformed part must not
 * make `truncate` throw and abort the whole ingest.
 *
 * @returns e.g. `"edited files: a.js, b.js. bash commands: npm test"`
 *          (at most 12 files and 6 commands), or null when the session
 *          contains neither.
 */
function summarizeTrace(messages) {
    const files = new Set();
    const bashCmds = [];
    for (const m of messages) {
        if (!m || !Array.isArray(m.parts))
            continue;
        for (const p of m.parts) {
            if (!p || typeof p !== "object")
                continue;
            const toolName = p.tool ?? p.metadata?.tool ?? "";
            const args = p.args ?? p.input ?? p.state?.input ?? {};
            if (toolName === "edit" || toolName === "write" || toolName === "multiedit" || toolName === "create") {
                const fp = args.filePath ?? args.path ?? args.file_path;
                if (typeof fp === "string" && fp)
                    files.add(fp);
            }
            if (toolName === "bash") {
                const cmd = args.command;
                if (typeof cmd === "string" && cmd)
                    bashCmds.push(truncate(cmd, 80));
            }
        }
    }
    if (files.size === 0 && bashCmds.length === 0)
        return null;
    const parts = [];
    if (files.size > 0) {
        parts.push(`edited files: ${Array.from(files).slice(0, 12).join(", ")}`);
    }
    if (bashCmds.length > 0) {
        parts.push(`bash commands: ${bashCmds.slice(0, 6).join(" | ")}`);
    }
    return parts.join(". ");
}
162
/**
 * Shorten `s` to at most `n` UTF-16 units, ending with "…" when cut.
 *
 * Two edge cases are handled explicitly:
 *   - the cut never lands between the halves of a surrogate pair
 *     (which would leave a lone, unprintable half before the "…");
 *   - `n <= 0` yields just "…" instead of slicing from the end.
 */
function truncate(s, n) {
    if (s.length <= n)
        return s;
    let end = Math.max(0, n - 1);
    if (end > 0) {
        const last = s.charCodeAt(end - 1);
        // High surrogate as the last kept unit → its low half was cut off.
        if (last >= 0xd800 && last <= 0xdbff)
            end -= 1;
    }
    return s.slice(0, end) + "…";
}
@@ -0,0 +1,52 @@
1
+ /**
2
+ * tables.ts — ingest the column headers of tabular files.
3
+ *
4
+ * The premise: data files in a repo have structural value (the column
5
+ * names tell the agent what's in the table — "id, email, signup_date,
6
+ * plan_tier" is enough to know `users.csv` is the user table) without
7
+ * the row data being useful for recall (a million rows of values
8
+ * would just bloat the BM25 index).
9
+ *
10
+ * Header-only ingestion is the right slice: high signal, bounded
11
+ * cost, and the agent can always read the full file on demand via
12
+ * OpenCode's `read` tool when it actually needs row data.
13
+ *
14
+ * **Scope:**
15
+ * - `.csv` and `.tsv` — first-line parse, no dependency, never loads
16
+ * more than the first 64 KB of the file.
17
+ * - `.xlsx`, `.xls`, `.xlsm` — handled via SheetJS (the `xlsx` npm
18
+ * package), **lazily imported** only when a spreadsheet is
19
+ * actually encountered, so repos with no spreadsheets never pay
20
+ * the ~5 MB module-load cost. Each sheet becomes its own memory.
21
+ * - Walks the project tree with a generous file cap and the same
22
+ * SKIP_DIRS the other ingesters use.
23
+ *
24
+ * **CSV parsing.** A small inline parser handles quoted fields,
25
+ * embedded commas, escaped quotes, and CRLF line endings. Pulling in
26
+ * a CSV dep for this is not justified.
27
+ *
28
+ * **XLSX safety.** SheetJS is invoked with macros, formulas, and
29
+ * styles disabled — we only need cell values from row 1 of each
30
+ * sheet, nothing else. This significantly reduces the surface a
31
+ * hostile workbook could present.
32
+ */
33
+ import type { MemoryRepository } from "../store/repository.js";
34
+ export interface TablesIngestOptions { // optional limits for ingestTableHeaders; each falls back to a built-in default
35
+ maxFiles?: number; // cap on ingested table files (default 200; rounded, minimum 1)
36
+ maxXlsxMB?: number; // spreadsheets larger than this many MiB are skipped (default 50; minimum 0)
37
+ maxColumns?: number; // column names listed per table before "… (N more)" (default 40; rounded, minimum 1)
38
+ }
39
+ export interface TablesIngestResult { // summary returned by ingestTableHeaders
40
+ filesFound: number; // files that yielded a header: one per CSV/TSV file, one per workbook regardless of sheet count
41
+ /**
42
+ * Subset of the formats this pass actually covered. CSV/TSV are
43
+ * always supported. XLSX/XLS appear here only when at least one
44
+ * spreadsheet was found AND SheetJS was successfully loaded; if
45
+ * the dependency is missing at runtime the result reports an
46
+ * `xlsxUnavailableReason` and spreadsheets are silently skipped.
47
+ */
48
+ formatsSupported: ReadonlyArray<string>;
49
+ /** Set if a spreadsheet was found but SheetJS could not be loaded. */
50
+ xlsxUnavailableReason?: string;
51
+ }
52
+ export declare function ingestTableHeaders(repo: MemoryRepository, root: string, opts?: TablesIngestOptions): Promise<TablesIngestResult>; // walks the tree under root and stores one header memory per CSV/TSV file or workbook sheet
@@ -0,0 +1,360 @@
1
+ /**
2
+ * tables.ts — ingest the column headers of tabular files.
3
+ *
4
+ * The premise: data files in a repo have structural value (the column
5
+ * names tell the agent what's in the table — "id, email, signup_date,
6
+ * plan_tier" is enough to know `users.csv` is the user table) without
7
+ * the row data being useful for recall (a million rows of values
8
+ * would just bloat the BM25 index).
9
+ *
10
+ * Header-only ingestion is the right slice: high signal, bounded
11
+ * cost, and the agent can always read the full file on demand via
12
+ * OpenCode's `read` tool when it actually needs row data.
13
+ *
14
+ * **Scope:**
15
+ * - `.csv` and `.tsv` — first-line parse, no dependency, never loads
16
+ * more than the first 64 KB of the file.
17
+ * - `.xlsx`, `.xls`, `.xlsm` — handled via SheetJS (the `xlsx` npm
18
+ * package), **lazily imported** only when a spreadsheet is
19
+ * actually encountered, so repos with no spreadsheets never pay
20
+ * the ~5 MB module-load cost. Each sheet becomes its own memory.
21
+ * - Walks the project tree with a generous file cap and the same
22
+ * SKIP_DIRS the other ingesters use.
23
+ *
24
+ * **CSV parsing.** A small inline parser handles quoted fields,
25
+ * embedded commas, escaped quotes, and CRLF line endings. Pulling in
26
+ * a CSV dep for this is not justified.
27
+ *
28
+ * **XLSX safety.** SheetJS is invoked with macros, formulas, and
29
+ * styles disabled — we only need cell values from row 1 of each
30
+ * sheet, nothing else. This significantly reduces the surface a
31
+ * hostile workbook could present.
32
+ */
33
+ import { readdir, open } from "node:fs/promises";
34
+ import { join, relative, sep, extname, basename } from "node:path";
35
+ const CATEGORY = "project-facts"; // memory category the table-header memories are stored under
36
+ const SKIP_DIRS = new Set([ // directory names the tree walk never descends into
37
+ "node_modules",
38
+ ".git",
39
+ "dist",
40
+ "build",
41
+ "out",
42
+ "target",
43
+ ".next",
44
+ "coverage",
45
+ ".cache",
46
+ "vendor",
47
+ ]);
48
+ const MAX_FILES = 200; // default for opts.maxFiles
49
+ const FIRST_LINE_READ_BYTES = 64 * 1024; // at most 64 KiB of a CSV/TSV file is read to find its header line
50
+ const MAX_XLSX_BYTES = 50 * 1024 * 1024; // default spreadsheet size limit (50 MiB), see opts.maxXlsxMB
51
+ const MAX_COLUMNS_TO_LIST = 40; // default for opts.maxColumns
52
+ const MAX_SHEETS_PER_WORKBOOK = 20; // only the first 20 sheet names of a workbook are examined
53
+ const CSV_EXTS = new Set([".csv", ".tsv"]); // lower-cased extensions parsed by the inline delimited-line parser
54
+ const XLSX_EXTS = new Set([".xlsx", ".xls", ".xlsm"]); // lower-cased extensions handed to SheetJS
55
/**
 * Walk the project tree under `root` and store one header memory per
 * CSV/TSV file and per spreadsheet sheet.
 *
 * The walk is depth-first over an explicit stack. Dot-entries and
 * SKIP_DIRS are ignored, and the walk stops as soon as `maxFiles`
 * table files have produced a header. SheetJS is loaded lazily the
 * first time a spreadsheet extension is seen; the pending promise is
 * reused for every later workbook. If loading fails, spreadsheets are
 * skipped and the reason is reported once in the result.
 *
 * @param repo  Memory repository (written to through `emit`).
 * @param root  Directory to scan; stored paths are relative to it, with "/" separators.
 * @param opts  Optional limits (`maxFiles`, `maxXlsxMB`, `maxColumns`).
 * @returns `{ filesFound, formatsSupported, xlsxUnavailableReason? }`
 */
export async function ingestTableHeaders(repo, root, opts = {}) {
    const fileCap = Math.max(1, Math.round(opts.maxFiles ?? MAX_FILES));
    const xlsxByteCap = Math.max(0, (opts.maxXlsxMB ?? MAX_XLSX_BYTES / (1024 * 1024))) * 1024 * 1024;
    const columnCap = Math.max(1, Math.round(opts.maxColumns ?? MAX_COLUMNS_TO_LIST));
    const formats = new Set(["csv", "tsv"]);
    const pendingDirs = [root];
    let filesFound = 0;
    let spreadsheetSeen = false;
    let sheetJsPromise = null;
    let xlsxUnavailableReason;
    const toRelative = (abs) => relative(root, abs).split(sep).join("/");
    while (pendingDirs.length > 0 && filesFound < fileCap) {
        const dir = pendingDirs.pop();
        let entries;
        try {
            entries = await readdir(dir, { withFileTypes: true });
        }
        catch {
            // Unreadable directory: skip it, keep walking.
            continue;
        }
        for (const entry of entries) {
            const { name } = entry;
            if (name.startsWith(".")) {
                continue;
            }
            if (entry.isDirectory()) {
                if (!SKIP_DIRS.has(name)) {
                    pendingDirs.push(join(dir, name));
                }
                continue;
            }
            if (!entry.isFile()) {
                continue;
            }
            const ext = extname(name).toLowerCase();
            const isDelimited = CSV_EXTS.has(ext);
            if (!isDelimited && !XLSX_EXTS.has(ext)) {
                continue;
            }
            const abs = join(dir, name);
            const formatLabel = ext.slice(1).toUpperCase();
            if (isDelimited) {
                const columns = await readHeaderColumns(abs, ext === ".tsv" ? "\t" : ",");
                if (columns === null) {
                    continue;
                }
                filesFound += 1;
                emit(repo, toRelative(abs), formatLabel, null, columns, columnCap);
            }
            else {
                spreadsheetSeen = true;
                // Start the (cached) SheetJS load on the first spreadsheet only.
                if (sheetJsPromise === null) {
                    sheetJsPromise = loadXlsx();
                }
                const xlsx = await sheetJsPromise;
                if ("error" in xlsx) {
                    // Loader failed: remember the first reason, skip this workbook.
                    if (!xlsxUnavailableReason) {
                        xlsxUnavailableReason = xlsx.error;
                    }
                    continue;
                }
                const sheets = await readXlsxSheets(xlsx, abs, xlsxByteCap);
                if (sheets === null) {
                    continue;
                }
                filesFound += 1;
                const rel = toRelative(abs);
                for (const { sheetName, columns } of sheets) {
                    emit(repo, rel, formatLabel, sheetName, columns, columnCap);
                }
            }
            if (filesFound >= fileCap) {
                break;
            }
        }
    }
    if (spreadsheetSeen && !xlsxUnavailableReason) {
        formats.add("xlsx");
        formats.add("xls");
    }
    const result = {
        filesFound,
        formatsSupported: Array.from(formats),
    };
    if (xlsxUnavailableReason) {
        result.xlsxUnavailableReason = xlsxUnavailableReason;
    }
    return result;
}
135
/**
 * Store one table-header memory.
 *
 * Delimited files (CSV/TSV) use the subject `table:<path>`; a
 * spreadsheet sheet uses `table:<path>#<sheet-slug>` so each sheet of
 * a workbook gets its own memory. At most `maxColumns` names are
 * listed, followed by "… (N more)" when the header is longer.
 */
function emit(repo, rel, format, sheetName, columns, maxColumns) {
    const total = columns.length;
    let listing;
    if (total > maxColumns) {
        listing = `${columns.slice(0, maxColumns).join(", ")}, … (${total - maxColumns} more)`;
    }
    else {
        listing = columns.join(", ");
    }
    const stem = basename(rel, extname(rel));
    const tags = ["table", "schema", format.toLowerCase(), stem.toLowerCase().replace(/[^a-z0-9]+/g, "-")];
    let subject = `table:${rel}`;
    let location = rel;
    if (sheetName) {
        subject += `#${slugifySheet(sheetName)}`;
        location += ` sheet "${sheetName}"`;
        tags.push("sheet");
    }
    const content = `${location} (${format}, ${total} columns): ${listing}. ` +
        `Read the file directly with OpenCode's read tool for row data.`;
    repo.insertIfMissing({
        category: CATEGORY,
        subject,
        content,
        tags,
        source: "tables-ingest",
    });
}
154
/**
 * Turn a sheet name into the fragment used in a memory subject
 * (`table:<path>#<slug>`).
 *
 * Letters, combining marks and digits of ANY script are kept; every
 * other run of characters becomes a single "-". An ASCII-only slug
 * would reduce every non-Latin sheet name to the fallback "sheet", so
 * all sheets of such a workbook would share one subject and only the
 * first would be stored. ASCII names produce exactly the same slug as
 * before.
 *
 * @returns lower-cased slug of at most 40 code points, or "sheet"
 *          when nothing usable remains.
 */
function slugifySheet(name) {
    const slug = name
        .toLowerCase()
        .replace(/[^\p{L}\p{M}\p{N}]+/gu, "-")
        .replace(/^-+|-+$/g, "");
    // Cut by code point so a character outside the BMP is never halved.
    return Array.from(slug).slice(0, 40).join("") || "sheet";
}
157
+ /* ─── CSV / TSV path ────────────────────────────────────────────────── */
158
/**
 * Read the header row of a CSV/TSV file.
 *
 * Only the first FIRST_LINE_READ_BYTES of the file are read. The text
 * up to the first line terminator is split with `parseDelimitedLine`;
 * names are trimmed and blank ones dropped.
 *
 * Returns null — "skip this file" — when the file cannot be opened or
 * read, is empty, contains a NUL byte (binary), has a column name
 * longer than 200 characters, or has fewer than two NAMED columns.
 * The two-column minimum is applied after blank names are removed
 * (as the spreadsheet path does), so a header such as `,,` is never
 * reported as a table with zero columns.
 *
 * @param abs        Absolute path of the file.
 * @param delimiter  Field separator ("," or "\t").
 * @returns Array of column names, or null.
 */
async function readHeaderColumns(abs, delimiter) {
    let handle;
    try {
        handle = await open(abs, "r");
    }
    catch {
        return null;
    }
    try {
        const s = await handle.stat();
        if (!s.isFile() || s.size === 0)
            return null;
        const bytesToRead = Math.min(s.size, FIRST_LINE_READ_BYTES);
        const buf = Buffer.alloc(bytesToRead);
        const { bytesRead } = await handle.read(buf, 0, bytesToRead, 0);
        if (bytesRead === 0)
            return null;
        const text = buf.subarray(0, bytesRead).toString("utf-8");
        if (text.indexOf("\0") >= 0)
            return null;
        const firstLineEnd = findLineTerminator(text);
        const firstLine = firstLineEnd === -1 ? text : text.slice(0, firstLineEnd);
        if (firstLine.length === 0)
            return null;
        const cols = parseDelimitedLine(firstLine, delimiter);
        if (cols.some((c) => c.length > 200))
            return null;
        const named = cols.map((c) => c.trim()).filter((c) => c.length > 0);
        return named.length >= 2 ? named : null;
    }
    catch {
        // stat/read failed: a problematic file is skipped, never fatal.
        return null;
    }
    finally {
        await handle.close();
    }
}
193
/** Index of the first LF or CR character in `s`, or -1 when there is none. */
function findLineTerminator(s) {
    return s.search(/[\r\n]/);
}
201
/**
 * Split one line of delimited text into fields.
 *
 * A double quote opens a quoted section in which the delimiter is
 * literal and `""` stands for one quote character; the next lone
 * quote closes the section. An unterminated quoted section simply
 * runs to the end of the line. Fields are returned untrimmed, and an
 * empty line yields a single empty field.
 */
function parseDelimitedLine(line, delimiter) {
    const fields = [];
    let field = "";
    let quoted = false;
    for (let pos = 0; pos < line.length; pos += 1) {
        const ch = line[pos];
        if (quoted) {
            if (ch !== '"') {
                field += ch;
            }
            else if (line[pos + 1] === '"') {
                // Escaped quote: emit one quote, skip its twin.
                field += '"';
                pos += 1;
            }
            else {
                quoted = false;
            }
        }
        else if (ch === '"') {
            quoted = true;
        }
        else if (ch === delimiter) {
            fields.push(field);
            field = "";
        }
        else {
            field += ch;
        }
    }
    fields.push(field);
    return fields;
}
240
/**
 * Dynamically import SheetJS (`xlsx`).
 *
 * Kept as a dynamic import so the module is only loaded when a
 * spreadsheet is actually encountered; the caller caches the returned
 * promise and reuses it for every workbook in the pass.
 *
 * @returns The module's default export when it has one, otherwise the
 *          module namespace. On failure, resolves (never rejects) to
 *          `{ error: string }` so callers can skip spreadsheets
 *          instead of crashing the ingest.
 */
async function loadXlsx() {
    let mod;
    try {
        mod = await import("xlsx");
    }
    catch (err) {
        const detail = err instanceof Error ? err.message : String(err);
        return { error: `xlsx (SheetJS) could not be loaded: ${detail}` };
    }
    return mod.default ?? mod;
}
260
/**
 * Open a workbook and collect the header row of every sheet.
 *
 * The file is read ONCE, through the handle that was used for the
 * size check, and the bytes are handed to `xlsx.read(..., { type:
 * "buffer" })`. Compared with calling `xlsx.readFile(path)`:
 *   - it also works with SheetJS's ESM build, where `readFile` needs
 *     `set_fs()` to be called first and otherwise throws — which would
 *     make every workbook look unparseable;
 *   - the bytes parsed are the bytes that were size-checked (no second
 *     open of the path between check and parse).
 * `xlsx.readFile` is still used when the given module object has no
 * `read` function.
 *
 * SheetJS does not stream, so the whole file is held in memory; files
 * larger than `maxBytes` are skipped. Formula / style / number-format
 * / VBA parsing is switched off because only cell values are needed.
 *
 * @param xlsx      Loaded SheetJS module.
 * @param abs       Absolute path of the workbook.
 * @param maxBytes  Size limit in bytes (default MAX_XLSX_BYTES).
 * @returns `[{ sheetName, columns }]` for the first
 *          MAX_SHEETS_PER_WORKBOOK sheets that have at least two named
 *          columns, or null on any failure / when no sheet qualifies.
 *          A problematic file is skipped, never fatal.
 */
async function readXlsxSheets(xlsx, abs, maxBytes = MAX_XLSX_BYTES) {
    // Minimum-surface read: we only want cell values from row 1.
    const readOptions = {
        cellFormula: false,
        cellHTML: false,
        cellNF: false,
        cellStyles: false,
        cellText: false,
        cellDates: false,
        bookVBA: false,
        bookFiles: false,
        bookProps: false,
        bookSheets: false,
    };
    let handle;
    try {
        handle = await open(abs, "r");
    }
    catch {
        return null;
    }
    let data;
    try {
        const s = await handle.stat();
        if (!s.isFile() || s.size === 0 || s.size > maxBytes)
            return null;
        data = await handle.readFile();
    }
    catch {
        return null;
    }
    finally {
        await handle.close();
    }
    // The file may have grown between stat() and the read.
    if (data.length === 0 || data.length > maxBytes)
        return null;
    let wb;
    try {
        wb = typeof xlsx.read === "function"
            ? xlsx.read(data, { ...readOptions, type: "buffer" })
            : xlsx.readFile(abs, readOptions);
    }
    catch {
        return null;
    }
    const sheets = [];
    const names = Array.isArray(wb?.SheetNames) ? wb.SheetNames.slice(0, MAX_SHEETS_PER_WORKBOOK) : [];
    for (const name of names) {
        const ws = wb.Sheets?.[name];
        if (!ws)
            continue;
        const cols = extractHeaderRowFromSheet(xlsx, ws);
        if (cols.length < 2)
            continue;
        sheets.push({ sheetName: String(name), columns: cols });
    }
    return sheets.length > 0 ? sheets : null;
}
319
/**
 * Column names from the first row of a worksheet's declared range.
 *
 * Cells are looked up one by one by address (`A1`, `B1`, …) instead of
 * converting the sheet with `sheet_to_json`, so this function touches
 * only the header cells. At most 200 cells, counted from the range's
 * first column, are examined. Values are stringified and trimmed;
 * missing, null and blank cells are left out of the result.
 *
 * @param xlsx  SheetJS module (`utils.decode_range`, `utils.encode_cell`).
 * @param ws    Worksheet object.
 * @returns Array of non-empty header names; empty when the sheet has
 *          no usable `!ref` range.
 */
function extractHeaderRowFromSheet(xlsx, ws) {
    const declared = ws["!ref"];
    if (typeof declared !== "string" || declared === "") {
        return [];
    }
    let bounds;
    try {
        bounds = xlsx.utils.decode_range(declared);
    }
    catch {
        return [];
    }
    if (!bounds?.s || !bounds?.e) {
        return [];
    }
    const headerRow = bounds.s.r;
    // Examine at most 200 cells, starting at the range's first column.
    const lastCol = Math.min(bounds.e.c, bounds.s.c + 199);
    const names = [];
    for (let col = bounds.s.c; col <= lastCol; col += 1) {
        const value = ws[xlsx.utils.encode_cell({ r: headerRow, c: col })]?.v;
        const text = value === undefined || value === null ? "" : String(value).trim();
        if (text.length > 0) {
            names.push(text);
        }
    }
    return names;
}