docsgov 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (159) hide show
  1. package/README.md +242 -0
  2. package/dist/apispec/apispec.js +401 -0
  3. package/dist/apispec/apispec.test.js +444 -0
  4. package/dist/apispec/errors.js +17 -0
  5. package/dist/apispec/index.js +2 -0
  6. package/dist/check/doclinks.js +167 -0
  7. package/dist/check/index.js +8 -0
  8. package/dist/check/run.js +391 -0
  9. package/dist/check/run.test.js +513 -0
  10. package/dist/check/suggest.js +134 -0
  11. package/dist/check/suggest.test.js +92 -0
  12. package/dist/check/tokens.js +125 -0
  13. package/dist/cmd/main.js +330 -0
  14. package/dist/cmd/main.test.js +422 -0
  15. package/dist/codeq/cache.js +71 -0
  16. package/dist/codeq/cache.test.js +67 -0
  17. package/dist/codeq/errors.js +52 -0
  18. package/dist/codeq/grammars/tree-sitter-go.wasm +0 -0
  19. package/dist/codeq/grammars/tree-sitter-java.wasm +0 -0
  20. package/dist/codeq/grammars/tree-sitter-javascript.wasm +0 -0
  21. package/dist/codeq/grammars/tree-sitter-tsx.wasm +0 -0
  22. package/dist/codeq/grammars/tree-sitter-typescript.wasm +0 -0
  23. package/dist/codeq/index.js +11 -0
  24. package/dist/codeq/resolve.test.js +109 -0
  25. package/dist/codeq/resolver.js +128 -0
  26. package/dist/codeq/resolver.test.js +124 -0
  27. package/dist/codeq/resolvers/go.js +242 -0
  28. package/dist/codeq/resolvers/go.test.js +143 -0
  29. package/dist/codeq/resolvers/java.js +349 -0
  30. package/dist/codeq/resolvers/java.test.js +138 -0
  31. package/dist/codeq/resolvers/java_queries.js +63 -0
  32. package/dist/codeq/resolvers/javascript.js +412 -0
  33. package/dist/codeq/resolvers/javascript.test.js +125 -0
  34. package/dist/codeq/resolvers/javascript_queries.js +46 -0
  35. package/dist/codeq/resolvers/typescript.js +366 -0
  36. package/dist/codeq/resolvers/typescript.test.js +180 -0
  37. package/dist/codeq/resolvers/typescript_queries.js +78 -0
  38. package/dist/codeq/signature.js +50 -0
  39. package/dist/codeq/signature.test.js +50 -0
  40. package/dist/codeq/suggest.js +96 -0
  41. package/dist/codeq/treesitter.js +122 -0
  42. package/dist/codeq/treesitter.test.js +118 -0
  43. package/dist/config/config.js +74 -0
  44. package/dist/config/config.test.js +98 -0
  45. package/dist/config/fs.js +116 -0
  46. package/dist/config/glob.js +82 -0
  47. package/dist/config/glob.test.js +61 -0
  48. package/dist/config/index.js +4 -0
  49. package/dist/dedup/analyzer/analyzer.js +533 -0
  50. package/dist/dedup/analyzer/analyzer.test.js +530 -0
  51. package/dist/dedup/analyzer/canonical.js +74 -0
  52. package/dist/dedup/analyzer/canonical.test.js +70 -0
  53. package/dist/dedup/analyzer/cosine_clusters.js +169 -0
  54. package/dist/dedup/analyzer/cosine_clusters.test.js +131 -0
  55. package/dist/dedup/analyzer/distinctive.js +85 -0
  56. package/dist/dedup/analyzer/distinctive.test.js +49 -0
  57. package/dist/dedup/analyzer/exact_clusters.js +63 -0
  58. package/dist/dedup/analyzer/exact_clusters.test.js +81 -0
  59. package/dist/dedup/analyzer/index.js +14 -0
  60. package/dist/dedup/analyzer/multiplicity.js +110 -0
  61. package/dist/dedup/analyzer/multiplicity.test.js +123 -0
  62. package/dist/dedup/analyzer/order.js +22 -0
  63. package/dist/dedup/analyzer/partial_overlaps.js +65 -0
  64. package/dist/dedup/analyzer/partial_overlaps.test.js +161 -0
  65. package/dist/dedup/analyzer/preview.js +84 -0
  66. package/dist/dedup/analyzer/preview.test.js +46 -0
  67. package/dist/dedup/analyzer/safety.js +27 -0
  68. package/dist/dedup/analyzer/safety.test.js +39 -0
  69. package/dist/dedup/config.js +18 -0
  70. package/dist/dedup/configload.js +299 -0
  71. package/dist/dedup/configload.test.js +410 -0
  72. package/dist/dedup/dedup.index.test.js +203 -0
  73. package/dist/dedup/dedup.js +143 -0
  74. package/dist/dedup/dedup.test.js +212 -0
  75. package/dist/dedup/dedupcfg/config.js +112 -0
  76. package/dist/dedup/dedupcfg/config.test.js +70 -0
  77. package/dist/dedup/dedupcfg/index.js +1 -0
  78. package/dist/dedup/deduptypes/index.js +1 -0
  79. package/dist/dedup/deduptypes/types.js +9 -0
  80. package/dist/dedup/deduptypes/types.test.js +34 -0
  81. package/dist/dedup/embedder/cache.js +23 -0
  82. package/dist/dedup/embedder/cache.test.js +50 -0
  83. package/dist/dedup/embedder/constants.js +10 -0
  84. package/dist/dedup/embedder/embedder.js +76 -0
  85. package/dist/dedup/embedder/embedder.mock.test.js +128 -0
  86. package/dist/dedup/embedder/embedder.test.js +96 -0
  87. package/dist/dedup/embedder/errors.js +20 -0
  88. package/dist/dedup/embedder/errors.test.js +35 -0
  89. package/dist/dedup/embedder/index.js +4 -0
  90. package/dist/dedup/embedder/session.js +78 -0
  91. package/dist/dedup/embedder/session.test.js +172 -0
  92. package/dist/dedup/gitignore.js +97 -0
  93. package/dist/dedup/gitignore.test.js +98 -0
  94. package/dist/dedup/index.js +11 -0
  95. package/dist/dedup/indexdb/errors.js +48 -0
  96. package/dist/dedup/indexdb/index.js +6 -0
  97. package/dist/dedup/indexdb/indexdb.js +302 -0
  98. package/dist/dedup/indexdb/indexdb.test.js +739 -0
  99. package/dist/dedup/indexdb/load.js +110 -0
  100. package/dist/dedup/indexdb/migrations.js +58 -0
  101. package/dist/dedup/indexdb/schema.js +83 -0
  102. package/dist/dedup/indexer/index.js +9 -0
  103. package/dist/dedup/indexer/indexer.js +501 -0
  104. package/dist/dedup/indexer/indexer.test.js +510 -0
  105. package/dist/dedup/indexer/links.js +89 -0
  106. package/dist/dedup/mdsection/anchor.js +60 -0
  107. package/dist/dedup/mdsection/anchor.test.js +39 -0
  108. package/dist/dedup/mdsection/blocks.js +409 -0
  109. package/dist/dedup/mdsection/blocks.test.js +359 -0
  110. package/dist/dedup/mdsection/index.js +4 -0
  111. package/dist/dedup/mdsection/parse.js +21 -0
  112. package/dist/dedup/mdsection/section.js +234 -0
  113. package/dist/dedup/mdsection/section.test.js +221 -0
  114. package/dist/dedup/report/floatfmt.js +71 -0
  115. package/dist/dedup/report/floatfmt.test.js +42 -0
  116. package/dist/dedup/report/index.js +8 -0
  117. package/dist/dedup/report/quote.js +77 -0
  118. package/dist/dedup/report/quote.test.js +67 -0
  119. package/dist/dedup/report/text.js +251 -0
  120. package/dist/dedup/report/text.test.js +420 -0
  121. package/dist/dedup/report_types.js +8 -0
  122. package/dist/dedup/sectionid/index.js +1 -0
  123. package/dist/dedup/sectionid/sectionid.js +16 -0
  124. package/dist/dedup/sectionid/sectionid.test.js +49 -0
  125. package/dist/guard/api/errors.js +12 -0
  126. package/dist/guard/api/index.js +2 -0
  127. package/dist/guard/api/parser.js +81 -0
  128. package/dist/guard/api/parser.test.js +58 -0
  129. package/dist/guard/api/types.js +1 -0
  130. package/dist/guard/code/errors.js +16 -0
  131. package/dist/guard/code/index.js +2 -0
  132. package/dist/guard/code/parser.js +54 -0
  133. package/dist/guard/code/parser.test.js +111 -0
  134. package/dist/guard/code/types.js +6 -0
  135. package/dist/index.js +1 -0
  136. package/dist/index.test.js +5 -0
  137. package/dist/repo/boundary.js +92 -0
  138. package/dist/repo/boundary.test.js +65 -0
  139. package/dist/repo/errors.js +56 -0
  140. package/dist/repo/errors.test.js +85 -0
  141. package/dist/repo/exists.test.js +72 -0
  142. package/dist/repo/filename.js +46 -0
  143. package/dist/repo/filename.test.js +39 -0
  144. package/dist/repo/fs.js +53 -0
  145. package/dist/repo/index.js +7 -0
  146. package/dist/repo/overlay.js +36 -0
  147. package/dist/repo/overlay.test.js +80 -0
  148. package/dist/repo/repo.js +353 -0
  149. package/dist/repo/repo.test.js +255 -0
  150. package/dist/repo/testutil.js +27 -0
  151. package/dist/repo/write.test.js +125 -0
  152. package/dist/report/color.js +73 -0
  153. package/dist/report/index.js +1 -0
  154. package/dist/report/report.js +112 -0
  155. package/dist/report/report.test.js +368 -0
  156. package/dist/violation/index.js +1 -0
  157. package/dist/violation/types.js +22 -0
  158. package/dist/violation/types.test.js +70 -0
  159. package/package.json +48 -0
@@ -0,0 +1,501 @@
1
+ /**
2
+ * The two-pass dedup indexing pipeline:
3
+ *
4
+ * - Pass 1: walk docs/, extract sections, embed new/changed ones, upsert rows.
5
+ * - Pass 2: scan raw_content for markdown links, update inbound_count on every
6
+ * section.
7
+ *
8
+ * Both passes run inside one transaction (Store.execTx). If anything fails, the
9
+ * entire run rolls back — the on-disk DB stays at the previous state.
10
+ *
11
+ * Ported from internal/dedup/indexer/{indexer.go,links.go}. Reconciling the
12
+ * mixed sync/async stack:
13
+ * - the embedder is async (await emb.embed) — Go took ctx; the TS embedder
14
+ * dropped it, so the Embedder interface here has no ctx parameter;
15
+ * - indexdb (node:sqlite) is synchronous — execTx and all queries run without
16
+ * await inside the transaction callback;
17
+ * - mdsection.extractFromFileWithBlocks is synchronous (readFileSync);
18
+ * - the docs/ walk uses node:fs/promises and is async.
19
+ * So Run is async overall: all embedding (the only async I/O) happens BEFORE the
20
+ * synchronous execTx, exactly mirroring Go where embed batches run before the one
21
+ * EXCLUSIVE transaction in applyChanges.
22
+ *
23
+ * Go ran the corpus walk + per-file parse in an errgroup of workers. node:sqlite
24
+ * and mdsection are synchronous and Run holds a single connection, so there is no
25
+ * concurrency to exploit on the parse side; the walk is a plain async traversal
26
+ * and results are sorted deterministically afterward, matching Go's post-sort.
27
+ */
28
+ import * as fsp from "node:fs/promises";
29
+ import * as path from "node:path";
30
+ import { headingBlacklisted } from "../dedupcfg/index.js";
31
+ import { extractFromFileWithBlocks } from "../mdsection/index.js";
32
+ import { parseLinks } from "./links.js";
33
/**
 * hasMaxBatch reports whether the embedder exposes a callable `maxBatch`
 * member. It is the TS analogue of Go's `emb.(MaxBatcher)` type assertion.
 *
 * @param {object} emb - The embedder to probe.
 * @returns {boolean} true iff `emb.maxBatch` is a function.
 */
function hasMaxBatch(emb) {
    const candidate = emb.maxBatch;
    return typeof candidate === "function";
}
37
/**
 * blockKeyOf builds the in-memory key identifying one blocks-table row: the
 * section ID and the block index joined by a NUL character ("<sid>\x00<idx>").
 *
 * @param {string} sectionID - ID of the owning section.
 * @param {number} blockIndex - Index of the block within its section.
 * @returns {string} The composite key.
 */
function blockKeyOf(sectionID, blockIndex) {
    const separator = "\u0000";
    return `${sectionID}${separator}${blockIndex}`;
}
41
/**
 * Run executes the two-pass indexing pipeline over <repoRoot>/docs/.
 *
 * All asynchronous work (the corpus walk and every embedder call) happens
 * first; the writes are then handed to applyChanges, which performs them
 * through db.execTx. If anything throws before that point (including an
 * embedder failure), applyChanges is never called and no write is issued.
 *
 * @param {object} db - Open index store; read via querySections/queryBlocks
 *   and written via execTx (inside applyChanges).
 * @param {object} emb - Embedder; borrowed, never closed here. Must provide
 *   `embed(texts)` resolving to one vector per input text.
 * @param {string} repoRoot - Repository root; docs are read from <repoRoot>/docs.
 * @param {object} cfg - Dedup configuration (Embedder, Indexer, Markdown,
 *   Block and Analyzer groups are read by this function and its helpers).
 * @param {(msg: string) => void} progress - Receives one-line status
 *   messages; pass a no-op for silence.
 * @returns {Promise<{sections: number, embedded: number, pruned: number}>}
 *   Counts of live sections, newly embedded sections and pruned sections.
 * @throws {Error} If the embedder returns fewer vectors than texts submitted.
 */
export async function run(db, emb, repoRoot, cfg, progress) {
    const batchSize = resolveBatchSize(emb, cfg);
    // Pass 1: collect live sections and blocks from the corpus.
    const { sections, blocks: allBlocks } = await collectSectionsAndBlocks(repoRoot, cfg);
    // Existing rows, used for content_hash comparison and embedding reuse.
    const existing = loadExistingSections(db);
    // Partition in one pass: unchanged sections reuse their stored embedding,
    // new/changed ones are queued for the embedder.
    const embeddings = new Map();
    const toEmbed = [];
    for (const s of sections) {
        const old = existing.get(s.id);
        if (old !== undefined && old.contentHash === s.content_hash) {
            embeddings.set(s.id, old.embedding);
        }
        else {
            toEmbed.push(s);
        }
    }
    if (toEmbed.length > 0) {
        progress(`Embedding ${toEmbed.length} sections…\n`);
        for (let i = 0; i < toEmbed.length; i += batchSize) {
            const batch = toEmbed.slice(i, Math.min(i + batchSize, toEmbed.length));
            const vecs = await emb.embed(batch.map((s) => s.embed_text));
            // A short result would otherwise surface as an opaque TypeError from
            // Float32Array.from(undefined); fail with a message that names the cause.
            if (vecs == null || vecs.length < batch.length) {
                throw new Error(`embedder returned ${vecs?.length ?? 0} vectors for a batch of ${batch.length} sections`);
            }
            for (let j = 0; j < batch.length; j++) {
                embeddings.set(batch[j].id, Float32Array.from(vecs[j]));
            }
        }
    }
    // Pass 2: inbound link counts, computed in memory over the live sections.
    const inbound = computeInboundCounts(sections, cfg.Indexer.external_url_prefixes);
    // Sections present in the DB but no longer in the corpus are pruned.
    const liveIDs = new Set(sections.map((s) => s.id));
    const toPrune = [...existing.keys()].filter((id) => !liveIDs.has(id));
    // Existing blocks: keys for pruning, content_hash → vector for reuse.
    const { keys: existingBlockKeys, vecs: existingBlockVecs } = loadExistingBlocks(db);
    const eligibleBlocks = allBlocks.filter((b) => blockEligible(b, cfg));
    // Embed unique eligible prose blocks (with hash-based vector reuse).
    const blockVecs = await embedBlocks(emb, eligibleBlocks, existingBlockVecs, cfg);
    // Block rows in the DB but absent from the live eligible set are pruned.
    const liveBlockKeys = new Set(eligibleBlocks.map((b) => blockKeyOf(b.SectionID, b.Index)));
    const blocksToPrune = [...existingBlockKeys].filter((k) => !liveBlockKeys.has(k));
    const stats = {
        sections: sections.length,
        embedded: toEmbed.length,
        pruned: toPrune.length,
    };
    // Commit all changes through applyChanges (one execTx call).
    applyChanges(db, sections, embeddings, inbound, toPrune, existing, eligibleBlocks, blockVecs, blocksToPrune, cfg);
    return stats;
}
131
/**
 * resolveBatchSize picks the embedding batch size: cfg.Embedder.batch_size,
 * defaulting to 32, then capped by emb.maxBatch() when the embedder provides it.
 *
 * A missing, non-numeric, NaN or sub-1 configured value falls back to the
 * default; a fractional value is floored. Without this guard a non-numeric
 * size slips past a plain `<= 0` check and makes the batching loops
 * (`i += batchSize`) terminate after a single empty batch.
 *
 * @param {object} emb - Embedder; `maxBatch()` is consulted only if present.
 * @param {object} cfg - Configuration; reads cfg.Embedder.batch_size.
 * @returns {number} A positive integer batch size.
 */
function resolveBatchSize(emb, cfg) {
    const defaultBatchSize = 32;
    const configured = cfg.Embedder.batch_size;
    let batchSize = Number.isFinite(configured) && configured >= 1
        ? Math.floor(configured)
        : defaultBatchSize;
    if (hasMaxBatch(emb)) {
        const cap = emb.maxBatch();
        // Ignore a non-positive or non-numeric cap, as the original did for n <= 0.
        if (Number.isFinite(cap) && cap >= 1 && Math.floor(cap) < batchSize) {
            batchSize = Math.floor(cap);
        }
    }
    return batchSize;
}
145
/**
 * loadExistingSections snapshots the sections already stored in the DB.
 *
 * Each row returned by db.querySections() contributes one entry keyed by its
 * id, holding the row's content_hash and its embedding BLOB decoded through
 * decodeVec (the row carries bytes, not a vector).
 *
 * @param {object} db - Store exposing querySections().
 * @returns {Map<string, {contentHash: string, embedding: Float32Array}>}
 */
function loadExistingSections(db) {
    const byId = new Map();
    for (const { id, content_hash: contentHash, embedding } of db.querySections()) {
        byId.set(id, { contentHash, embedding: decodeVec(embedding) });
    }
    return byId;
}
161
/**
 * loadExistingBlocks snapshots the block rows already stored in the DB.
 *
 * @param {object} db - Store exposing queryBlocks().
 * @returns {{keys: Set<string>, vecs: Map<string, Float32Array>}}
 *   - keys: blockKeyOf(section_id, block_index) for every row (used for pruning);
 *   - vecs: content_hash → decoded vector, for reuse. Rows whose embedding
 *     decodes to null (NULL/empty BLOB) are left out of this map.
 */
function loadExistingBlocks(db) {
    const keys = new Set();
    const vecs = new Map();
    for (const row of db.queryBlocks()) {
        keys.add(blockKeyOf(row.section_id, row.block_index));
        const decoded = decodeVecOrNull(row.embedding);
        if (decoded === null) {
            continue;
        }
        vecs.set(row.content_hash, decoded);
    }
    return { keys, vecs };
}
180
/**
 * collectSectionsAndBlocks walks <repoRoot>/docs/ and gathers, for every
 * markdown file found by walkMarkdown, the sections and block records that
 * extractFromFileWithBlocks returns for it.
 *
 * Every Section.file_path and BlockRecord.FilePath is overwritten with the
 * file's path relative to repoRoot, normalised to forward slashes. A file for
 * which extraction throws is skipped (non-fatal, best effort).
 *
 * @param {string} repoRoot - Repository root directory.
 * @param {object} cfg - Configuration, forwarded to walkMarkdown.
 * @returns {Promise<{sections: object[], blocks: object[]}>} Sections sorted
 *   by (file_path, start_line); blocks in traversal order.
 */
export async function collectSectionsAndBlocks(repoRoot, cfg) {
    const mdFiles = await walkMarkdown(path.join(repoRoot, "docs"), cfg);
    const sections = [];
    const blocks = [];
    for (const absPath of mdFiles) {
        let parsed;
        try {
            parsed = extractFromFileWithBlocks(absPath);
        }
        catch {
            // Non-fatal: skip unparseable/unreadable files (Go logs silently).
            continue;
        }
        const fileSections = parsed.sections;
        const fileBlocks = parsed.blocks;
        // Repo-relative path; fall back to the absolute path if it cannot be computed.
        let relative;
        try {
            relative = path.relative(repoRoot, absPath);
        }
        catch {
            relative = absPath;
        }
        const relPath = relative.split(path.sep).join("/");
        for (const sec of fileSections) {
            sec.file_path = relPath;
        }
        for (const blk of fileBlocks) {
            blk.FilePath = relPath;
        }
        sections.push(...fileSections);
        blocks.push(...fileBlocks);
    }
    // Deterministic order: file path first, then position within the file.
    sections.sort((x, y) => {
        if (x.file_path !== y.file_path) {
            return x.file_path < y.file_path ? -1 : 1;
        }
        return x.start_line - y.start_line;
    });
    return { sections, blocks };
}
236
/**
 * walkMarkdown lists every path ending in ".md" beneath docsRoot.
 *
 * Directories are skipped (not descended into) when their name starts with a
 * non-empty cfg.Markdown.hidden_dir_prefix or is listed in
 * cfg.Markdown.ignored_dirs. Entries of each directory are visited in name
 * order, so the result is deterministic. A readdir failure (e.g. docsRoot
 * missing) rejects the returned promise.
 *
 * @param {string} docsRoot - Directory to traverse.
 * @param {object} cfg - Configuration; reads cfg.Markdown.hidden_dir_prefix
 *   and cfg.Markdown.ignored_dirs.
 * @returns {Promise<string[]>} Matching file paths, each joined onto docsRoot.
 */
async function walkMarkdown(docsRoot, cfg) {
    const found = [];
    const byName = (x, y) => {
        if (x.name < y.name) {
            return -1;
        }
        return x.name > y.name ? 1 : 0;
    };
    // Config is read lazily, only once a directory entry is actually seen.
    const isSkippedDir = (dirName) => {
        const prefix = cfg.Markdown.hidden_dir_prefix;
        if (prefix !== "" && dirName.startsWith(prefix)) {
            return true;
        }
        return cfg.Markdown.ignored_dirs.includes(dirName);
    };
    const visit = async (dir) => {
        const children = await fsp.readdir(dir, { withFileTypes: true });
        children.sort(byName);
        for (const child of children) {
            const childPath = path.join(dir, child.name);
            if (!child.isDirectory()) {
                if (childPath.endsWith(".md")) {
                    found.push(childPath);
                }
                continue;
            }
            if (isSkippedDir(child.name)) {
                continue;
            }
            await visit(childPath);
        }
    };
    await visit(docsRoot);
    return found;
}
272
/**
 * computeInboundCounts tallies, per section ID, how many links from other
 * sections point at it.
 *
 * Links are obtained from parseLinks(raw_content, file_path, externalPrefixes)
 * for each section. A link with an anchor resolves through the
 * (file, anchor) index; a link without an anchor resolves to the first
 * section of the target file in the order `sections` is given (callers pass
 * it sorted by file and start line). Unresolved links and links whose target
 * is the source section itself are not counted.
 *
 * @param {object[]} sections - Live sections (id, file_path, anchor, raw_content).
 * @param {string[]} externalPrefixes - Forwarded to parseLinks
 *   (cfg.Indexer.external_url_prefixes at the call site).
 * @returns {Map<string, number>} Section ID → inbound link count; sections
 *   with no inbound links are absent.
 */
export function computeInboundCounts(sections, externalPrefixes) {
    const anchorKey = (file, anchor) => `${file}\u0000${anchor}`;
    const idByAnchor = new Map();
    const firstIdByFile = new Map();
    for (const sec of sections) {
        idByAnchor.set(anchorKey(sec.file_path, sec.anchor), sec.id);
        // The first section seen for a file is the target of anchor-less links.
        if (!firstIdByFile.has(sec.file_path)) {
            firstIdByFile.set(sec.file_path, sec.id);
        }
    }
    const tally = new Map();
    for (const source of sections) {
        for (const link of parseLinks(source.raw_content, source.file_path, externalPrefixes)) {
            const targetID = link.anchor !== ""
                ? idByAnchor.get(anchorKey(link.filePath, link.anchor))
                : firstIdByFile.get(link.filePath);
            const countable = targetID !== undefined && targetID !== "" && targetID !== source.id;
            if (countable) {
                tally.set(targetID, (tally.get(targetID) ?? 0) + 1);
            }
        }
    }
    return tally;
}
312
/**
 * applyChanges issues every write of an indexing run inside one db.execTx
 * callback: section upserts / inbound-count updates, section deletes, block
 * upserts and block deletes, in that order.
 *
 * @param {object} db - Store exposing execTx(fn); fn receives a connection
 *   with prepare(sql) returning a statement with run(...params).
 * @param {object[]} sections - Live sections.
 * @param {Map<string, ArrayLike<number>>} embeddings - Section ID → vector.
 * @param {Map<string, number>} inbound - Section ID → inbound link count
 *   (missing means 0).
 * @param {string[]} toPrune - Section IDs to delete.
 * @param {Map<string, {contentHash: string}>} existing - Sections already
 *   stored, used to tell unchanged sections from new/changed ones.
 * @param {object[]} eligibleBlocks - Block records to upsert.
 * @param {Map<string, ArrayLike<number>>} blockVecs - ContentHash → vector.
 * @param {string[]} blocksToPrune - "<sid>\x00<idx>" keys of block rows to delete.
 * @param {object} cfg - Configuration, forwarded to blockEmbeddable.
 * @returns {void}
 */
function applyChanges(db, sections, embeddings, inbound, toPrune, existing, eligibleBlocks, blockVecs, blocksToPrune, cfg) {
    // Second-precision UTC timestamp: ISO string with the milliseconds removed.
    const now = new Date().toISOString().replace(/\.\d{3}Z$/, "Z");
    db.execTx((conn) => {
        const updateInbound = conn.prepare(`UPDATE sections SET inbound_count=?, updated_at=? WHERE id=?`);
        const upsertSection = conn.prepare(`
            INSERT OR REPLACE INTO sections
            (id, file_path, heading, heading_level, anchor, start_line, end_line,
            content_hash, raw_content, embed_text, prose_word_count,
            has_table, has_code, inbound_count, embedding, updated_at)
            VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)`);
        const deleteSection = conn.prepare(`DELETE FROM sections WHERE id=?`);
        const upsertBlock = conn.prepare(`
            INSERT OR REPLACE INTO blocks
            (section_id, block_index, file_path, heading, kind,
            start_line, end_line, content_hash, embedding)
            VALUES (?,?,?,?,?,?,?,?,?)`);
        const deleteBlock = conn.prepare(`DELETE FROM blocks WHERE section_id=? AND block_index=?`);
        for (const s of sections) {
            const inboundCount = inbound.get(s.id) ?? 0;
            const old = existing.get(s.id);
            if (old !== undefined && old.contentHash === s.content_hash) {
                // Unchanged content: the stored row (and its embedding) stays; only
                // inbound_count and updated_at are refreshed.
                updateInbound.run(inboundCount, now, s.id);
                continue;
            }
            // New or changed section: encode the vector only now, so unchanged
            // sections do not pay for an encoding that is never written.
            const blob = encodeVec(embeddings.get(s.id));
            upsertSection.run(s.id, s.file_path, s.heading, s.heading_level, s.anchor, s.start_line, s.end_line, s.content_hash, s.raw_content, s.embed_text, s.prose_word_count, boolInt(s.has_table), boolInt(s.has_code), inboundCount, blob, now);
        }
        // Prune deleted sections.
        for (const id of toPrune) {
            deleteSection.run(id);
        }
        // Upsert eligible blocks. Embeddable blocks store their vector; all other
        // eligible blocks are stored with a NULL embedding.
        for (const b of eligibleBlocks) {
            const blob = blockEmbeddable(b, cfg) ? encodeVec(blockVecs.get(b.ContentHash)) : null;
            upsertBlock.run(b.SectionID, b.Index, b.FilePath, b.Heading, b.Kind, b.StartLine, b.EndLine, b.ContentHash, blob);
        }
        // Prune stale block rows. The index is the part after the LAST NUL, so a
        // section ID that itself contains a NUL still splits correctly.
        for (const key of blocksToPrune) {
            const sep = key.lastIndexOf("\u0000");
            deleteBlock.run(key.slice(0, sep), Number(key.slice(sep + 1)));
        }
    });
}
371
/**
 * blockEligible reports whether a block clears the per-block gate.
 *
 *   - "prose": eligible iff countFields(b.Text) >= cfg.Block.min_words.
 *   - "table": eligible iff b.TableRows >= cfg.Block.table_min_rows.
 *   - any other kind: never eligible.
 *
 * @param {object} b - Block record (Kind, plus Text or TableRows).
 * @param {object} cfg - Configuration; reads cfg.Block.
 * @returns {boolean}
 */
export function blockEligible(b, cfg) {
    const kind = b.Kind;
    if (kind === "prose") {
        return countFields(b.Text) >= cfg.Block.min_words;
    }
    if (kind === "table") {
        return b.TableRows >= cfg.Block.table_min_rows;
    }
    return false;
}
393
/**
 * countFields counts the whitespace-delimited tokens in s: maximal runs of
 * non-whitespace characters. Leading, trailing and repeated whitespace
 * produce no empty tokens. Intended as the counterpart of Go's strings.Fields.
 *
 * @param {string} s - Text to tokenise.
 * @returns {number} Token count (0 for an empty or all-whitespace string).
 */
function countFields(s) {
    const tokens = s.match(/\S+/g);
    return tokens === null ? 0 : tokens.length;
}
404
/**
 * blockEmbeddable reports whether an eligible block should carry a vector.
 *
 * Only "prose" blocks qualify, and a prose block is excluded when
 * headingBlacklisted(cfg.Analyzer, b.Heading) is true. Callers in this file
 * use the same gate both when choosing what to embed and when choosing
 * whether to store a vector or NULL, so a block that stops being embeddable
 * is written back without its vector on the next run.
 *
 * @param {object} b - Block record (Kind, Heading).
 * @param {object} cfg - Configuration; reads cfg.Analyzer.
 * @returns {boolean}
 */
export function blockEmbeddable(b, cfg) {
    if (b.Kind !== "prose") {
        return false;
    }
    return !headingBlacklisted(cfg.Analyzer, b.Heading);
}
418
/**
 * embedBlocks embeds the unique embeddable blocks of `eligible`, reusing any
 * vector already known for the same ContentHash.
 *
 * Rules:
 *   - Only blocks passing blockEmbeddable are submitted to the embedder.
 *   - A ContentHash already present in `existing` is reused, not re-embedded.
 *   - Within one call each ContentHash is embedded at most once.
 *   - Batches follow the order of `eligible`, sized by resolveBatchSize.
 *
 * @param {object} emb - Embedder providing `embed(texts)`, resolving to one
 *   vector per input text.
 * @param {object[]} eligible - Eligible block records (Kind, Heading,
 *   ContentHash, Text).
 * @param {Map<string, Float32Array>} existing - ContentHash → stored vector;
 *   copied, never mutated.
 * @param {object} cfg - Configuration, forwarded to the helpers.
 * @returns {Promise<Map<string, Float32Array>>} ContentHash → vector: every
 *   entry of `existing` plus the newly embedded hashes.
 * @throws {Error} If the embedder returns fewer vectors than texts submitted.
 */
export async function embedBlocks(emb, eligible, existing, cfg) {
    const batchSize = resolveBatchSize(emb, cfg);
    // Start from a copy of the existing vectors.
    const result = new Map(existing);
    // Unique embeddable hashes not already known, in input order.
    const seen = new Set();
    const toEmbed = [];
    for (const b of eligible) {
        if (!blockEmbeddable(b, cfg) || result.has(b.ContentHash) || seen.has(b.ContentHash)) {
            continue;
        }
        seen.add(b.ContentHash);
        toEmbed.push(b);
    }
    for (let i = 0; i < toEmbed.length; i += batchSize) {
        const batch = toEmbed.slice(i, Math.min(i + batchSize, toEmbed.length));
        const vecs = await emb.embed(batch.map((b) => b.Text));
        // A short result would otherwise surface as an opaque TypeError from
        // Float32Array.from(undefined); fail with a message that names the cause.
        if (vecs == null || vecs.length < batch.length) {
            throw new Error(`embedder returned ${vecs?.length ?? 0} vectors for a batch of ${batch.length} blocks`);
        }
        for (let j = 0; j < batch.length; j++) {
            result.set(batch[j].ContentHash, Float32Array.from(vecs[j]));
        }
    }
    return result;
}
462
/**
 * encodeVec serialises a vector as consecutive little-endian float32 values
 * (4 bytes per element).
 *
 * @param {ArrayLike<number>|undefined} v - Vector to encode.
 * @returns {Uint8Array|null} The encoded bytes, or null (stored as a SQL
 *   NULL) when v is undefined or has no elements.
 */
export function encodeVec(v) {
    const isEmpty = v === undefined || v.length === 0;
    if (isEmpty) {
        return null;
    }
    const bytesPerFloat = 4;
    const bytes = new Uint8Array(v.length * bytesPerFloat);
    const writer = new DataView(bytes.buffer);
    for (let i = 0, offset = 0; i < v.length; i++, offset += bytesPerFloat) {
        // Third argument `true` selects little-endian byte order.
        writer.setFloat32(offset, v[i], true);
    }
    return bytes;
}
478
/**
 * decodeVec decodes a little-endian float32 BLOB into a Float32Array,
 * delegating to decodeVecOrNull and substituting an empty Float32Array when
 * that helper reports no vector (empty or NULL blob).
 *
 * @param {Uint8Array|null} buf - Raw BLOB bytes.
 * @returns {Float32Array} The decoded vector; length 0 when there is none.
 */
export function decodeVec(buf) {
    const decoded = decodeVecOrNull(buf);
    if (decoded == null) {
        return new Float32Array(0);
    }
    return decoded;
}
486
/**
 * decodeVecOrNull decodes a BLOB of little-endian float32 values.
 *
 * @param {Uint8Array|null|undefined} buf - Raw BLOB bytes. The view's
 *   byteOffset/byteLength are honoured, so a Buffer or subarray sharing a
 *   larger ArrayBuffer decodes correctly.
 * @returns {Float32Array|null} The decoded vector, or null when buf is
 *   null, undefined or empty. Trailing bytes that do not form a complete
 *   4-byte float are ignored.
 */
function decodeVecOrNull(buf) {
    // `== null` covers both null and undefined, mirroring encodeVec, which
    // already treats an undefined vector as "no vector".
    if (buf == null || buf.length === 0) {
        return null;
    }
    const v = new Float32Array(Math.floor(buf.length / 4));
    const view = new DataView(buf.buffer, buf.byteOffset, buf.byteLength);
    for (let i = 0; i < v.length; i++) {
        // Third argument `true` selects little-endian byte order.
        v[i] = view.getFloat32(i * 4, true);
    }
    return v;
}
498
/**
 * boolInt maps a truthy value to 1 and a falsy value to 0, the integer form
 * used for boolean columns in the SQLite rows written by this module.
 *
 * @param {*} b - Value to convert.
 * @returns {0|1}
 */
function boolInt(b) {
    return Number(Boolean(b));
}