docsgov 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (159)
  1. package/README.md +242 -0
  2. package/dist/apispec/apispec.js +401 -0
  3. package/dist/apispec/apispec.test.js +444 -0
  4. package/dist/apispec/errors.js +17 -0
  5. package/dist/apispec/index.js +2 -0
  6. package/dist/check/doclinks.js +167 -0
  7. package/dist/check/index.js +8 -0
  8. package/dist/check/run.js +391 -0
  9. package/dist/check/run.test.js +513 -0
  10. package/dist/check/suggest.js +134 -0
  11. package/dist/check/suggest.test.js +92 -0
  12. package/dist/check/tokens.js +125 -0
  13. package/dist/cmd/main.js +330 -0
  14. package/dist/cmd/main.test.js +422 -0
  15. package/dist/codeq/cache.js +71 -0
  16. package/dist/codeq/cache.test.js +67 -0
  17. package/dist/codeq/errors.js +52 -0
  18. package/dist/codeq/grammars/tree-sitter-go.wasm +0 -0
  19. package/dist/codeq/grammars/tree-sitter-java.wasm +0 -0
  20. package/dist/codeq/grammars/tree-sitter-javascript.wasm +0 -0
  21. package/dist/codeq/grammars/tree-sitter-tsx.wasm +0 -0
  22. package/dist/codeq/grammars/tree-sitter-typescript.wasm +0 -0
  23. package/dist/codeq/index.js +11 -0
  24. package/dist/codeq/resolve.test.js +109 -0
  25. package/dist/codeq/resolver.js +128 -0
  26. package/dist/codeq/resolver.test.js +124 -0
  27. package/dist/codeq/resolvers/go.js +242 -0
  28. package/dist/codeq/resolvers/go.test.js +143 -0
  29. package/dist/codeq/resolvers/java.js +349 -0
  30. package/dist/codeq/resolvers/java.test.js +138 -0
  31. package/dist/codeq/resolvers/java_queries.js +63 -0
  32. package/dist/codeq/resolvers/javascript.js +412 -0
  33. package/dist/codeq/resolvers/javascript.test.js +125 -0
  34. package/dist/codeq/resolvers/javascript_queries.js +46 -0
  35. package/dist/codeq/resolvers/typescript.js +366 -0
  36. package/dist/codeq/resolvers/typescript.test.js +180 -0
  37. package/dist/codeq/resolvers/typescript_queries.js +78 -0
  38. package/dist/codeq/signature.js +50 -0
  39. package/dist/codeq/signature.test.js +50 -0
  40. package/dist/codeq/suggest.js +96 -0
  41. package/dist/codeq/treesitter.js +122 -0
  42. package/dist/codeq/treesitter.test.js +118 -0
  43. package/dist/config/config.js +74 -0
  44. package/dist/config/config.test.js +98 -0
  45. package/dist/config/fs.js +116 -0
  46. package/dist/config/glob.js +82 -0
  47. package/dist/config/glob.test.js +61 -0
  48. package/dist/config/index.js +4 -0
  49. package/dist/dedup/analyzer/analyzer.js +533 -0
  50. package/dist/dedup/analyzer/analyzer.test.js +530 -0
  51. package/dist/dedup/analyzer/canonical.js +74 -0
  52. package/dist/dedup/analyzer/canonical.test.js +70 -0
  53. package/dist/dedup/analyzer/cosine_clusters.js +169 -0
  54. package/dist/dedup/analyzer/cosine_clusters.test.js +131 -0
  55. package/dist/dedup/analyzer/distinctive.js +85 -0
  56. package/dist/dedup/analyzer/distinctive.test.js +49 -0
  57. package/dist/dedup/analyzer/exact_clusters.js +63 -0
  58. package/dist/dedup/analyzer/exact_clusters.test.js +81 -0
  59. package/dist/dedup/analyzer/index.js +14 -0
  60. package/dist/dedup/analyzer/multiplicity.js +110 -0
  61. package/dist/dedup/analyzer/multiplicity.test.js +123 -0
  62. package/dist/dedup/analyzer/order.js +22 -0
  63. package/dist/dedup/analyzer/partial_overlaps.js +65 -0
  64. package/dist/dedup/analyzer/partial_overlaps.test.js +161 -0
  65. package/dist/dedup/analyzer/preview.js +84 -0
  66. package/dist/dedup/analyzer/preview.test.js +46 -0
  67. package/dist/dedup/analyzer/safety.js +27 -0
  68. package/dist/dedup/analyzer/safety.test.js +39 -0
  69. package/dist/dedup/config.js +18 -0
  70. package/dist/dedup/configload.js +299 -0
  71. package/dist/dedup/configload.test.js +410 -0
  72. package/dist/dedup/dedup.index.test.js +203 -0
  73. package/dist/dedup/dedup.js +143 -0
  74. package/dist/dedup/dedup.test.js +212 -0
  75. package/dist/dedup/dedupcfg/config.js +112 -0
  76. package/dist/dedup/dedupcfg/config.test.js +70 -0
  77. package/dist/dedup/dedupcfg/index.js +1 -0
  78. package/dist/dedup/deduptypes/index.js +1 -0
  79. package/dist/dedup/deduptypes/types.js +9 -0
  80. package/dist/dedup/deduptypes/types.test.js +34 -0
  81. package/dist/dedup/embedder/cache.js +23 -0
  82. package/dist/dedup/embedder/cache.test.js +50 -0
  83. package/dist/dedup/embedder/constants.js +10 -0
  84. package/dist/dedup/embedder/embedder.js +76 -0
  85. package/dist/dedup/embedder/embedder.mock.test.js +128 -0
  86. package/dist/dedup/embedder/embedder.test.js +96 -0
  87. package/dist/dedup/embedder/errors.js +20 -0
  88. package/dist/dedup/embedder/errors.test.js +35 -0
  89. package/dist/dedup/embedder/index.js +4 -0
  90. package/dist/dedup/embedder/session.js +78 -0
  91. package/dist/dedup/embedder/session.test.js +172 -0
  92. package/dist/dedup/gitignore.js +97 -0
  93. package/dist/dedup/gitignore.test.js +98 -0
  94. package/dist/dedup/index.js +11 -0
  95. package/dist/dedup/indexdb/errors.js +48 -0
  96. package/dist/dedup/indexdb/index.js +6 -0
  97. package/dist/dedup/indexdb/indexdb.js +302 -0
  98. package/dist/dedup/indexdb/indexdb.test.js +739 -0
  99. package/dist/dedup/indexdb/load.js +110 -0
  100. package/dist/dedup/indexdb/migrations.js +58 -0
  101. package/dist/dedup/indexdb/schema.js +83 -0
  102. package/dist/dedup/indexer/index.js +9 -0
  103. package/dist/dedup/indexer/indexer.js +501 -0
  104. package/dist/dedup/indexer/indexer.test.js +510 -0
  105. package/dist/dedup/indexer/links.js +89 -0
  106. package/dist/dedup/mdsection/anchor.js +60 -0
  107. package/dist/dedup/mdsection/anchor.test.js +39 -0
  108. package/dist/dedup/mdsection/blocks.js +409 -0
  109. package/dist/dedup/mdsection/blocks.test.js +359 -0
  110. package/dist/dedup/mdsection/index.js +4 -0
  111. package/dist/dedup/mdsection/parse.js +21 -0
  112. package/dist/dedup/mdsection/section.js +234 -0
  113. package/dist/dedup/mdsection/section.test.js +221 -0
  114. package/dist/dedup/report/floatfmt.js +71 -0
  115. package/dist/dedup/report/floatfmt.test.js +42 -0
  116. package/dist/dedup/report/index.js +8 -0
  117. package/dist/dedup/report/quote.js +77 -0
  118. package/dist/dedup/report/quote.test.js +67 -0
  119. package/dist/dedup/report/text.js +251 -0
  120. package/dist/dedup/report/text.test.js +420 -0
  121. package/dist/dedup/report_types.js +8 -0
  122. package/dist/dedup/sectionid/index.js +1 -0
  123. package/dist/dedup/sectionid/sectionid.js +16 -0
  124. package/dist/dedup/sectionid/sectionid.test.js +49 -0
  125. package/dist/guard/api/errors.js +12 -0
  126. package/dist/guard/api/index.js +2 -0
  127. package/dist/guard/api/parser.js +81 -0
  128. package/dist/guard/api/parser.test.js +58 -0
  129. package/dist/guard/api/types.js +1 -0
  130. package/dist/guard/code/errors.js +16 -0
  131. package/dist/guard/code/index.js +2 -0
  132. package/dist/guard/code/parser.js +54 -0
  133. package/dist/guard/code/parser.test.js +111 -0
  134. package/dist/guard/code/types.js +6 -0
  135. package/dist/index.js +1 -0
  136. package/dist/index.test.js +5 -0
  137. package/dist/repo/boundary.js +92 -0
  138. package/dist/repo/boundary.test.js +65 -0
  139. package/dist/repo/errors.js +56 -0
  140. package/dist/repo/errors.test.js +85 -0
  141. package/dist/repo/exists.test.js +72 -0
  142. package/dist/repo/filename.js +46 -0
  143. package/dist/repo/filename.test.js +39 -0
  144. package/dist/repo/fs.js +53 -0
  145. package/dist/repo/index.js +7 -0
  146. package/dist/repo/overlay.js +36 -0
  147. package/dist/repo/overlay.test.js +80 -0
  148. package/dist/repo/repo.js +353 -0
  149. package/dist/repo/repo.test.js +255 -0
  150. package/dist/repo/testutil.js +27 -0
  151. package/dist/repo/write.test.js +125 -0
  152. package/dist/report/color.js +73 -0
  153. package/dist/report/index.js +1 -0
  154. package/dist/report/report.js +112 -0
  155. package/dist/report/report.test.js +368 -0
  156. package/dist/violation/index.js +1 -0
  157. package/dist/violation/types.js +22 -0
  158. package/dist/violation/types.test.js +70 -0
  159. package/package.json +48 -0
@@ -0,0 +1,510 @@
1
+ /**
2
+ * Behavior-encoding tests for the dedup indexing pipeline, ported from
3
+ * internal/dedup/indexer/indexer_test.go.
4
+ *
5
+ * Each test states WHY the behavior matters. The indexer is the incremental
6
+ * cache: it must (a) persist eligible sections/blocks, (b) re-embed ONLY the
7
+ * sections/blocks whose content hash changed — the cache's whole reason to exist,
8
+ * (c) populate inbound_count from intra-corpus links, (d) prune rows for deleted
9
+ * files, and (e) roll the whole run back on any failure so a crash never leaves a
10
+ * half-written index.
11
+ *
12
+ * The embedder is injected as a deterministic fake so tests never download the
13
+ * ~1GB model — this mirrors Go, which mocks the embedder at the indexer level.
14
+ * The DB is a real node:sqlite index opened at a temp path (the same engine the
15
+ * package uses), and raw queries read it back to assert on persisted state.
16
+ */
17
+ import { mkdtempSync, rmSync, writeFileSync, mkdirSync, unlinkSync } from "node:fs";
18
+ import { tmpdir } from "node:os";
19
+ import { dirname, join } from "node:path";
20
+ import { DatabaseSync } from "node:sqlite";
21
+ import { afterEach, describe, expect, it } from "vitest";
22
+ import { Dimension, Model } from "../embedder/index.js";
23
+ import { open } from "../indexdb/index.js";
24
+ import { defaultConfig } from "../dedupcfg/index.js";
25
+ import { blockEligible, blockEmbeddable, collectSectionsAndBlocks, embedBlocks, parseLinks, run, } from "./index.js";
26
+ // --- fake embedder ---------------------------------------------------------
27
/**
 * FakeEmbedder is a stand-in embedder for tests.
 *
 * - `calls` counts the embed() invocations that completed without throwing.
 * - `batches` keeps a copy of the texts passed to each of those invocations.
 * - `errOn`, when set to a non-null value, is thrown by the NEXT embed() call and
 *   then cleared, so exactly one call fails; the failing call is not recorded.
 */
class FakeEmbedder {
    calls = 0;
    batches = [];
    errOn = null;

    /**
     * Returns one vector per input text. The vector depends only on the text's
     * position inside the batch (not its content), which keeps results
     * deterministic.
     */
    async embed(texts) {
        const pendingFailure = this.errOn;
        if (pendingFailure !== null) {
            // One-shot failure: clear the trigger before throwing.
            this.errOn = null;
            throw pendingFailure;
        }
        this.calls += 1;
        // Store a copy so later mutation of the caller's array cannot alter the record.
        this.batches.push([...texts]);
        return texts.map((_text, position) => unitVec(position, Dimension));
    }

    /** Reports the model identifier imported from the embedder package. */
    name() {
        return Model;
    }

    /** Reports the vector dimension imported from the embedder package. */
    dimension() {
        return Dimension;
    }

    /** Sums the sizes of every recorded batch. */
    totalTexts() {
        let total = 0;
        for (const batch of this.batches) {
            total += batch.length;
        }
        return total;
    }
}
56
/**
 * unitVec produces a deterministic one-hot vector of length `dim` with a 1.0 at
 * position (seed mod dim) and 0 everywhere else.
 *
 * The index uses a true (non-negative) modulo: JavaScript's `%` keeps the sign
 * of the dividend, so a negative seed would otherwise address a negative
 * "index", leaving the vector all-zero and adding a stray property to the array.
 *
 * @param {number} seed - integer selecting which slot is hot; may be negative.
 * @param {number} dim - vector length; must be a positive integer.
 * @returns {number[]} array of `dim` numbers containing exactly one 1.0.
 * @throws {RangeError} when `dim` is not a positive integer.
 */
function unitVec(seed, dim) {
    if (!Number.isInteger(dim) || dim <= 0) {
        throw new RangeError(`unitVec: dim must be a positive integer, got ${dim}`);
    }
    const hot = ((seed % dim) + dim) % dim;
    const v = new Array(dim).fill(0);
    v[hot] = 1.0;
    return v;
}
62
+ // --- temp dir / DB plumbing ------------------------------------------------
63
// Registries of resources created while a test runs. The helpers below push
// into them; the afterEach hook drains them.
const tmpDirs = [];
const openStores = [];
afterEach(() => {
    // Stores are closed before any directory is removed. splice(0) empties the
    // registry and hands back its previous contents in one step.
    const stores = openStores.splice(0);
    stores.forEach((store) => {
        try {
            store.close();
        }
        catch {
            /* already closed */
        }
    });
    const dirs = tmpDirs.splice(0);
    dirs.forEach((dir) => {
        rmSync(dir, { recursive: true, force: true });
    });
});
78
/**
 * openTestDB creates a fresh temp directory, opens an index store at
 * `<dir>/index.db` with the imported Model and Dimension, and registers both the
 * directory and the store for the afterEach cleanup.
 *
 * @returns {{store: object, dbPath: string}} the opened store and the DB file path.
 */
function openTestDB() {
    const scratch = mkdtempSync(join(tmpdir(), "indexer-db-"));
    tmpDirs.push(scratch);
    const dbPath = join(scratch, "index.db");
    const opened = open(dbPath, Model, Dimension);
    const store = opened.store;
    openStores.push(store);
    return { store, dbPath };
}
87
/**
 * makeTestRepo materialises a throwaway repository on disk.
 *
 * @param {Record<string, string>} files - map of repo-relative path to file content.
 * @returns {string} the temp root under which every file was written; the root is
 *   registered in tmpDirs so the afterEach hook removes it.
 */
function makeTestRepo(files) {
    const root = mkdtempSync(join(tmpdir(), "indexer-repo-"));
    tmpDirs.push(root);
    Object.keys(files).forEach((relPath) => {
        const target = join(root, relPath);
        // Parent directories are created on demand so nested paths just work.
        mkdirSync(dirname(target), { recursive: true });
        writeFileSync(target, files[relPath]);
    });
    return root;
}
98
/**
 * noProgress is a do-nothing callback passed as the last argument of run() in
 * these tests (presumably the progress reporter — confirm against run's signature).
 */
const noProgress = () => undefined;
99
+ // --- raw-DB read helpers (assert on persisted state) -----------------------
100
/**
 * rawDB opens a new node:sqlite connection to the file at dbPath. The caller
 * owns the connection and must close it.
 */
function rawDB(dbPath) {
    const connection = new DatabaseSync(dbPath);
    return connection;
}
103
/**
 * countSectionsInDB returns the number of rows in the `sections` table, read
 * through a short-lived connection that is always closed.
 */
function countSectionsInDB(dbPath) {
    const conn = rawDB(dbPath);
    try {
        const { n } = conn.prepare(`SELECT COUNT(*) AS n FROM sections`).get();
        return n;
    }
    finally {
        conn.close();
    }
}
113
/**
 * sectionHashesFromDB returns every section's content_hash, ordered by
 * (file_path, start_line), read through a short-lived connection.
 */
function sectionHashesFromDB(dbPath) {
    const conn = rawDB(dbPath);
    try {
        const statement = conn.prepare(`SELECT content_hash FROM sections ORDER BY file_path, start_line`);
        const hashes = [];
        for (const record of statement.all()) {
            hashes.push(record.content_hash);
        }
        return hashes;
    }
    finally {
        conn.close();
    }
}
125
/**
 * countBlocksInDB returns the number of rows in the `blocks` table, read through
 * a short-lived connection that is always closed.
 */
function countBlocksInDB(dbPath) {
    const conn = rawDB(dbPath);
    try {
        const { n } = conn.prepare(`SELECT COUNT(*) AS n FROM blocks`).get();
        return n;
    }
    finally {
        conn.close();
    }
}
135
/**
 * blockFilePathsFromDB returns the distinct file_path values present in the
 * `blocks` table, sorted by file_path.
 */
function blockFilePathsFromDB(dbPath) {
    const conn = rawDB(dbPath);
    try {
        const statement = conn.prepare(`SELECT DISTINCT file_path FROM blocks ORDER BY file_path`);
        const paths = [];
        for (const record of statement.all()) {
            paths.push(record.file_path);
        }
        return paths;
    }
    finally {
        conn.close();
    }
}
147
/**
 * blockHashesFromDB reads the blocks stored for one file and returns a Map keyed
 * by "<section_id>|<block_index>" whose values are the blocks' content_hash.
 *
 * @param {string} dbPath - path of the SQLite index file.
 * @param {string} filePath - value matched against blocks.file_path.
 * @returns {Map<string, string>}
 */
function blockHashesFromDB(dbPath, filePath) {
    const conn = rawDB(dbPath);
    try {
        const records = conn
            .prepare(`SELECT section_id, block_index, content_hash FROM blocks WHERE file_path=?`)
            .all(filePath);
        return new Map(records.map((rec) => [`${rec.section_id}|${rec.block_index}`, rec.content_hash]));
    }
    finally {
        conn.close();
    }
}
163
/**
 * blockEmbedNullByHeadingFromDB returns a Map from block heading to a boolean
 * that is true when that block's embedding column IS NULL. When several blocks
 * share a heading, the last row read wins.
 */
function blockEmbedNullByHeadingFromDB(dbPath) {
    const conn = rawDB(dbPath);
    try {
        const records = conn
            .prepare(`SELECT heading, (embedding IS NULL) AS is_null FROM blocks`)
            .all();
        // SQLite yields 0/1 for the IS NULL expression; anything but 0 counts as NULL.
        return new Map(records.map((rec) => [rec.heading, rec.is_null !== 0]));
    }
    finally {
        conn.close();
    }
}
179
+ // --- tests -----------------------------------------------------------------
180
+ describe("indexer.run", () => {
181
// WHY: smoke test — indexing a one-file corpus must report at least one section
// in the returned stats AND leave at least one row in the sections table.
it("indexes a simple corpus and persists sections", async () => {
    const corpus = {
        "docs/guide.md": "## Introduction\n\nThis is the introduction section of the guide. It covers the basic concepts and provides an overview of the system. Users should read this first.\n",
    };
    const repoRoot = makeTestRepo(corpus);
    const db = openTestDB();
    const embedder = new FakeEmbedder();
    const stats = await run(db.store, embedder, repoRoot, defaultConfig(), noProgress);
    expect(stats.sections).toBeGreaterThan(0);
    expect(countSectionsInDB(db.dbPath)).toBeGreaterThan(0);
});
192
// WHY: re-indexing a corpus that has not changed must not call the embedder
// again; the test compares the fake's call counter before and after run two.
it("is idempotent: a second run on an unchanged corpus makes 0 new embed calls", async () => {
    const repoRoot = makeTestRepo({
        "docs/guide.md": "## Introduction\n\nThis is the introduction section of the guide. It covers the basic concepts and provides an overview of the system. Users should read this first.\n\n## Setup\n\nThe setup section explains installation and configuration. Follow the steps below to get started with the tool. Make sure you have all prerequisites installed.\n",
    });
    const { store } = openTestDB();
    const emb = new FakeEmbedder();
    const reindex = () => run(store, emb, repoRoot, defaultConfig(), noProgress);
    await reindex();
    const baselineCalls = emb.calls;
    await reindex();
    expect(emb.calls).toBe(baselineCalls);
});
205
// WHY: after editing one of two sections, the second run is expected to embed
// exactly 2 new texts — per the original author's note, the edited section plus
// its single prose block — while the untouched section contributes none.
it("re-embeds only the changed section and its block", async () => {
    const introOriginal = "## Introduction\n\nThis is the introduction section of the guide. It covers the basic concepts and provides an overview of the system. Users should read this first.\n\n";
    const setupUnchanged = "## Setup\n\nThe setup section explains installation and configuration. Follow the steps below to get started with the tool. Make sure you have all prerequisites installed.\n";
    const introEdited = "## Introduction\n\nThis introduction has been updated with new content. It covers advanced concepts. Users should read this thoroughly.\n\n";
    const repoRoot = makeTestRepo({ "docs/guide.md": introOriginal + setupUnchanged });
    const { store } = openTestDB();
    const emb = new FakeEmbedder();
    await run(store, emb, repoRoot, defaultConfig(), noProgress);
    const embeddedBeforeEdit = emb.totalTexts();
    writeFileSync(join(repoRoot, "docs/guide.md"), introEdited + setupUnchanged);
    await run(store, emb, repoRoot, defaultConfig(), noProgress);
    expect(emb.totalTexts() - embeddedBeforeEdit).toBe(2);
});
224
// WHY: sections must be WRITTEN in (file_path, start_line) order regardless of
// filesystem walk order (the original author cites parity with the Python POC).
//
// The rows are read back in rowid order — i.e. insertion order on a fresh DB —
// and compared with a sorted copy. Reading them with
// `ORDER BY file_path, start_line` would sort them in SQL first and make the
// assertion pass no matter how the indexer wrote them.
// NOTE(review): assumes `sections` is an ordinary rowid table (not WITHOUT
// ROWID) — confirm against the indexdb schema.
it("writes sections in deterministic (file_path, start_line) order", async () => {
    // The fixture lists z_last before a_first on purpose.
    const repoRoot = makeTestRepo({
        "docs/z_last.md": "## Zebra Section\n\nThis section comes last in lexicographic order. It has sufficient prose words to be eligible for indexing. Multiple sentences ensure the minimum word count is met.\n",
        "docs/a_first.md": "## Apple Section\n\nThis section comes first in lexicographic order. It has sufficient prose words to be eligible for indexing. Multiple sentences ensure the minimum word count is met.\n",
    });
    const { store, dbPath } = openTestDB();
    const emb = new FakeEmbedder();
    await run(store, emb, repoRoot, defaultConfig(), noProgress);
    const db = rawDB(dbPath);
    let written;
    try {
        written = db
            .prepare(`SELECT file_path, start_line FROM sections ORDER BY rowid`)
            .all()
            // Copy into plain objects so equality does not depend on the row prototype.
            .map((r) => ({ file_path: r.file_path, start_line: r.start_line }));
    }
    finally {
        db.close();
    }
    // Ordering is only observable with at least two rows (one section per file).
    expect(written.length).toBeGreaterThanOrEqual(2);
    const byPathThenLine = (a, b) => {
        if (a.file_path !== b.file_path) {
            return a.file_path < b.file_path ? -1 : 1;
        }
        return a.start_line - b.start_line;
    };
    const expected = [...written].sort(byPathThenLine);
    expect(written).toEqual(expected);
});
245
// WHY: once a file is deleted from the repo, the next run must drop its
// sections; the test checks that the sections row count shrinks.
it("prunes sections whose file was removed", async () => {
    const repoRoot = makeTestRepo({
        "docs/guide.md": "## Introduction\n\nThis is the introduction section. It explains the basic concepts and overview of the system. This is important to read first.\n",
        "docs/extra.md": "## Extra Content\n\nThis extra file will be removed in the next run. It contains important information about the additional features. Please read it carefully.\n",
    });
    const { store, dbPath } = openTestDB();
    const emb = new FakeEmbedder();
    const reindex = () => run(store, emb, repoRoot, defaultConfig(), noProgress);
    await reindex();
    const sectionsWithBothFiles = countSectionsInDB(dbPath);
    expect(sectionsWithBothFiles).toBeGreaterThanOrEqual(2);
    unlinkSync(join(repoRoot, "docs/extra.md"));
    await reindex();
    expect(countSectionsInDB(dbPath)).toBeLessThan(sectionsWithBothFiles);
});
262
// WHY: a run that fails while embedding must not change what is persisted.
// Sequence: index successfully, edit the file so the next run has to embed
// again, arm the fake to throw once, then check that the stored section hashes
// are identical to the ones captured before the failed run.
it("rolls back the transaction when the embedder fails", async () => {
    const guidePath = "docs/guide.md";
    const repoRoot = makeTestRepo({
        [guidePath]: "## Introduction\n\nThis is the original introduction with plenty of words to be eligible for indexing and embedding into the dedup pool.\n",
    });
    const { store, dbPath } = openTestDB();
    const emb = new FakeEmbedder();
    // Successful baseline run.
    await run(store, emb, repoRoot, defaultConfig(), noProgress);
    const hashesBeforeFailure = sectionHashesFromDB(dbPath);
    expect(hashesBeforeFailure.length).toBeGreaterThan(0);
    // Change the content, then make the next embed() call throw.
    writeFileSync(join(repoRoot, guidePath), "## Introduction\n\nThis introduction was edited and must trigger a fresh embedding call which is forced to fail for the rollback test.\n");
    emb.errOn = new Error("fake embedder: forced error for rollback test");
    const failingRun = run(store, emb, repoRoot, defaultConfig(), noProgress);
    await expect(failingRun).rejects.toThrow();
    // Persisted hashes must match the pre-failure snapshot.
    expect(sectionHashesFromDB(dbPath)).toEqual(hashesBeforeFailure);
});
285
// WHY: a markdown link from one corpus file to a section of another must show
// up as a positive inbound_count on some section after indexing (the original
// author notes the analyzer uses this count for canonical selection).
it("populates inbound_count from intra-corpus links", async () => {
    const repoRoot = makeTestRepo({
        "docs/target.md": "## Target Section\n\nThis section is linked to by another document. It provides important information about a key concept. Many other sections reference this one.\n",
        "docs/source.md": "## Source Section\n\nThis section links to the [Target Section](target.md#target-section). It references important information from the target document. Please see the link for details.\n",
    });
    const { store, dbPath } = openTestDB();
    const emb = new FakeEmbedder();
    await run(store, emb, repoRoot, defaultConfig(), noProgress);
    const conn = rawDB(dbPath);
    let highestInbound;
    try {
        const { m } = conn.prepare(`SELECT MAX(inbound_count) AS m FROM sections`).get();
        // MAX() over an empty table is NULL; treat that as zero.
        highestInbound = m ?? 0;
    }
    finally {
        conn.close();
    }
    expect(highestInbound).toBeGreaterThan(0);
});
306
+ });
307
describe("indexer.parseLinks", () => {
    // WHY: the prefix list passed by the caller — not a built-in one — must decide
    // which links count as external. The same input is parsed twice with
    // different lists and must give different results.
    it("filters external links using the supplied prefix list, not a hardcoded one", () => {
        const markdown = "See [the docs](http://example.com/target.md) for more info.";
        const sourceFile = "docs/source.md";
        // No prefixes at all: the http:// link is kept.
        const keptLinks = parseLinks(markdown, sourceFile, []);
        expect(keptLinks.length).toBeGreaterThan(0);
        // With http:// listed as external: the link is dropped.
        const droppedLinks = parseLinks(markdown, sourceFile, ["http://", "https://", "mailto:"]);
        expect(droppedLinks.length).toBe(0);
    });
});
321
+ describe("indexer block gates and reuse", () => {
322
// WHY: two behaviours are pinned here. (a) blockEligible rejects a too-short
// prose block and a one-row table. (b) embedBlocks embeds each distinct
// ContentHash at most once and reuses a vector already present in the
// `existing` map instead of calling the embedder for it.
it("gates ineligible blocks and dedups embeds by ContentHash", async () => {
    // Per the original author's note the defaults are Block.min_words == 10 and
    // table_min_rows == 2 (defaultConfig is not visible from this file).
    const cfg = defaultConfig();
    const blankBlock = {
        SectionID: "",
        FilePath: "",
        Heading: "",
        Index: 0,
        Kind: "prose",
        StartLine: 0,
        EndLine: 0,
        ContentHash: "",
        Text: "",
        TableRows: 0,
    };
    const makeBlock = (overrides) => ({ ...blankBlock, ...overrides });
    const subFloor = makeBlock({ Kind: "prose", ContentHash: "sub-floor-hash", Text: "one two three" });
    const oneRowTable = makeBlock({
        Kind: "table",
        ContentHash: "one-row-table-hash",
        Text: "col=val",
        TableRows: 1,
    });
    const eligibleProse = makeBlock({
        Kind: "prose",
        ContentHash: "eligible-hash-A",
        Text: "the quick brown fox jumps over the lazy dog again",
    });
    const sharedHash = "shared-hash-XY";
    const sharedText = "this is a long enough prose block that qualifies for embedding";
    const identicalA = makeBlock({ Kind: "prose", ContentHash: sharedHash, Text: sharedText });
    const identicalB = makeBlock({ Kind: "prose", ContentHash: sharedHash, Text: sharedText });

    // --- eligibility gate ---
    expect(blockEligible(subFloor, cfg)).toBe(false);
    expect(blockEligible(oneRowTable, cfg)).toBe(false);
    expect(blockEligible(eligibleProse, cfg)).toBe(true);
    expect(blockEligible(identicalA, cfg)).toBe(true);
    expect(blockEligible(identicalB, cfg)).toBe(true);
    const candidates = [subFloor, oneRowTable, eligibleProse, identicalA, identicalB];
    const eligible = candidates.filter((block) => blockEligible(block, cfg));
    expect(eligible.length).toBe(3);

    // --- embedBlocks: reuse by hash, embed each new hash once ---
    const existingVec = Float32Array.from([0.1, 0.2, 0.3]);
    const existing = new Map([[eligibleProse.ContentHash, existingVec]]);
    const emb = new FakeEmbedder();
    const result = await embedBlocks(emb, eligible, existing, cfg);
    // The hash that was already in `existing` is still in the result.
    expect(result.has(eligibleProse.ContentHash)).toBe(true);
    // Exactly one text reached the embedder: the text shared by identicalA/B.
    expect(emb.totalTexts()).toBe(1);
    expect(result.has(sharedHash)).toBe(true);
    // The one-row table never reached embedBlocks (it was filtered out above),
    // so its hash cannot be in the result.
    expect(result.has(oneRowTable.ContentHash)).toBe(false);
    // The reused vector comes back with the same values it went in with.
    expect(Array.from(result.get(eligibleProse.ContentHash))).toEqual(Array.from(existingVec));
});
382
// WHY: blockEmbeddable must say "no" for a prose block whose heading is on the
// heading blacklist and for any table block, and "yes" for ordinary prose.
it("blockEmbeddable excludes blacklisted-heading prose and all tables", () => {
    const cfg = defaultConfig();
    // Extend the blacklist with "steps"; the block below is headed "Steps".
    cfg.Analyzer.heading_blacklist = [...cfg.Analyzer.heading_blacklist, "steps"];
    const proseUnderSteps = {
        SectionID: "",
        FilePath: "",
        Heading: "Steps",
        Index: 0,
        Kind: "prose",
        StartLine: 0,
        EndLine: 0,
        ContentHash: "h1",
        Text: "",
        TableRows: 0,
    };
    const proseUnderOverview = { ...proseUnderSteps, Heading: "Overview", ContentHash: "h2" };
    const tableUnderOverview = { ...proseUnderSteps, Heading: "Overview", Kind: "table", ContentHash: "h3" };
    const verdicts = [
        [proseUnderSteps, false],
        [proseUnderOverview, true],
        [tableUnderOverview, false],
    ];
    for (const [block, expected] of verdicts) {
        expect(blockEmbeddable(block, cfg)).toBe(expected);
    }
});
406
+ });
407
+ describe("indexer block persistence", () => {
408
// WHY: walks the blocks table through a full lifecycle on one store:
// (a) first index writes blocks for both files, (b) deleting a file removes its
// blocks, (c) editing one paragraph changes that block's hash while another
// block keeps its hash, (d) a further run with no edits calls the embedder zero
// times.
it("writes, prunes, and incrementally re-embeds blocks", async () => {
    const proseA1 = "This is the first paragraph of file alpha with plenty of words to qualify.";
    const proseA2 = "Here is another paragraph in file alpha containing more than ten words easily.";
    const proseB1 = "This is the primary content of file beta with sufficient words to be eligible.";
    const alphaPath = "docs/alpha.md";
    const betaPath = "docs/beta.md";
    const repoRoot = makeTestRepo({
        [alphaPath]: `## Alpha Section\n\n${proseA1}\n\n${proseA2}\n`,
        [betaPath]: `## Beta Section\n\n${proseB1}\n`,
    });
    const { store, dbPath } = openTestDB();
    const emb = new FakeEmbedder();
    const cfg = defaultConfig();
    const reindex = () => run(store, emb, repoRoot, cfg, noProgress);

    // (a) Initial index.
    await reindex();
    const initialBlockCount = countBlocksInDB(dbPath);
    expect(initialBlockCount).toBeGreaterThan(0);
    expect(blockFilePathsFromDB(dbPath).length).toBe(2);

    // (b) Remove beta.md; only alpha's blocks may remain.
    unlinkSync(join(repoRoot, betaPath));
    await reindex();
    expect(blockFilePathsFromDB(dbPath)).toEqual(["docs/alpha.md"]);
    expect(countBlocksInDB(dbPath)).toBeLessThan(initialBlockCount);

    // (c) Rewrite alpha's first paragraph and compare hashes key by key.
    const hashesBefore = blockHashesFromDB(dbPath, "docs/alpha.md");
    expect(hashesBefore.size).toBeGreaterThanOrEqual(2);
    const proseA1edited = "This paragraph has been edited so its content hash now differs from the original.";
    writeFileSync(join(repoRoot, alphaPath), `## Alpha Section\n\n${proseA1edited}\n\n${proseA2}\n`);
    await reindex();
    const hashesAfter = blockHashesFromDB(dbPath, "docs/alpha.md");
    let unchanged = 0;
    let changed = 0;
    for (const [key, before] of hashesBefore) {
        const after = hashesAfter.get(key);
        if (after === undefined) {
            // Key no longer present after the edit: counted in neither bucket.
            continue;
        }
        if (before === after) {
            unchanged += 1;
        }
        else {
            changed += 1;
        }
    }
    expect(unchanged).toBeGreaterThanOrEqual(1);
    expect(changed).toBeGreaterThanOrEqual(1);

    // (d) Reset the fake's counters, run again without edits: no embed calls.
    emb.calls = 0;
    emb.batches = [];
    await reindex();
    expect(emb.calls).toBe(0);
});
462
// WHY: blacklisting a heading AFTER its block was already embedded must take
// effect on the next run: the block's row stays in the table but its embedding
// becomes NULL, while a block under a non-blacklisted heading keeps its vector.
it("nulls a block's embedding when its heading becomes blacklisted on re-index", async () => {
    const overviewProse = "This overview paragraph has well over ten words so it clears the block word floor.";
    const stepsProse = "These steps describe the procedure in more than ten words to clear the floor.";
    const repoRoot = makeTestRepo({
        "docs/guide.md": `## Overview\n\n${overviewProse}\n\n## Steps\n\n${stepsProse}\n`,
    });
    const { store, dbPath } = openTestDB();
    const emb = new FakeEmbedder();
    const cfg = defaultConfig();

    // First run with the default config: neither block has a NULL embedding.
    await run(store, emb, repoRoot, cfg, noProgress);
    const beforeBlacklist = blockEmbedNullByHeadingFromDB(dbPath);
    expect(beforeBlacklist.get("Steps")).toBe(false);
    expect(beforeBlacklist.get("Overview")).toBe(false);

    // Second run with "steps" blacklisted: the Steps row is still there, now NULL.
    cfg.Analyzer.heading_blacklist = [...cfg.Analyzer.heading_blacklist, "steps"];
    await run(store, emb, repoRoot, cfg, noProgress);
    const afterBlacklist = blockEmbedNullByHeadingFromDB(dbPath);
    expect(afterBlacklist.has("Steps")).toBe(true);
    expect(afterBlacklist.get("Steps")).toBe(true);
    expect(afterBlacklist.get("Overview")).toBe(false);
});
487
+ });
488
describe("indexer.collectSectionsAndBlocks", () => {
    // WHY: blocks must be collected from every file — including a file whose only
    // content is a table — and each block's FilePath must be a repo-relative path
    // written with forward slashes.
    it("collects blocks from every file, including table-only sections, with repo-relative slash paths", async () => {
        const repoRoot = makeTestRepo({
            "docs/prose.md": "## Introduction\n\nThis section has enough prose words to be eligible for indexing. It covers several important topics and provides good context.\n",
            "docs/table_only.md": "## Comparison\n\n| Name | Value |\n|------|-------|\n| A | 1 |\n| B | 2 |\n",
        });
        const collected = await collectSectionsAndBlocks(repoRoot, defaultConfig());
        const blocks = collected.blocks;
        expect(blocks.length).toBeGreaterThan(0);
        blocks.forEach((block) => {
            const path = block.FilePath;
            // Not absolute, no backslashes, and rooted at the fixture's docs/ folder.
            expect(path.startsWith("/")).toBe(false);
            expect(path.includes("\\")).toBe(false);
            expect(path.startsWith("docs/")).toBe(true);
        });
        // The table from the table-only file must be among the collected blocks.
        const hasTableBlock = blocks.some((block) => block.Kind === "table" && block.FilePath === "docs/table_only.md");
        expect(hasTableBlock).toBe(true);
    });
});
@@ -0,0 +1,89 @@
1
+ /**
2
+ * Inbound-link resolution for the indexer's Pass 2.
3
+ *
4
+ * Ported from internal/dedup/indexer/links.go. parseLinks extracts internal
5
+ * markdown link targets from a section's raw_content; the indexer resolves each
6
+ * target to a section ID and increments that section's inbound_count.
7
+ *
8
+ * Go used path.Dir/path.Clean/path.Join (the slash-only "path" package, not
9
+ * filepath) so resolution is OS-independent; the Node "node:path/posix" module is
10
+ * the exact analogue and keeps forward-slash semantics on every platform.
11
+ */
12
+ import * as posix from "node:path/posix";
13
/**
 * linkRe matches inline markdown links of the form [text](url) and exposes the
 * pieces as the named groups `text` and `url`. Image links are excluded: the
 * `[` must be at the very start of the input or be preceded by a character
 * other than `!`. The url may not contain whitespace or `)`.
 *
 * The pattern is locked (plan §Phase2 "Link resolution") and mirrors the Go
 * original. Because `(?:^|[^!])` consumes the character in front of `[`, a
 * link that directly follows another link's closing `)` is skipped, e.g.
 * "[a](x)[b](y)" yields only `x`. The port notes state Go's
 * regexp.FindAllStringSubmatch behaves the same way, so this is intentional.
 * The `g` flag is mandatory: String.prototype.matchAll rejects non-global
 * regexes.
 */
const linkRe = /(?:^|[^!])\[(?<text>[^\]]+)\]\((?<url>[^)\s]+)\)/g;
/**
 * parseLinks collects the internal link targets found in rawContent.
 *
 * @param rawContent       markdown text to scan.
 * @param filePath         path of the file holding rawContent; relative urls
 *                         are resolved against it.
 * @param externalPrefixes url prefixes regarded as external; any link whose
 *                         url starts with one of them is dropped.
 * @returns an array of { filePath, anchor } pairs in match order.
 *
 * Self-links (same file and same anchor as the source) are NOT removed here;
 * excluding them is left to the caller.
 */
export function parseLinks(rawContent, filePath, externalPrefixes) {
  return [...rawContent.matchAll(linkRe)].flatMap((match) => {
    const href = match.groups?.url;
    // Skip matches without a url group and anything classified as external.
    if (href === undefined || isExternal(href, externalPrefixes)) {
      return [];
    }
    const { filePath: resolved, anchor } = resolveLink(href, filePath);
    return [{ filePath: resolved, anchor }];
  });
}
49
/**
 * isExternal reports whether url begins with at least one of the supplied
 * external prefixes (from cfg.Indexer.external_url_prefixes).
 *
 * @param url              link target to classify.
 * @param externalPrefixes iterable of prefix strings.
 * @returns true when some prefix is a leading substring of url, else false.
 */
export function isExternal(url, externalPrefixes) {
  // Spread first so any iterable (not only arrays) is accepted.
  return [...externalPrefixes].some((candidate) => url.startsWith(candidate));
}
61
/**
 * resolveLink turns a markdown link url into a { filePath, anchor } pair,
 * resolving relative paths against the directory of sourceFile.
 *
 * The anchor is everything after the LAST "#" in url (Go:
 * strings.LastIndex(url, "#")); it is "" when url has no "#". A url that is
 * empty once the anchor is removed refers to sourceFile itself.
 *
 * Examples:
 *
 *   resolveLink("../concepts/overview.md#intro", "docs/guide/setup.md")
 *     → { filePath: "docs/concepts/overview.md", anchor: "intro" }
 *
 *   resolveLink("target.md", "docs/source.md")
 *     → { filePath: "docs/target.md", anchor: "" }
 *
 *   resolveLink("#top", "docs/source.md")
 *     → { filePath: "docs/source.md", anchor: "top" }
 */
export function resolveLink(url, sourceFile) {
  const hashAt = url.lastIndexOf("#");
  const hasAnchor = hashAt >= 0;
  const anchor = hasAnchor ? url.slice(hashAt + 1) : "";
  const target = hasAnchor ? url.slice(0, hashAt) : url;
  // Anchor-only link: stays within the source file.
  if (target === "") {
    return { filePath: sourceFile, anchor };
  }
  // Slash-only resolution (like Go's "path" package), independent of host OS.
  const joined = posix.join(posix.dirname(sourceFile), target);
  return { filePath: posix.normalize(joined), anchor };
}
@@ -0,0 +1,60 @@
1
+ /**
2
+ * GitHub-style anchor slug generation and per-file collision tracking.
3
+ *
4
+ * Ported from internal/dedup/mdsection/anchor.go. The slugification rules are
5
+ * LOCKED: these anchors feed sectionid.derive and persist in the dedup index,
6
+ * so they must match the Go implementation character-for-character.
7
+ */
8
/**
 * Matches every run of characters that are NOT Unicode letters, Unicode
 * digits, hyphens, ASCII spaces, or underscores. makeAnchor deletes these runs
 * from the heading text.
 *
 * Go source: `regexp.MustCompile("[^\\p{L}\\p{N}\\- _]+")`.
 * The `u` flag enables the `\p{L}`/`\p{N}` Unicode property escapes.
 */
const anchorStripRE = /[^\p{L}\p{N}\- _]+/gu;
/**
 * Matches a run of Unicode whitespace. Go's makeAnchor uses `unicode.IsSpace`;
 * `\p{White_Space}` is used here as its equivalent. In practice only ASCII
 * space reaches this step, because every other whitespace character has
 * already been removed by anchorStripRE (underscore is not whitespace).
 */
const wsRunRE = /\p{White_Space}+/gu;
/**
 * Lowercases text one code point at a time using simple (1:1) case mapping.
 *
 * String.prototype.toLowerCase applied to a whole string performs FULL,
 * context-sensitive Unicode case conversion, which differs from a per-rune
 * simple mapping in two places:
 *   - a word-final capital sigma becomes "ς" instead of "σ" (Final_Sigma);
 *   - U+0130 (İ) becomes "i" + U+0307 instead of plain "i".
 * Lowercasing each code point in isolation removes the context rule, and
 * U+0130 is special-cased because its full mapping is two code points.
 *
 * NOTE(review): assumes the Go original lowercases with strings.ToLower
 * (per-rune unicode.ToLower) — confirm against anchor.go.
 */
function lowerPerCodePoint(text) {
  let lowered = "";
  for (const ch of text) {
    lowered += ch === "\u0130" ? "i" : ch.toLowerCase();
  }
  return lowered;
}
/**
 * Returns the GitHub-style anchor slug for a heading text, applying the locked
 * transformation:
 *   1. Strip characters matching anchorStripRE.
 *   2. Lowercase (per code point, simple mapping — see lowerPerCodePoint).
 *   3. Replace whitespace runs with a single "-".
 *
 * @param heading heading text (string).
 * @returns the slug; "" for an empty heading or one made only of stripped
 *          characters.
 */
export function makeAnchor(heading) {
  // Remove non-(letter, digit, hyphen, space, underscore) characters.
  const stripped = heading.replace(anchorStripRE, "");
  // Lowercase without context-sensitive or multi-code-point expansions.
  const lowered = lowerPerCodePoint(stripped);
  // Collapse each whitespace run into one "-"; underscores are preserved.
  return lowered.replace(wsRunRE, "-");
}
41
/**
 * Hands out anchors for the headings of a single file, disambiguating repeated
 * headings with a numeric suffix. Create a fresh instance per Extract call
 * (i.e. per file).
 *
 * NOTE(review): counts are keyed by the unsuffixed slug only, so a suffixed
 * result can coincide with a heading whose own slug is that text (headings
 * "Foo", "Foo", "Foo-1" produce "foo", "foo-1", "foo-1"). Left as-is because
 * the anchors are documented as locked to the Go implementation — confirm
 * whether the Go original shares this behaviour.
 */
export class AnchorTracker {
  // slug → number of times that slug has been assigned so far.
  seen = new Map();
  /**
   * Returns the anchor for heading. The first occurrence of a slug is returned
   * unchanged; later occurrences get "-1", "-2", … appended in order.
   */
  assign(heading) {
    const slug = makeAnchor(heading);
    const priorUses = this.seen.get(slug) ?? 0;
    this.seen.set(slug, priorUses + 1);
    return priorUses === 0 ? slug : `${slug}-${priorUses}`;
  }
}