docsgov 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (159)
  1. package/README.md +242 -0
  2. package/dist/apispec/apispec.js +401 -0
  3. package/dist/apispec/apispec.test.js +444 -0
  4. package/dist/apispec/errors.js +17 -0
  5. package/dist/apispec/index.js +2 -0
  6. package/dist/check/doclinks.js +167 -0
  7. package/dist/check/index.js +8 -0
  8. package/dist/check/run.js +391 -0
  9. package/dist/check/run.test.js +513 -0
  10. package/dist/check/suggest.js +134 -0
  11. package/dist/check/suggest.test.js +92 -0
  12. package/dist/check/tokens.js +125 -0
  13. package/dist/cmd/main.js +330 -0
  14. package/dist/cmd/main.test.js +422 -0
  15. package/dist/codeq/cache.js +71 -0
  16. package/dist/codeq/cache.test.js +67 -0
  17. package/dist/codeq/errors.js +52 -0
  18. package/dist/codeq/grammars/tree-sitter-go.wasm +0 -0
  19. package/dist/codeq/grammars/tree-sitter-java.wasm +0 -0
  20. package/dist/codeq/grammars/tree-sitter-javascript.wasm +0 -0
  21. package/dist/codeq/grammars/tree-sitter-tsx.wasm +0 -0
  22. package/dist/codeq/grammars/tree-sitter-typescript.wasm +0 -0
  23. package/dist/codeq/index.js +11 -0
  24. package/dist/codeq/resolve.test.js +109 -0
  25. package/dist/codeq/resolver.js +128 -0
  26. package/dist/codeq/resolver.test.js +124 -0
  27. package/dist/codeq/resolvers/go.js +242 -0
  28. package/dist/codeq/resolvers/go.test.js +143 -0
  29. package/dist/codeq/resolvers/java.js +349 -0
  30. package/dist/codeq/resolvers/java.test.js +138 -0
  31. package/dist/codeq/resolvers/java_queries.js +63 -0
  32. package/dist/codeq/resolvers/javascript.js +412 -0
  33. package/dist/codeq/resolvers/javascript.test.js +125 -0
  34. package/dist/codeq/resolvers/javascript_queries.js +46 -0
  35. package/dist/codeq/resolvers/typescript.js +366 -0
  36. package/dist/codeq/resolvers/typescript.test.js +180 -0
  37. package/dist/codeq/resolvers/typescript_queries.js +78 -0
  38. package/dist/codeq/signature.js +50 -0
  39. package/dist/codeq/signature.test.js +50 -0
  40. package/dist/codeq/suggest.js +96 -0
  41. package/dist/codeq/treesitter.js +122 -0
  42. package/dist/codeq/treesitter.test.js +118 -0
  43. package/dist/config/config.js +74 -0
  44. package/dist/config/config.test.js +98 -0
  45. package/dist/config/fs.js +116 -0
  46. package/dist/config/glob.js +82 -0
  47. package/dist/config/glob.test.js +61 -0
  48. package/dist/config/index.js +4 -0
  49. package/dist/dedup/analyzer/analyzer.js +533 -0
  50. package/dist/dedup/analyzer/analyzer.test.js +530 -0
  51. package/dist/dedup/analyzer/canonical.js +74 -0
  52. package/dist/dedup/analyzer/canonical.test.js +70 -0
  53. package/dist/dedup/analyzer/cosine_clusters.js +169 -0
  54. package/dist/dedup/analyzer/cosine_clusters.test.js +131 -0
  55. package/dist/dedup/analyzer/distinctive.js +85 -0
  56. package/dist/dedup/analyzer/distinctive.test.js +49 -0
  57. package/dist/dedup/analyzer/exact_clusters.js +63 -0
  58. package/dist/dedup/analyzer/exact_clusters.test.js +81 -0
  59. package/dist/dedup/analyzer/index.js +14 -0
  60. package/dist/dedup/analyzer/multiplicity.js +110 -0
  61. package/dist/dedup/analyzer/multiplicity.test.js +123 -0
  62. package/dist/dedup/analyzer/order.js +22 -0
  63. package/dist/dedup/analyzer/partial_overlaps.js +65 -0
  64. package/dist/dedup/analyzer/partial_overlaps.test.js +161 -0
  65. package/dist/dedup/analyzer/preview.js +84 -0
  66. package/dist/dedup/analyzer/preview.test.js +46 -0
  67. package/dist/dedup/analyzer/safety.js +27 -0
  68. package/dist/dedup/analyzer/safety.test.js +39 -0
  69. package/dist/dedup/config.js +18 -0
  70. package/dist/dedup/configload.js +299 -0
  71. package/dist/dedup/configload.test.js +410 -0
  72. package/dist/dedup/dedup.index.test.js +203 -0
  73. package/dist/dedup/dedup.js +143 -0
  74. package/dist/dedup/dedup.test.js +212 -0
  75. package/dist/dedup/dedupcfg/config.js +112 -0
  76. package/dist/dedup/dedupcfg/config.test.js +70 -0
  77. package/dist/dedup/dedupcfg/index.js +1 -0
  78. package/dist/dedup/deduptypes/index.js +1 -0
  79. package/dist/dedup/deduptypes/types.js +9 -0
  80. package/dist/dedup/deduptypes/types.test.js +34 -0
  81. package/dist/dedup/embedder/cache.js +23 -0
  82. package/dist/dedup/embedder/cache.test.js +50 -0
  83. package/dist/dedup/embedder/constants.js +10 -0
  84. package/dist/dedup/embedder/embedder.js +76 -0
  85. package/dist/dedup/embedder/embedder.mock.test.js +128 -0
  86. package/dist/dedup/embedder/embedder.test.js +96 -0
  87. package/dist/dedup/embedder/errors.js +20 -0
  88. package/dist/dedup/embedder/errors.test.js +35 -0
  89. package/dist/dedup/embedder/index.js +4 -0
  90. package/dist/dedup/embedder/session.js +78 -0
  91. package/dist/dedup/embedder/session.test.js +172 -0
  92. package/dist/dedup/gitignore.js +97 -0
  93. package/dist/dedup/gitignore.test.js +98 -0
  94. package/dist/dedup/index.js +11 -0
  95. package/dist/dedup/indexdb/errors.js +48 -0
  96. package/dist/dedup/indexdb/index.js +6 -0
  97. package/dist/dedup/indexdb/indexdb.js +302 -0
  98. package/dist/dedup/indexdb/indexdb.test.js +739 -0
  99. package/dist/dedup/indexdb/load.js +110 -0
  100. package/dist/dedup/indexdb/migrations.js +58 -0
  101. package/dist/dedup/indexdb/schema.js +83 -0
  102. package/dist/dedup/indexer/index.js +9 -0
  103. package/dist/dedup/indexer/indexer.js +501 -0
  104. package/dist/dedup/indexer/indexer.test.js +510 -0
  105. package/dist/dedup/indexer/links.js +89 -0
  106. package/dist/dedup/mdsection/anchor.js +60 -0
  107. package/dist/dedup/mdsection/anchor.test.js +39 -0
  108. package/dist/dedup/mdsection/blocks.js +409 -0
  109. package/dist/dedup/mdsection/blocks.test.js +359 -0
  110. package/dist/dedup/mdsection/index.js +4 -0
  111. package/dist/dedup/mdsection/parse.js +21 -0
  112. package/dist/dedup/mdsection/section.js +234 -0
  113. package/dist/dedup/mdsection/section.test.js +221 -0
  114. package/dist/dedup/report/floatfmt.js +71 -0
  115. package/dist/dedup/report/floatfmt.test.js +42 -0
  116. package/dist/dedup/report/index.js +8 -0
  117. package/dist/dedup/report/quote.js +77 -0
  118. package/dist/dedup/report/quote.test.js +67 -0
  119. package/dist/dedup/report/text.js +251 -0
  120. package/dist/dedup/report/text.test.js +420 -0
  121. package/dist/dedup/report_types.js +8 -0
  122. package/dist/dedup/sectionid/index.js +1 -0
  123. package/dist/dedup/sectionid/sectionid.js +16 -0
  124. package/dist/dedup/sectionid/sectionid.test.js +49 -0
  125. package/dist/guard/api/errors.js +12 -0
  126. package/dist/guard/api/index.js +2 -0
  127. package/dist/guard/api/parser.js +81 -0
  128. package/dist/guard/api/parser.test.js +58 -0
  129. package/dist/guard/api/types.js +1 -0
  130. package/dist/guard/code/errors.js +16 -0
  131. package/dist/guard/code/index.js +2 -0
  132. package/dist/guard/code/parser.js +54 -0
  133. package/dist/guard/code/parser.test.js +111 -0
  134. package/dist/guard/code/types.js +6 -0
  135. package/dist/index.js +1 -0
  136. package/dist/index.test.js +5 -0
  137. package/dist/repo/boundary.js +92 -0
  138. package/dist/repo/boundary.test.js +65 -0
  139. package/dist/repo/errors.js +56 -0
  140. package/dist/repo/errors.test.js +85 -0
  141. package/dist/repo/exists.test.js +72 -0
  142. package/dist/repo/filename.js +46 -0
  143. package/dist/repo/filename.test.js +39 -0
  144. package/dist/repo/fs.js +53 -0
  145. package/dist/repo/index.js +7 -0
  146. package/dist/repo/overlay.js +36 -0
  147. package/dist/repo/overlay.test.js +80 -0
  148. package/dist/repo/repo.js +353 -0
  149. package/dist/repo/repo.test.js +255 -0
  150. package/dist/repo/testutil.js +27 -0
  151. package/dist/repo/write.test.js +125 -0
  152. package/dist/report/color.js +73 -0
  153. package/dist/report/index.js +1 -0
  154. package/dist/report/report.js +112 -0
  155. package/dist/report/report.test.js +368 -0
  156. package/dist/violation/index.js +1 -0
  157. package/dist/violation/types.js +22 -0
  158. package/dist/violation/types.test.js +70 -0
  159. package/package.json +48 -0
@@ -0,0 +1,410 @@
1
+ /**
2
+ * Behavior-encoding tests for the dedup config facade, ported from
3
+ * internal/dedup/config_test.go (locked defaults) and
4
+ * internal/dedup/configload_test.go (overlay + validation).
5
+ *
6
+ * WHY: the locked defaults are a parity contract against the Go binary and the
7
+ * Python POC — any drift silently changes every analysis. The overlay rules
8
+ * (scalars override, lists replace, *_extra appends) and the validation ranges
9
+ * are the user-facing config behavior; a regression here either ignores a user's
10
+ * config.yml or admits an out-of-range value that misbehaves deep in the
11
+ * analyzer. Type-mismatch must throw a matchable error (Go's *yaml.TypeError →
12
+ * YAMLTypeError) so the CLI can distinguish "your YAML is wrong" from a bug.
13
+ */
14
+ import { mkdtempSync, mkdirSync, rmSync, writeFileSync } from "node:fs";
15
+ import { tmpdir } from "node:os";
16
+ import { join } from "node:path";
17
+ import { afterEach, describe, expect, it } from "vitest";
18
+ import { Default } from "./config.js";
19
+ import { Load, YAMLTypeError } from "./configload.js";
20
+ const tmpDirs = [];
21
+ afterEach(() => {
22
+ for (const d of tmpDirs.splice(0)) {
23
+ rmSync(d, { recursive: true, force: true });
24
+ }
25
+ });
26
+ /** newRepo returns a fresh temp repo root with no .docgov/dedup. */
27
+ function newRepo() {
28
+ const dir = mkdtempSync(join(tmpdir(), "dedup-cfg-"));
29
+ tmpDirs.push(dir);
30
+ return dir;
31
+ }
32
+ /** writeConfigYML writes config.yml under repoRoot/.docgov/dedup. */
33
+ function writeConfigYML(repoRoot, yaml) {
34
+ const dedupDir = join(repoRoot, ".docgov", "dedup");
35
+ mkdirSync(dedupDir, { recursive: true });
36
+ writeFileSync(join(dedupDir, "config.yml"), yaml);
37
+ }
38
+ describe("Default", () => {
39
+ // WHY: these scalars are the locked v1 tuning; drift changes every run.
40
+ it("has the locked scalar values", () => {
41
+ const cfg = Default();
42
+ expect(cfg.Markdown.min_prose_words).toBe(10);
43
+ expect(cfg.Markdown.heading_token_min_len).toBe(3);
44
+ expect(cfg.Markdown.hidden_dir_prefix).toBe(".");
45
+ expect(cfg.Indexer.embed_progress_threshold).toBe(100);
46
+ expect(cfg.Analyzer.thresh_high).toBe(0.93);
47
+ expect(cfg.Analyzer.thresh_maybe).toBe(0.86);
48
+ expect(cfg.Analyzer.distinctive_abs_min).toBe(3);
49
+ expect(cfg.Analyzer.distinctive_pct_of_headings).toBe(0.03);
50
+ expect(cfg.Report.preview_chars).toBe(280);
51
+ expect(cfg.Report.preview_word_ratio).toBe(0.6);
52
+ expect(cfg.Report.wrap_cols).toBe(72);
53
+ expect(cfg.Report.separator).toBe("---");
54
+ expect(cfg.Embedder.batch_size).toBe(32);
55
+ });
56
+ // WHY: the ignored-dirs / stopword / differentiator / path lists are locked
57
+ // and load-bearing — the indexer skips dirs and the analyzer's safety net
58
+ // depends on these exact entries and order.
59
+ it("has the locked list values", () => {
60
+ const cfg = Default();
61
+ expect(cfg.Markdown.ignored_dirs).toEqual([
62
+ ".git", "node_modules", "vendor", "dist", "build",
63
+ ".next", ".cache", ".docgov", "dedup-poc", ".venv",
64
+ ]);
65
+ expect(cfg.Analyzer.universal_stopwords).toEqual([
66
+ "the", "a", "an", "of", "and", "or", "to", "with",
67
+ "for", "in", "on", "is", "are", "be", "by", "from", "as", "at",
68
+ ]);
69
+ expect(cfg.Analyzer.differentiators).toEqual([
70
+ ["calendar days", "business days"],
71
+ ["sync", "async"],
72
+ ["create", "cancel"],
73
+ ["source", "target"],
74
+ ["inbound", "outbound"],
75
+ ["success", "failure"],
76
+ ["old flow", "new flow"],
77
+ ["deprecated", "current"],
78
+ ]);
79
+ expect(cfg.Analyzer.path_priority).toEqual([
80
+ "docs/concepts/", "docs/architecture/", "docs/design/",
81
+ ]);
82
+ expect(cfg.Analyzer.heading_blacklist).toEqual([
83
+ "related", "template rendering", "template sample",
84
+ ]);
85
+ expect(cfg.Analyzer.path_blacklist).toEqual([
86
+ "changelog", "migration", "deprecated", "old", "legacy", "temporary",
87
+ ]);
88
+ expect(cfg.Indexer.external_url_prefixes).toEqual([
89
+ "http://", "https://", "mailto:",
90
+ ]);
91
+ });
92
+ });
93
+ describe("Load", () => {
94
+ // WHY: no config.yml must yield the locked defaults, not an error or a partial config.
95
+ it("returns Default() when no YAML exists", async () => {
96
+ const cfg = await Load(newRepo());
97
+ const want = Default();
98
+ expect(cfg.Analyzer.thresh_high).toBe(want.Analyzer.thresh_high);
99
+ expect(cfg.Markdown.min_prose_words).toBe(want.Markdown.min_prose_words);
100
+ });
101
+ // WHY: a single overridden scalar must change ONLY that field; everything else
102
+ // must stay at default (overlay, not replace-whole-config).
103
+ it("overlays a single scalar and leaves others at default", async () => {
104
+ const repo = newRepo();
105
+ writeConfigYML(repo, "analyzer:\n thresh_high: 0.95\n");
106
+ const cfg = await Load(repo);
107
+ expect(cfg.Analyzer.thresh_high).toBe(0.95);
108
+ const want = Default();
109
+ expect(cfg.Analyzer.thresh_maybe).toBe(want.Analyzer.thresh_maybe);
110
+ expect(cfg.Markdown.min_prose_words).toBe(want.Markdown.min_prose_words);
111
+ });
112
+ // WHY: a list set in config.yml REPLACES the default list entirely (it does
113
+ // not merge) — the user is opting out of the defaults for that field.
114
+ it("replaces a list field entirely when set", async () => {
115
+ const repo = newRepo();
116
+ writeConfigYML(repo, 'analyzer:\n heading_blacklist: ["custom"]\n');
117
+ const cfg = await Load(repo);
118
+ expect(cfg.Analyzer.heading_blacklist).toEqual(["custom"]);
119
+ });
120
+ // WHY: *_extra fields APPEND to the (possibly replaced) base list — the user
121
+ // adds to the defaults without restating them.
122
+ it("appends *_extra entries to the base list", async () => {
123
+ const repo = newRepo();
124
+ writeConfigYML(repo, 'markdown:\n ignored_dirs_extra: ["mydir"]\n');
125
+ const cfg = await Load(repo);
126
+ const defaults = Default();
127
+ expect(cfg.Markdown.ignored_dirs).toHaveLength(defaults.Markdown.ignored_dirs.length + 1);
128
+ expect(cfg.Markdown.ignored_dirs.at(-1)).toBe("mydir");
129
+ });
130
+ // WHY: malformed YAML must be a hard error — never a silent fallback to
131
+ // defaults that would mask the user's broken config.
132
+ it("throws on malformed YAML (tab indentation)", async () => {
133
+ const repo = newRepo();
134
+ writeConfigYML(repo, "analyzer:\n\tthresh_high: 0.95\n");
135
+ await expect(Load(repo)).rejects.toThrow();
136
+ });
137
+ // WHY: a type mismatch must throw a MATCHABLE error (Go's *yaml.TypeError →
138
+ // instanceof YAMLTypeError) so the CLI can report "your config.yml has the
139
+ // wrong type" distinctly from an internal bug. A regression to a plain throw
140
+ // (or silent coercion) breaks that distinction.
141
+ it("throws YAMLTypeError on a type mismatch", async () => {
142
+ const repo = newRepo();
143
+ writeConfigYML(repo, 'analyzer:\n thresh_high: "not a number"\n');
144
+ await expect(Load(repo)).rejects.toBeInstanceOf(YAMLTypeError);
145
+ });
146
+ // WHY: validation must reject thresh_high >= 1 (the cosine cap can never be 1).
147
+ it("rejects thresh_high >= 1", async () => {
148
+ const repo = newRepo();
149
+ writeConfigYML(repo, "analyzer:\n thresh_high: 1.0\n");
150
+ await expect(Load(repo)).rejects.toThrow();
151
+ });
152
+ // WHY: thresh_maybe must stay strictly below thresh_high or the MAYBE band
153
+ // collapses.
154
+ it("rejects thresh_maybe >= thresh_high", async () => {
155
+ const repo = newRepo();
156
+ writeConfigYML(repo, "analyzer:\n thresh_maybe: 0.97\n thresh_high: 0.95\n");
157
+ await expect(Load(repo)).rejects.toThrow();
158
+ });
159
+ // WHY: validation runs AFTER the overlay is applied (not on defaults) so a
160
+ // valid-syntax overlay that produces an invalid value is still caught.
161
+ it("validates after overlay: rejects min_prose_words = 0", async () => {
162
+ const repo = newRepo();
163
+ writeConfigYML(repo, "markdown:\n min_prose_words: 0\n");
164
+ await expect(Load(repo)).rejects.toThrow();
165
+ });
166
+ // WHY: same — an overlaid batch_size of 0 must fail at load, not later.
167
+ it("validates after overlay: rejects batch_size = 0", async () => {
168
+ const repo = newRepo();
169
+ writeConfigYML(repo, "embedder:\n batch_size: 0\n");
170
+ await expect(Load(repo)).rejects.toThrow();
171
+ });
172
+ // WHY: each block.* knob became user-overridable; an out-of-range value (a
173
+ // cosine_threshold that can never match, or a zero min_words/cap) must fail
174
+ // loudly at load rather than silently misbehaving inside indexer/analyzer.
175
+ it.each([
176
+ ["min_words = 0", "block:\n min_words: 0\n"],
177
+ ["table_min_rows = 0", "block:\n table_min_rows: 0\n"],
178
+ ["cosine_threshold >= 1", "block:\n cosine_threshold: 1.0\n"],
179
+ ["cosine_threshold <= 0", "block:\n cosine_threshold: 0\n"],
180
+ ["multiplicity_cap = 0", "block:\n multiplicity_cap: 0\n"],
181
+ ])("rejects bad block config: %s", async (_name, yaml) => {
182
+ const repo = newRepo();
183
+ writeConfigYML(repo, yaml);
184
+ await expect(Load(repo)).rejects.toThrow();
185
+ });
186
+ // WHY: an empty / null document is a valid file that means "no overrides"; it
187
+ // must yield defaults, not error — a user may leave a placeholder config.yml.
188
+ it("treats an empty/null document as a no-op overlay (defaults)", async () => {
189
+ const repo = newRepo();
190
+ writeConfigYML(repo, "");
191
+ const cfg = await Load(repo);
192
+ expect(cfg.Analyzer.thresh_high).toBe(Default().Analyzer.thresh_high);
193
+ });
194
+ // WHY: the top-level document must be a mapping; a scalar or sequence at the
195
+ // root is a structural mistake the CLI must report as a YAML type error, not
196
+ // silently ignore.
197
+ it("throws YAMLTypeError when the root document is not a mapping", async () => {
198
+ const repo = newRepo();
199
+ writeConfigYML(repo, "- a\n- b\n"); // a sequence, not a mapping
200
+ await expect(Load(repo)).rejects.toBeInstanceOf(YAMLTypeError);
201
+ });
202
+ // WHY: a section key (e.g. markdown:) whose value is a scalar instead of a
203
+ // sub-mapping is malformed; it must throw a matchable type error rather than
204
+ // be coerced or skipped.
205
+ it("throws YAMLTypeError when a section is not a mapping", async () => {
206
+ const repo = newRepo();
207
+ writeConfigYML(repo, "markdown: 5\n");
208
+ await expect(Load(repo)).rejects.toBeInstanceOf(YAMLTypeError);
209
+ });
210
+ // WHY: a present-but-empty section (markdown: with no children) is a no-op
211
+ // overlay; it must leave every field at default rather than wipe the section.
212
+ it("treats a present-but-empty section as no overlay", async () => {
213
+ const repo = newRepo();
214
+ writeConfigYML(repo, "markdown:\n");
215
+ const cfg = await Load(repo);
216
+ expect(cfg.Markdown.min_prose_words).toBe(Default().Markdown.min_prose_words);
217
+ // ignored_dirs must be unchanged (no spurious *_extra append of nothing).
218
+ expect(cfg.Markdown.ignored_dirs).toEqual(Default().Markdown.ignored_dirs);
219
+ });
220
+ // WHY: unknown keys are tolerated (yaml.v3 ignores unknown fields unless KnownFields is enabled); a typo'd or
221
+ // forward-compatible key must not fail the load, so older binaries read newer
222
+ // configs.
223
+ it("ignores unknown keys", async () => {
224
+ const repo = newRepo();
225
+ writeConfigYML(repo, "markdown:\n bogus_key: 1\nfuture_section:\n x: 1\n");
226
+ const cfg = await Load(repo);
227
+ expect(cfg.Markdown.min_prose_words).toBe(Default().Markdown.min_prose_words);
228
+ });
229
+ // WHY: every section's scalar overlays must reach the right field; if the
230
+ // overlay mis-wired a section, a user's tuning would silently apply elsewhere
231
+ // or nowhere. This exercises the indexer/report/embedder/block scalar paths.
232
+ it("overlays scalars across indexer, report, embedder, and block sections", async () => {
233
+ const repo = newRepo();
234
+ writeConfigYML(repo, [
235
+ "indexer:",
236
+ " embed_progress_threshold: 5",
237
+ " max_workers: 2",
238
+ "report:",
239
+ " preview_chars: 100",
240
+ " preview_word_ratio: 0.5",
241
+ " wrap_cols: 80",
242
+ " separator: '***'",
243
+ "embedder:",
244
+ " batch_size: 16",
245
+ "block:",
246
+ " min_words: 4",
247
+ " table_min_rows: 2",
248
+ " cosine_threshold: 0.8",
249
+ " multiplicity_cap: 5",
250
+ "",
251
+ ].join("\n"));
252
+ const cfg = await Load(repo);
253
+ expect(cfg.Indexer.embed_progress_threshold).toBe(5);
254
+ expect(cfg.Indexer.max_workers).toBe(2);
255
+ expect(cfg.Report.preview_chars).toBe(100);
256
+ expect(cfg.Report.preview_word_ratio).toBe(0.5);
257
+ expect(cfg.Report.wrap_cols).toBe(80);
258
+ expect(cfg.Report.separator).toBe("***");
259
+ expect(cfg.Embedder.batch_size).toBe(16);
260
+ expect(cfg.Block.min_words).toBe(4);
261
+ expect(cfg.Block.table_min_rows).toBe(2);
262
+ expect(cfg.Block.cosine_threshold).toBe(0.8);
263
+ expect(cfg.Block.multiplicity_cap).toBe(5);
264
+ });
265
+ // WHY: the analyzer's remaining scalar/list knobs (distinctive bounds, the
266
+ // stopword / path lists) must overlay correctly; these gate the analyzer's
267
+ // distinctiveness and priority logic.
268
+ it("overlays the remaining analyzer scalars and list fields", async () => {
269
+ const repo = newRepo();
270
+ writeConfigYML(repo, [
271
+ "analyzer:",
272
+ " distinctive_abs_min: 5",
273
+ " distinctive_pct_of_headings: 0.1",
274
+ " universal_stopwords: ['x', 'y']",
275
+ " path_priority: ['docs/x/']",
276
+ " path_blacklist: ['legacy']",
277
+ "",
278
+ ].join("\n"));
279
+ const cfg = await Load(repo);
280
+ expect(cfg.Analyzer.distinctive_abs_min).toBe(5);
281
+ expect(cfg.Analyzer.distinctive_pct_of_headings).toBe(0.1);
282
+ expect(cfg.Analyzer.universal_stopwords).toEqual(["x", "y"]);
283
+ expect(cfg.Analyzer.path_priority).toEqual(["docs/x/"]);
284
+ expect(cfg.Analyzer.path_blacklist).toEqual(["legacy"]);
285
+ });
286
+ // WHY: markdown's own scalar/list knobs (hidden_dir_prefix,
287
+ // heading_token_min_len, ignored_dirs replace) must overlay; the indexer's
288
+ // dir-skip and heading tokenizer depend on these.
289
+ it("overlays markdown scalar and list fields", async () => {
290
+ const repo = newRepo();
291
+ writeConfigYML(repo, [
292
+ "markdown:",
293
+ " heading_token_min_len: 4",
294
+ " hidden_dir_prefix: '_'",
295
+ " ignored_dirs: ['only']",
296
+ "",
297
+ ].join("\n"));
298
+ const cfg = await Load(repo);
299
+ expect(cfg.Markdown.heading_token_min_len).toBe(4);
300
+ expect(cfg.Markdown.hidden_dir_prefix).toBe("_");
301
+ expect(cfg.Markdown.ignored_dirs).toEqual(["only"]);
302
+ });
303
+ // WHY: indexer.external_url_prefixes is a list that replaces; the link
304
+ // resolver uses it to skip external URLs, so a user override must take effect.
305
+ it("replaces indexer.external_url_prefixes", async () => {
306
+ const repo = newRepo();
307
+ writeConfigYML(repo, "indexer:\n external_url_prefixes: ['ftp://']\n");
308
+ const cfg = await Load(repo);
309
+ expect(cfg.Indexer.external_url_prefixes).toEqual(["ftp://"]);
310
+ });
311
+ // WHY: differentiators are pairs of strings the analyzer uses to keep opposite
312
+ // concepts apart; a well-formed pair list must parse into [a,b] tuples.
313
+ it("parses analyzer.differentiators as string pairs", async () => {
314
+ const repo = newRepo();
315
+ writeConfigYML(repo, "analyzer:\n differentiators:\n - ['a', 'b']\n - ['c', 'd']\n");
316
+ const cfg = await Load(repo);
317
+ expect(cfg.Analyzer.differentiators).toEqual([
318
+ ["a", "b"],
319
+ ["c", "d"],
320
+ ]);
321
+ });
322
+ // WHY: a differentiator entry that is not a 2-element sequence is malformed;
323
+ // it must throw a matchable type error so the user fixes the pair, not a
324
+ // confusing failure later in the analyzer.
325
+ it("throws YAMLTypeError when a differentiator is not a 2-element sequence", async () => {
326
+ const repo = newRepo();
327
+ writeConfigYML(repo, "analyzer:\n differentiators:\n - ['only-one']\n");
328
+ await expect(Load(repo)).rejects.toBeInstanceOf(YAMLTypeError);
329
+ });
330
+ // WHY: a differentiator pair whose elements are not both strings is malformed;
331
+ // must be a matchable type error.
332
+ it("throws YAMLTypeError when a differentiator element is not a string", async () => {
333
+ const repo = newRepo();
334
+ writeConfigYML(repo, "analyzer:\n differentiators:\n - ['a', 5]\n");
335
+ await expect(Load(repo)).rejects.toBeInstanceOf(YAMLTypeError);
336
+ });
337
+ // WHY: a list field given a non-sequence value is a type error; a scalar where
338
+ // a list is expected must throw matchably, not coerce.
339
+ it("throws YAMLTypeError when a list field is given a scalar", async () => {
340
+ const repo = newRepo();
341
+ writeConfigYML(repo, "markdown:\n ignored_dirs: oops\n");
342
+ await expect(Load(repo)).rejects.toBeInstanceOf(YAMLTypeError);
343
+ });
344
+ // WHY: a list element of the wrong type (number inside a string list) must be
345
+ // caught per-element with the index in the message, so the user finds the bad
346
+ // entry.
347
+ it("throws YAMLTypeError when a list element is the wrong type", async () => {
348
+ const repo = newRepo();
349
+ writeConfigYML(repo, "markdown:\n ignored_dirs: ['ok', 5]\n");
350
+ await expect(Load(repo)).rejects.toBeInstanceOf(YAMLTypeError);
351
+ });
352
+ // WHY: a string field given a non-string (number) must throw matchably; the
353
+ // separator is rendered verbatim into reports, so a number would corrupt them.
354
+ it("throws YAMLTypeError when a string field is given a number", async () => {
355
+ const repo = newRepo();
356
+ writeConfigYML(repo, "report:\n separator: 5\n");
357
+ await expect(Load(repo)).rejects.toBeInstanceOf(YAMLTypeError);
358
+ });
359
+ // WHY: heading_blacklist_extra appends to the (possibly replaced) base list,
360
+ // mirroring ignored_dirs_extra; the analyzer's heading filter must see both
361
+ // the base and the user's additions.
362
+ it("appends analyzer.heading_blacklist_extra to the base list", async () => {
363
+ const repo = newRepo();
364
+ writeConfigYML(repo, "analyzer:\n heading_blacklist_extra: ['custom-h']\n");
365
+ const cfg = await Load(repo);
366
+ const base = Default().Analyzer.heading_blacklist;
367
+ expect(cfg.Analyzer.heading_blacklist).toHaveLength(base.length + 1);
368
+ expect(cfg.Analyzer.heading_blacklist.at(-1)).toBe("custom-h");
369
+ });
370
+ // WHY: *_extra appends to the REPLACED list, not the default — a user who both
371
+ // replaces and extends must get exactly their replacement plus their extras.
372
+ it("appends *_extra onto a replaced base list (not the default)", async () => {
373
+ const repo = newRepo();
374
+ writeConfigYML(repo, "markdown:\n ignored_dirs: ['base']\n ignored_dirs_extra: ['more']\n");
375
+ const cfg = await Load(repo);
376
+ expect(cfg.Markdown.ignored_dirs).toEqual(["base", "more"]);
377
+ });
378
+ // WHY: validation guards the remaining ranges too; an out-of-range value for
379
+ // any of these must fail at load rather than misbehave deep in the pipeline.
380
+ it.each([
381
+ ["heading_token_min_len = 0", "markdown:\n heading_token_min_len: 0\n"],
382
+ ["distinctive_abs_min = 0", "analyzer:\n distinctive_abs_min: 0\n"],
383
+ [
384
+ "distinctive_pct_of_headings = 0",
385
+ "analyzer:\n distinctive_pct_of_headings: 0\n",
386
+ ],
387
+ [
388
+ "distinctive_pct_of_headings > 1",
389
+ "analyzer:\n distinctive_pct_of_headings: 1.5\n",
390
+ ],
391
+ ["preview_chars = 0", "report:\n preview_chars: 0\n"],
392
+ ["wrap_cols = 0", "report:\n wrap_cols: 0\n"],
393
+ ["thresh_maybe = 0", "analyzer:\n thresh_maybe: 0\n"],
394
+ ])("rejects out-of-range value: %s", async (_name, yaml) => {
395
+ const repo = newRepo();
396
+ writeConfigYML(repo, yaml);
397
+ await expect(Load(repo)).rejects.toThrow();
398
+ });
399
+ // WHY: an I/O fault other than "file missing" (here: config.yml is a
400
+ // directory, which yields EISDIR on read) must surface as a wrapped error with
401
+ // the cause attached — never silently fall back to defaults (Rule 12).
402
+ it("throws (not falls back) when config.yml cannot be read for a non-ENOENT reason", async () => {
403
+ const repo = newRepo();
404
+ const dedupDir = join(repo, ".docgov", "dedup");
405
+ mkdirSync(dedupDir, { recursive: true });
406
+ // Make config.yml a directory so reading it fails with EISDIR, not ENOENT.
407
+ mkdirSync(join(dedupDir, "config.yml"));
408
+ await expect(Load(repo)).rejects.toThrow(/configload\.Load: read/);
409
+ });
410
+ });
@@ -0,0 +1,203 @@
1
+ // Drives the REAL dedup Index() facade end-to-end with the transformers model
2
+ // mocked, so the whole Index path (config load, docs walk, section/block
3
+ // extraction, embed via the real Embedder/Session, persist to index.db) runs
4
+ // against fake vectors with no ~1GB download.
5
+ //
6
+ // vi.mock is hoisted above the imports and intercepts the dynamic
7
+ // `await import("@huggingface/transformers")` inside session.ts, which
8
+ // Embedder.newEmbedder (called by Index) reaches through. The real-embedder
9
+ // Index path is otherwise only covered by the env-gated smoke test in
10
+ // dedup.test.ts; this is the deterministic coverage of the facade.
11
+ import { existsSync, mkdtempSync, mkdirSync, rmSync, writeFileSync } from "node:fs";
12
+ import { tmpdir } from "node:os";
13
+ import { dirname, join } from "node:path";
14
+ import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
15
+ import { Dimension } from "./embedder/constants.js";
16
+ import { ErrIndexMissing } from "./indexdb/index.js";
17
+ import { Analyze, Index } from "./dedup.js";
18
// Mutable state shared between the tests and the mocked
// "@huggingface/transformers" module. It is created through vi.hoisted so it
// already exists when the hoisted vi.mock factory refers to it.
//   env            - handed back as the mocked module's `env` export.
//   pipelineThrows - when set, the mocked pipeline() factory throws this value,
//                    which exercises Index's "new embedder" failure path.
const mockState = vi.hoisted(() => {
    const initialState = {
        env: { cacheDir: "" },
        pipelineThrows: null,
    };
    return initialState;
});
24
/**
 * Builds a deterministic one-hot row: `dim` zeros with a single 1 placed at
 * index `i % dim`.
 *
 * Each row has unit L2 norm, and rows whose indices differ modulo `dim` are
 * orthogonal (their dot product is 0). The intent, per the fixture's original
 * note, is that distinct sections never collapse into a HIGH group and that
 * the embedder's dimension guard and the analyzer's dot-product-as-cosine
 * shortcut both hold — NOTE(review): those consumers are outside this file.
 *
 * @param {number} i   Position of the text within the batch.
 * @param {number} dim Vector length.
 * @returns {number[]} The one-hot row.
 */
function l2row(i, dim) {
    const row = Array(dim).fill(0);
    const hotIndex = i % dim;
    row[hotIndex] = 1;
    return row;
}
33
// Stand-in for "@huggingface/transformers". The module exposes:
//   env      - the shared mockState.env object.
//   pipeline - async factory that throws mockState.pipelineThrows when it is
//              set; otherwise it resolves to a fake extractor.
// The fake extractor maps a batch of texts to one l2row per batch position and
// returns it in three shapes: a flat Float32Array (`data`), its
// [batch, Dimension] shape (`dims`), and the nested rows (`tolist()`).
vi.mock("@huggingface/transformers", () => {
    async function pipeline(_task, _model) {
        const injectedFailure = mockState.pipelineThrows;
        if (injectedFailure) {
            throw injectedFailure;
        }
        return async (texts, _opts) => {
            const rows = texts.map((_text, position) => l2row(position, Dimension));
            const flattened = rows.flat();
            return {
                data: new Float32Array(flattened),
                dims: [texts.length, Dimension],
                tolist: () => rows,
            };
        };
    }
    return { pipeline, env: mockState.env };
});
49
+ // --- temp dir plumbing -----------------------------------------------------
50
// Registry of every temp directory created during a test; afterEach drains it
// and deletes each entry.
const tmpDirs = [];
// Value of DOCGOV_MODEL_CACHE captured in beforeEach so afterEach can restore it.
let savedModelCache;
/**
 * Creates a fresh "dedup-index-*" directory under the OS temp dir to act as a
 * repo root, and registers it in `tmpDirs` for cleanup.
 *
 * @returns {string} Path of the new directory.
 */
function newRepoRoot() {
    const prefix = join(tmpdir(), "dedup-index-");
    const created = mkdtempSync(prefix);
    tmpDirs.push(created);
    return created;
}
57
/**
 * Materializes a set of files under `repoRoot`.
 *
 * @param {string} repoRoot Directory that every relative path is resolved against.
 * @param {Object<string, string>} files Map of repo-relative path to file content.
 *   Missing parent directories are created; an existing file is overwritten.
 */
function writeFiles(repoRoot, files) {
    Object.keys(files).forEach((relPath) => {
        const target = join(repoRoot, relPath);
        const parent = dirname(target);
        mkdirSync(parent, { recursive: true });
        writeFileSync(target, files[relPath]);
    });
}
64
/**
 * Path at which these tests expect the index database for a given repo root:
 * `<repoRoot>/.docgov/dedup/index.db`.
 *
 * @param {string} repoRoot Repository root directory.
 * @returns {string} Path to index.db under that root.
 */
function dbPathOf(repoRoot) {
    const segments = [".docgov", "dedup", "index.db"];
    return join(repoRoot, ...segments);
}
67
// Do-nothing callback handed to Index() as its second argument (presumably its
// progress reporter) by tests that do not inspect progress.
const noProgress = () => undefined;
68
// Per-test setup: reset the mock's state and redirect the model cache.
beforeEach(() => {
    // Start every test with a healthy mocked model and a blank env.
    mockState.pipelineThrows = null;
    mockState.env.cacheDir = "";
    // Remember the caller's DOCGOV_MODEL_CACHE, then point it at a throwaway
    // temp dir. Per the fixture's original note, Index() builds its embedder
    // without an explicit cacheDir, which would otherwise resolve to the real
    // home cache — NOTE(review): that resolution happens outside this file.
    savedModelCache = process.env.DOCGOV_MODEL_CACHE;
    const throwawayCache = mkdtempSync(join(tmpdir(), "dedup-index-model-"));
    tmpDirs.push(throwawayCache);
    process.env.DOCGOV_MODEL_CACHE = throwawayCache;
});
79
// Per-test teardown: restore DOCGOV_MODEL_CACHE and remove every temp
// directory registered during the test.
afterEach(() => {
    if (savedModelCache !== undefined) {
        process.env.DOCGOV_MODEL_CACHE = savedModelCache;
    } else {
        // The variable was unset before the test, so unset it again rather
        // than leaving a value behind.
        delete process.env.DOCGOV_MODEL_CACHE;
    }
    // splice(0) empties the registry and yields its former contents, so the
    // next test starts with an empty list.
    const toRemove = tmpDirs.splice(0);
    toRemove.forEach((dir) => {
        rmSync(dir, { recursive: true, force: true });
    });
});
87
describe("dedup.Index (mocked model)", () => {
    // Renders a single-section markdown document: an H2 heading, a blank line,
    // then one paragraph.
    const asSection = (heading, paragraph) => `## ${heading}\n\n${paragraph}\n`;

    // WHY: the central end-to-end check. The whole Index() facade runs for real
    // with only the model mocked, so a regression anywhere along that path
    // shows up either as different stats or as a missing DB file — both are
    // asserted. Analyze is then run on the persisted state to prove the index
    // round-trips instead of throwing ErrIndexMissing.
    it("indexes a docs corpus, persists the DB, and Analyze reads it back", async () => {
        const repoRoot = newRepoRoot();
        const alphaBody = "Alpha covers the ingestion side of the system, including the upload queue, the validation gateway, and the retry scheduler for failed batches.";
        const betaBody = "Beta documents the reporting surface, covering scheduled exports, the metrics rollup job, and the long-term archival of historical aggregates.";
        writeFiles(repoRoot, {
            "docs/a.md": asSection("Alpha Overview", alphaBody),
            "docs/b.md": asSection("Beta Overview", betaBody),
        });

        const { sections, embedded, pruned } = await Index(repoRoot, noProgress);

        // Fresh DB: both sections are counted and embedded, nothing is pruned.
        expect(sections).toBe(2);
        expect(embedded).toBe(2);
        expect(pruned).toBe(0);

        // The index file must exist at its canonical location.
        const dbPath = dbPathOf(repoRoot);
        expect(existsSync(dbPath)).toBe(true);

        // Analyze must resolve over the index just built. The mocked vectors
        // for the two sections are orthogonal, so no HIGH group is expected.
        const { HighGroups } = await Analyze(repoRoot);
        expect(HighGroups).toHaveLength(0);
    });

    // WHY: indexing unchanged docs a second time must embed nothing while still
    // reporting the full live section count. This pins the incremental-embed
    // wiring; a regression that re-embedded everything would make every run pay
    // the full cost without any visible failure.
    it("re-indexing unchanged docs embeds nothing the second time", async () => {
        const repoRoot = newRepoRoot();
        const guide = "## Introduction\n\nThis is the introduction section of the guide and it covers the basic concepts and provides a thorough overview of the whole system.\n";
        writeFiles(repoRoot, { "docs/guide.md": guide });

        const firstRun = await Index(repoRoot, noProgress);
        expect(firstRun.sections).toBe(1);
        expect(firstRun.embedded).toBe(1);

        const secondRun = await Index(repoRoot, noProgress);
        expect(secondRun.sections).toBe(1);
        expect(secondRun.embedded).toBe(0);
        expect(secondRun.pruned).toBe(0);
    });
});
134
describe("dedup.Index error wiring", () => {
    // Minimal single-section document shared by every test in this suite. The
    // paragraph is deliberately long; the fixture's wording says that is what
    // makes the section eligible for indexing.
    const ELIGIBLE_DOC = "## A\n\nThis is a sufficiently long paragraph of prose used purely to make the section eligible.\n";

    // WHY: Index must fail BEFORE touching the model when config is invalid, and
    // it must wrap the cause so the CLI reports "load config" rather than a bare
    // validation message. A regression that loaded defaults on bad config would
    // silently mask the user's broken config.yml.
    it("wraps a config-load failure and never reaches the embedder", async () => {
        const repoRoot = newRepoRoot();
        writeFiles(repoRoot, {
            // thresh_high out of (0,1) range → validate() throws inside Load.
            ".docgov/dedup/config.yml": "analyzer:\n thresh_high: 5\n",
            "docs/a.md": ELIGIBLE_DOC,
        });
        // Arm the mocked model factory so the "never reaches the embedder" half
        // of this test is actually enforced: if Index built the embedder before
        // rejecting the config, it would fail with "new embedder" and the
        // assertion below would not match. beforeEach clears this again.
        mockState.pipelineThrows = new Error("embedder must not be reached");
        await expect(Index(repoRoot, noProgress)).rejects.toThrow(/dedup\.Index: load config/);
    });

    // WHY: a model download/init failure must propagate as a wrapped "new
    // embedder" error (not a raw transformers error and not a silent empty
    // index), so the CLI can tell the user the model could not be loaded.
    it("wraps an embedder-init failure", async () => {
        const repoRoot = newRepoRoot();
        writeFiles(repoRoot, { "docs/a.md": ELIGIBLE_DOC });
        mockState.pipelineThrows = new Error("model fetch failed");
        await expect(Index(repoRoot, noProgress)).rejects.toThrow(/dedup\.Index: new embedder/);
    });

    // WHY: a corrupt DB file must surface as a wrapped "open db" error, not a
    // confusing low-level SQLite message — this is the open-failure branch the
    // facade exists to translate.
    it("wraps an open-db failure on a corrupt index file", async () => {
        const repoRoot = newRepoRoot();
        writeFiles(repoRoot, {
            ".docgov/dedup/index.db": "this is not a sqlite database",
            "docs/a.md": ELIGIBLE_DOC,
        });
        await expect(Index(repoRoot, noProgress)).rejects.toThrow(/dedup\.Index: open db/);
    });
});
171
describe("dedup.Analyze error wiring", () => {
    // WHY: with no index on disk, Analyze must reject with the ErrIndexMissing
    // sentinel so callers can match on it (the CLI uses it to prompt "run index
    // first") instead of seeing a generic failure.
    it("throws ErrIndexMissing when no index DB exists", async () => {
        const emptyRepo = newRepoRoot();
        const pending = Analyze(emptyRepo);
        await expect(pending).rejects.toBeInstanceOf(ErrIndexMissing);
    });

    // WHY: even when a DB is present, Analyze must validate config and wrap a
    // config-load failure — the same broken-config signal Index gives, on the
    // read path.
    it("wraps a config-load failure when the index exists", async () => {
        const repoRoot = newRepoRoot();
        // Step 1: build a real index while the config is still valid.
        const doc = "## A\n\nThis is a sufficiently long paragraph of prose used purely to make the section eligible.\n";
        writeFiles(repoRoot, { "docs/a.md": doc });
        await Index(repoRoot, noProgress);
        // Step 2: drop in a config whose thresh_high is out of range.
        const badConfig = "analyzer:\n thresh_high: 5\n";
        writeFiles(repoRoot, { ".docgov/dedup/config.yml": badConfig });
        await expect(Analyze(repoRoot)).rejects.toThrow(/dedup\.Analyze: load config/);
    });

    // WHY: a corrupt DB on the read path must come back as a wrapped "open db"
    // error rather than a raw SQLite message — the facade's open-failure
    // translation, exercised through Analyze.
    it("wraps an open-db failure on a corrupt index file", async () => {
        const repoRoot = newRepoRoot();
        const notADatabase = "this is not a sqlite database";
        writeFiles(repoRoot, { ".docgov/dedup/index.db": notADatabase });
        await expect(Analyze(repoRoot)).rejects.toThrow(/dedup\.Analyze: open db/);
    });
});