docsgov 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (159) hide show
  1. package/README.md +242 -0
  2. package/dist/apispec/apispec.js +401 -0
  3. package/dist/apispec/apispec.test.js +444 -0
  4. package/dist/apispec/errors.js +17 -0
  5. package/dist/apispec/index.js +2 -0
  6. package/dist/check/doclinks.js +167 -0
  7. package/dist/check/index.js +8 -0
  8. package/dist/check/run.js +391 -0
  9. package/dist/check/run.test.js +513 -0
  10. package/dist/check/suggest.js +134 -0
  11. package/dist/check/suggest.test.js +92 -0
  12. package/dist/check/tokens.js +125 -0
  13. package/dist/cmd/main.js +330 -0
  14. package/dist/cmd/main.test.js +422 -0
  15. package/dist/codeq/cache.js +71 -0
  16. package/dist/codeq/cache.test.js +67 -0
  17. package/dist/codeq/errors.js +52 -0
  18. package/dist/codeq/grammars/tree-sitter-go.wasm +0 -0
  19. package/dist/codeq/grammars/tree-sitter-java.wasm +0 -0
  20. package/dist/codeq/grammars/tree-sitter-javascript.wasm +0 -0
  21. package/dist/codeq/grammars/tree-sitter-tsx.wasm +0 -0
  22. package/dist/codeq/grammars/tree-sitter-typescript.wasm +0 -0
  23. package/dist/codeq/index.js +11 -0
  24. package/dist/codeq/resolve.test.js +109 -0
  25. package/dist/codeq/resolver.js +128 -0
  26. package/dist/codeq/resolver.test.js +124 -0
  27. package/dist/codeq/resolvers/go.js +242 -0
  28. package/dist/codeq/resolvers/go.test.js +143 -0
  29. package/dist/codeq/resolvers/java.js +349 -0
  30. package/dist/codeq/resolvers/java.test.js +138 -0
  31. package/dist/codeq/resolvers/java_queries.js +63 -0
  32. package/dist/codeq/resolvers/javascript.js +412 -0
  33. package/dist/codeq/resolvers/javascript.test.js +125 -0
  34. package/dist/codeq/resolvers/javascript_queries.js +46 -0
  35. package/dist/codeq/resolvers/typescript.js +366 -0
  36. package/dist/codeq/resolvers/typescript.test.js +180 -0
  37. package/dist/codeq/resolvers/typescript_queries.js +78 -0
  38. package/dist/codeq/signature.js +50 -0
  39. package/dist/codeq/signature.test.js +50 -0
  40. package/dist/codeq/suggest.js +96 -0
  41. package/dist/codeq/treesitter.js +122 -0
  42. package/dist/codeq/treesitter.test.js +118 -0
  43. package/dist/config/config.js +74 -0
  44. package/dist/config/config.test.js +98 -0
  45. package/dist/config/fs.js +116 -0
  46. package/dist/config/glob.js +82 -0
  47. package/dist/config/glob.test.js +61 -0
  48. package/dist/config/index.js +4 -0
  49. package/dist/dedup/analyzer/analyzer.js +533 -0
  50. package/dist/dedup/analyzer/analyzer.test.js +530 -0
  51. package/dist/dedup/analyzer/canonical.js +74 -0
  52. package/dist/dedup/analyzer/canonical.test.js +70 -0
  53. package/dist/dedup/analyzer/cosine_clusters.js +169 -0
  54. package/dist/dedup/analyzer/cosine_clusters.test.js +131 -0
  55. package/dist/dedup/analyzer/distinctive.js +85 -0
  56. package/dist/dedup/analyzer/distinctive.test.js +49 -0
  57. package/dist/dedup/analyzer/exact_clusters.js +63 -0
  58. package/dist/dedup/analyzer/exact_clusters.test.js +81 -0
  59. package/dist/dedup/analyzer/index.js +14 -0
  60. package/dist/dedup/analyzer/multiplicity.js +110 -0
  61. package/dist/dedup/analyzer/multiplicity.test.js +123 -0
  62. package/dist/dedup/analyzer/order.js +22 -0
  63. package/dist/dedup/analyzer/partial_overlaps.js +65 -0
  64. package/dist/dedup/analyzer/partial_overlaps.test.js +161 -0
  65. package/dist/dedup/analyzer/preview.js +84 -0
  66. package/dist/dedup/analyzer/preview.test.js +46 -0
  67. package/dist/dedup/analyzer/safety.js +27 -0
  68. package/dist/dedup/analyzer/safety.test.js +39 -0
  69. package/dist/dedup/config.js +18 -0
  70. package/dist/dedup/configload.js +299 -0
  71. package/dist/dedup/configload.test.js +410 -0
  72. package/dist/dedup/dedup.index.test.js +203 -0
  73. package/dist/dedup/dedup.js +143 -0
  74. package/dist/dedup/dedup.test.js +212 -0
  75. package/dist/dedup/dedupcfg/config.js +112 -0
  76. package/dist/dedup/dedupcfg/config.test.js +70 -0
  77. package/dist/dedup/dedupcfg/index.js +1 -0
  78. package/dist/dedup/deduptypes/index.js +1 -0
  79. package/dist/dedup/deduptypes/types.js +9 -0
  80. package/dist/dedup/deduptypes/types.test.js +34 -0
  81. package/dist/dedup/embedder/cache.js +23 -0
  82. package/dist/dedup/embedder/cache.test.js +50 -0
  83. package/dist/dedup/embedder/constants.js +10 -0
  84. package/dist/dedup/embedder/embedder.js +76 -0
  85. package/dist/dedup/embedder/embedder.mock.test.js +128 -0
  86. package/dist/dedup/embedder/embedder.test.js +96 -0
  87. package/dist/dedup/embedder/errors.js +20 -0
  88. package/dist/dedup/embedder/errors.test.js +35 -0
  89. package/dist/dedup/embedder/index.js +4 -0
  90. package/dist/dedup/embedder/session.js +78 -0
  91. package/dist/dedup/embedder/session.test.js +172 -0
  92. package/dist/dedup/gitignore.js +97 -0
  93. package/dist/dedup/gitignore.test.js +98 -0
  94. package/dist/dedup/index.js +11 -0
  95. package/dist/dedup/indexdb/errors.js +48 -0
  96. package/dist/dedup/indexdb/index.js +6 -0
  97. package/dist/dedup/indexdb/indexdb.js +302 -0
  98. package/dist/dedup/indexdb/indexdb.test.js +739 -0
  99. package/dist/dedup/indexdb/load.js +110 -0
  100. package/dist/dedup/indexdb/migrations.js +58 -0
  101. package/dist/dedup/indexdb/schema.js +83 -0
  102. package/dist/dedup/indexer/index.js +9 -0
  103. package/dist/dedup/indexer/indexer.js +501 -0
  104. package/dist/dedup/indexer/indexer.test.js +510 -0
  105. package/dist/dedup/indexer/links.js +89 -0
  106. package/dist/dedup/mdsection/anchor.js +60 -0
  107. package/dist/dedup/mdsection/anchor.test.js +39 -0
  108. package/dist/dedup/mdsection/blocks.js +409 -0
  109. package/dist/dedup/mdsection/blocks.test.js +359 -0
  110. package/dist/dedup/mdsection/index.js +4 -0
  111. package/dist/dedup/mdsection/parse.js +21 -0
  112. package/dist/dedup/mdsection/section.js +234 -0
  113. package/dist/dedup/mdsection/section.test.js +221 -0
  114. package/dist/dedup/report/floatfmt.js +71 -0
  115. package/dist/dedup/report/floatfmt.test.js +42 -0
  116. package/dist/dedup/report/index.js +8 -0
  117. package/dist/dedup/report/quote.js +77 -0
  118. package/dist/dedup/report/quote.test.js +67 -0
  119. package/dist/dedup/report/text.js +251 -0
  120. package/dist/dedup/report/text.test.js +420 -0
  121. package/dist/dedup/report_types.js +8 -0
  122. package/dist/dedup/sectionid/index.js +1 -0
  123. package/dist/dedup/sectionid/sectionid.js +16 -0
  124. package/dist/dedup/sectionid/sectionid.test.js +49 -0
  125. package/dist/guard/api/errors.js +12 -0
  126. package/dist/guard/api/index.js +2 -0
  127. package/dist/guard/api/parser.js +81 -0
  128. package/dist/guard/api/parser.test.js +58 -0
  129. package/dist/guard/api/types.js +1 -0
  130. package/dist/guard/code/errors.js +16 -0
  131. package/dist/guard/code/index.js +2 -0
  132. package/dist/guard/code/parser.js +54 -0
  133. package/dist/guard/code/parser.test.js +111 -0
  134. package/dist/guard/code/types.js +6 -0
  135. package/dist/index.js +1 -0
  136. package/dist/index.test.js +5 -0
  137. package/dist/repo/boundary.js +92 -0
  138. package/dist/repo/boundary.test.js +65 -0
  139. package/dist/repo/errors.js +56 -0
  140. package/dist/repo/errors.test.js +85 -0
  141. package/dist/repo/exists.test.js +72 -0
  142. package/dist/repo/filename.js +46 -0
  143. package/dist/repo/filename.test.js +39 -0
  144. package/dist/repo/fs.js +53 -0
  145. package/dist/repo/index.js +7 -0
  146. package/dist/repo/overlay.js +36 -0
  147. package/dist/repo/overlay.test.js +80 -0
  148. package/dist/repo/repo.js +353 -0
  149. package/dist/repo/repo.test.js +255 -0
  150. package/dist/repo/testutil.js +27 -0
  151. package/dist/repo/write.test.js +125 -0
  152. package/dist/report/color.js +73 -0
  153. package/dist/report/index.js +1 -0
  154. package/dist/report/report.js +112 -0
  155. package/dist/report/report.test.js +368 -0
  156. package/dist/violation/index.js +1 -0
  157. package/dist/violation/types.js +22 -0
  158. package/dist/violation/types.test.js +70 -0
  159. package/package.json +48 -0
@@ -0,0 +1,143 @@
1
+ // Top-level facade for the docgov dedup subsystem: the Index and Analyze entry
2
+ // points the CLI calls.
3
+ //
4
+ // Ported from internal/dedup/dedup.go. Index walks <repoRoot>/docs/, extracts
5
+ // eligible sections, embeds new/changed ones, and persists the index to
6
+ // .docgov/dedup/index.db. Analyze loads that index and runs the layered
7
+ // duplicate-detection algorithm, returning a structured Report.
8
+ //
9
+ // Reconciling the mixed sync/async stack vs Go:
10
+ // - ctx is dropped throughout (the ported embedder/indexdb already dropped it);
11
+ // - indexdb.open is synchronous and THROWS on failure (Go returned an error),
12
+ // so the Go `defer store.Close()` becomes a try/finally;
13
+ // - embedder.newEmbedder is async (model load); the Go `defer emb.Close()` and
14
+ // its real close are wrapped in try/finally;
15
+ // - the analyzer wants Map<string, number[]>; the indexdb loaders return
16
+ // Map<string, Float32Array>, so vectors are converted before the call.
17
+ import { existsSync, promises as fsp } from "node:fs";
18
+ import * as path from "node:path";
19
+ import { analyze } from "./analyzer/index.js";
20
+ import { Dimension, Embedder, Model } from "./embedder/index.js";
21
+ import { ErrIndexMissing, open } from "./indexdb/index.js";
22
+ import { run } from "./indexer/index.js";
23
+ import { Load } from "./configload.js";
24
+ import { ensureDedupGitignore } from "./gitignore.js";
25
/**
 * dbPathOf computes where the dedup index database lives for a repository.
 *
 * @param {string} repoRoot - Repository root directory.
 * @returns {string} repoRoot joined with ".docgov", "dedup" and "index.db".
 */
function dbPathOf(repoRoot) {
    const relativeParts = [".docgov", "dedup", "index.db"];
    return path.join(repoRoot, ...relativeParts);
}
29
/**
 * Index builds (or refreshes) the dedup index database for a repository.
 *
 * The steps run strictly in order: load the dedup config, create the index
 * directory, ensure its gitignore, open the index DB, construct the embedder,
 * then hand everything to the indexer's `run`. A failure in any of those steps
 * is rethrown as an Error whose message is `dedup.Index: <step>: <original>`
 * and whose `cause` is the original error.
 *
 * The embedder is closed before the store, and both are closed whether or not
 * indexing succeeded.
 *
 * @param repoRoot - Repository root; the DB lives at .docgov/dedup/index.db under it.
 * @param progress - Forwarded unchanged to the indexer's `run`.
 * @returns `{ sections, embedded, pruned }` copied from the indexer's stats.
 */
export async function Index(repoRoot, progress) {
    // One place that shapes the wrapped error for a failed step.
    const stepFailed = (step, err) => new Error(`dedup.Index: ${step}: ${String(err)}`, { cause: err });
    let cfg;
    try {
        cfg = await Load(repoRoot);
    }
    catch (err) {
        throw stepFailed("load config", err);
    }
    const dbPath = dbPathOf(repoRoot);
    const dbDir = path.dirname(dbPath);
    try {
        await fsp.mkdir(dbDir, { recursive: true });
    }
    catch (err) {
        throw stepFailed("create db dir", err);
    }
    try {
        await ensureDedupGitignore(dbDir);
    }
    catch (err) {
        throw stepFailed("write gitignore", err);
    }
    let store;
    try {
        const opened = open(dbPath, Model, Dimension);
        store = opened.store;
    }
    catch (err) {
        throw stepFailed("open db", err);
    }
    // Outer scope owns the store; inner scope owns the embedder.
    try {
        let emb;
        try {
            emb = await Embedder.newEmbedder();
        }
        catch (err) {
            throw stepFailed("new embedder", err);
        }
        try {
            let stats;
            try {
                stats = await run(store, emb, repoRoot, cfg, progress);
            }
            catch (err) {
                throw stepFailed("run", err);
            }
            // Return only the three counters, not the stats object itself.
            const { sections, embedded, pruned } = stats;
            return { sections, embedded, pruned };
        }
        finally {
            await emb.close();
        }
    }
    finally {
        store.close();
    }
}
94
/**
 * Analyze runs duplicate detection over a previously built index.
 *
 * It checks that .docgov/dedup/index.db exists, loads the dedup config, opens
 * the DB, reads all sections and blocks with their embeddings, converts the
 * embedding maps with toNumberMap, and returns whatever `analyze` returns. The
 * store is closed on every path once it has been opened.
 *
 * @param repoRoot - Repository root containing .docgov/dedup/index.db.
 * @returns The result of `analyze(sections, sectionEmbeddings, blocks, blockEmbeddings, cfg)`.
 * @throws {ErrIndexMissing} Thrown directly (not wrapped) when the DB file does
 *   not exist, so callers can match it with `instanceof`.
 * @throws {Error} `dedup.Analyze: load config: …` or `dedup.Analyze: open db: …`,
 *   each carrying the original error as `cause`.
 */
export async function Analyze(repoRoot) {
    const dbPath = dbPathOf(repoRoot);
    const indexPresent = existsSync(dbPath);
    if (!indexPresent) {
        // The sentinel itself is thrown so `instanceof ErrIndexMissing` works
        // (the counterpart of Go's errors.Is against indexdb.ErrIndexMissing).
        throw new ErrIndexMissing("dedup.Analyze: indexdb: index database does not exist");
    }
    let cfg;
    try {
        cfg = await Load(repoRoot);
    }
    catch (err) {
        throw new Error(`dedup.Analyze: load config: ${String(err)}`, { cause: err });
    }
    let store;
    try {
        const opened = open(dbPath, Model, Dimension);
        store = opened.store;
    }
    catch (err) {
        throw new Error(`dedup.Analyze: open db: ${String(err)}`, { cause: err });
    }
    try {
        const sectionData = store.loadAllSectionsWithEmbeddings();
        const blockData = store.loadAllBlocksWithEmbeddings();
        const sectionVectors = toNumberMap(sectionData.embeddings);
        const blockVectors = toNumberMap(blockData.embeddings);
        return analyze(sectionData.sections, sectionVectors, blockData.blocks, blockVectors, cfg);
    }
    finally {
        store.close();
    }
}
130
/**
 * toNumberMap copies an embedding map, turning each value into a plain array.
 *
 * Keys and their iteration order are kept; every value is passed through
 * Array.from, so a Float32Array value becomes a number[] holding the same
 * elements. The input map is not modified.
 *
 * @param m - Iterable of [key, vector] pairs (a Map in the callers shown here).
 * @returns A new Map with the same keys and array-valued vectors.
 */
function toNumberMap(m) {
    const converted = [...m].map(([key, vector]) => [key, Array.from(vector)]);
    return new Map(converted);
}
@@ -0,0 +1,212 @@
1
+ /**
2
+ * Behavior-encoding tests for the dedup facade (Index + Analyze), ported from
3
+ * internal/dedup/dedup_test.go and extended with a fake-embedder integration
4
+ * test driving the facade's Analyze over a really-indexed inline corpus.
5
+ *
6
+ * WHY each case matters:
7
+ * - Analyze on a missing index must throw ErrIndexMissing itself (not a wrapper), so the
8
+ * CLI can tell "run `dedup index` first" apart from a real failure;
9
+ * - Analyze on an empty index is a valid "no duplicates" state, NOT an error;
10
+ * - Analyze must surface L5 PartialOverlaps from blocks loaded out of the DB —
11
+ * the facade wiring (loading blocks + passing them to the analyzer) is the
12
+ * contract a regression to nil-blocks would silently break.
13
+ *
14
+ * The fake-embedder integration indexes two inline docs that share a verbatim
15
+ * prose block via the indexer (no model download), then asserts dedup.Analyze
16
+ * returns exactly the expected exact-overlap cluster — exercising the whole
17
+ * facade Analyze path on real persisted state.
18
+ *
19
+ * The real-embedder Index path is covered by an env-gated smoke test (the only
20
+ * allowed skip: it would download the ~1GB model).
21
+ */
22
+ import { mkdtempSync, mkdirSync, rmSync, writeFileSync } from "node:fs";
23
+ import { tmpdir } from "node:os";
24
+ import { dirname, join } from "node:path";
25
+ import { DatabaseSync } from "node:sqlite";
26
+ import { afterEach, describe, expect, it } from "vitest";
27
+ import { Dimension, Model } from "./embedder/index.js";
28
+ import { ErrIndexMissing, open } from "./indexdb/index.js";
29
+ import { encodeVec, run } from "./indexer/index.js";
30
+ import { Default } from "./config.js";
31
+ import { Analyze, Index } from "./dedup.js";
32
+ // --- fake embedder ---------------------------------------------------------
33
/**
 * unitVec builds a deterministic one-hot vector.
 *
 * @param {number} seed - Selects the hot position as `seed % dim`.
 * @param {number} dim - Length of the vector.
 * @returns {number[]} An array of `dim` zeros with 1.0 written at `seed % dim`.
 */
function unitVec(seed, dim) {
    const vec = new Array(dim).fill(0);
    const hotIndex = seed % dim;
    vec[hotIndex] = 1.0;
    return vec;
}
39
/**
 * FakeEmbedder is a model-free stand-in for the real embedder.
 *
 * embed() ignores the text content and returns, for the text at batch position
 * i, the one-hot vector unitVec(i, Dimension). Texts at different positions in
 * one batch therefore get different hot indices as long as the batch is no
 * longer than Dimension. name() and dimension() report the real Model and
 * Dimension constants.
 */
class FakeEmbedder {
    /** Returns one unitVec per input text, keyed only by batch position. */
    async embed(texts) {
        return texts.map((_text, position) => unitVec(position, Dimension));
    }
    /** Reports the Model constant. */
    name() {
        return Model;
    }
    /** Reports the Dimension constant. */
    dimension() {
        return Dimension;
    }
}
56
+ // --- temp dir plumbing -----------------------------------------------------
57
+ const tmpDirs = [];
58
+ afterEach(() => {
59
+ for (const d of tmpDirs.splice(0)) {
60
+ rmSync(d, { recursive: true, force: true });
61
+ }
62
+ });
63
+ /** newRepoRoot returns a fresh empty temp repo root. */
64
+ function newRepoRoot() {
65
+ const dir = mkdtempSync(join(tmpdir(), "dedup-facade-"));
66
+ tmpDirs.push(dir);
67
+ return dir;
68
+ }
69
+ /** writeFiles writes relPath→content under repoRoot, creating parents. */
70
+ function writeFiles(repoRoot, files) {
71
+ for (const [rel, content] of Object.entries(files)) {
72
+ const full = join(repoRoot, rel);
73
+ mkdirSync(dirname(full), { recursive: true });
74
+ writeFileSync(full, content);
75
+ }
76
+ }
77
+ /** dbPathOf mirrors the facade's index DB location. */
78
+ function dbPathOf(repoRoot) {
79
+ return join(repoRoot, ".docgov", "dedup", "index.db");
80
+ }
81
+ /** createEmptyIndexDB opens (and immediately closes) a valid empty index DB. */
82
+ function createEmptyIndexDB(repoRoot) {
83
+ const dbPath = dbPathOf(repoRoot);
84
+ mkdirSync(dirname(dbPath), { recursive: true });
85
+ const { store } = open(dbPath, Model, Dimension);
86
+ store.close();
87
+ }
88
+ const noProgress = () => { };
89
+ describe("dedup.Analyze", () => {
90
+ // WHY: a missing index must be a distinct, matchable error so the CLI can
91
+ // prompt the user to index first rather than reporting a generic failure.
92
+ it("throws ErrIndexMissing when the index DB does not exist", async () => {
93
+ const repoRoot = newRepoRoot(); // empty, no .docgov/dedup/index.db
94
+ await expect(Analyze(repoRoot)).rejects.toBeInstanceOf(ErrIndexMissing);
95
+ });
96
+ // WHY: an existing but empty index is a valid "no duplicates" state — Analyze
97
+ // must return an empty Report, NOT an error and NOT ErrIndexMissing.
98
+ it("returns an empty Report for an empty index", async () => {
99
+ const repoRoot = newRepoRoot();
100
+ createEmptyIndexDB(repoRoot);
101
+ const report = await Analyze(repoRoot);
102
+ expect(report.HighGroups).toHaveLength(0);
103
+ expect(report.MaybePairs).toHaveLength(0);
104
+ expect(report.PartialOverlaps).toHaveLength(0);
105
+ });
106
+ // WHY: the facade must load blocks from the DB and feed them to the analyzer so
107
+ // a verbatim block shared across two files surfaces as exactly one exact
108
+ // PartialOverlaps cluster. This pins the block-loading wiring (Go's P7.T1 fix);
109
+ // a regression to nil-blocks would silently return zero clusters here.
110
+ // Seeds the DB directly (mirrors Go's TestAnalyze_EndToEnd_PartialOverlap) to
111
+ // assert the wiring independently of the markdown extractor.
112
+ it("returns one exact PartialOverlaps cluster for a shared verbatim block", async () => {
113
+ const repoRoot = newRepoRoot();
114
+ const dbPath = dbPathOf(repoRoot);
115
+ mkdirSync(dirname(dbPath), { recursive: true });
116
+ const { store } = open(dbPath, Model, Dimension);
117
+ // Orthogonal section embeddings so the two sections do NOT form a HIGH group
118
+ // (cosine 0); the block embedding is shared (verbatim-identical block).
119
+ const secAEmb = encodeVec(new Float32Array([1.0, 0.0, 0.0]));
120
+ const secBEmb = encodeVec(new Float32Array([0.0, 1.0, 0.0]));
121
+ const blockEmb = encodeVec(new Float32Array([0.5, 0.5, 0.0]));
122
+ const sharedHash = "aabbccddeeff00112233445566778899aabbccddeeff00112233445566778899";
123
+ store.execTx((db) => {
124
+ const insertSection = db.prepare(`INSERT OR REPLACE INTO sections
125
+ (id, file_path, heading, heading_level, anchor, start_line, end_line,
126
+ content_hash, raw_content, embed_text, prose_word_count, has_table,
127
+ has_code, inbound_count, embedding, updated_at)
128
+ VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`);
129
+ insertSection.run("sec-a-intro", "docs/a.md", "Introduction", 2, "introduction", 1, 20, "hash-a", "## Introduction\n\nContent a.", "content a", 15, 0, 0, 0, secAEmb, "2024-01-01T00:00:00Z");
130
+ insertSection.run("sec-b-intro", "docs/b.md", "Introduction", 2, "introduction", 1, 20, "hash-b", "## Introduction\n\nContent b.", "content b", 15, 0, 0, 0, secBEmb, "2024-01-01T00:00:00Z");
131
+ const insertBlock = db.prepare(`INSERT OR REPLACE INTO blocks
132
+ (section_id, block_index, file_path, heading, kind,
133
+ start_line, end_line, content_hash, embedding)
134
+ VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`);
135
+ insertBlock.run("sec-a-intro", 0, "docs/a.md", "Introduction", "prose", 3, 8, sharedHash, blockEmb);
136
+ insertBlock.run("sec-b-intro", 0, "docs/b.md", "Introduction", "prose", 3, 8, sharedHash, blockEmb);
137
+ });
138
+ store.close();
139
+ const report = await Analyze(repoRoot);
140
+ expect(report.PartialOverlaps).toHaveLength(1);
141
+ const cl = report.PartialOverlaps[0];
142
+ expect(cl.Exact).toBe(true);
143
+ expect(cl.Kind).toBe("prose");
144
+ expect(cl.Locations).toHaveLength(2);
145
+ });
146
+ });
147
describe("dedup facade end-to-end (fake embedder)", () => {
    // Drives Analyze over a DB that the indexer itself populated from two
    // markdown files written to disk. FakeEmbedder replaces the real model, so
    // nothing is downloaded. The assertion is that the paragraph both files
    // share comes back as a single exact PartialOverlaps cluster.
    it("indexes two inline docs sharing a verbatim block and Analyze reports it", async () => {
        const repoRoot = newRepoRoot();
        // Byte-identical in both files. Per the original test notes it is long
        // enough (>10 words) to count as an eligible block.
        const sharedPara = "This shared paragraph appears verbatim in both documents and is long enough to be an eligible duplicate block for detection.";
        // Each file also gets prose of its own, so the two sections are not
        // whole-section duplicates and only the shared paragraph overlaps.
        const aIntro = "Alpha covers the ingestion side of the system, including the upload queue, the validation gateway, and the retry scheduler for failed batches.";
        const bIntro = "Beta documents the reporting surface, covering scheduled exports, the metrics rollup job, and the long-term archival of historical aggregates.";
        const renderDoc = (title, intro) => `## ${title}\n\n${intro}\n\n${sharedPara}\n`;
        writeFiles(repoRoot, {
            "docs/a.md": renderDoc("Alpha Overview", aIntro),
            "docs/b.md": renderDoc("Beta Overview", bIntro),
        });
        // Populate the DB through the indexer's run() with the fake embedder;
        // this is the same DB file Analyze opens below.
        const indexPath = dbPathOf(repoRoot);
        mkdirSync(dirname(indexPath), { recursive: true });
        const { store } = open(indexPath, Model, Dimension);
        try {
            const { sections } = await run(store, new FakeEmbedder(), repoRoot, Default(), noProgress);
            expect(sections).toBeGreaterThanOrEqual(2);
        }
        finally {
            store.close();
        }
        const report = await Analyze(repoRoot);
        // Exactly one exact cluster, spanning both files.
        expect(report.PartialOverlaps).toHaveLength(1);
        const [cluster] = report.PartialOverlaps;
        expect(cluster.Exact).toBe(true);
        const reportedFiles = new Set(cluster.Locations.map((loc) => loc.FilePath));
        expect(reportedFiles).toEqual(new Set(["docs/a.md", "docs/b.md"]));
    });
});
190
describe("dedup.Index (real embedder)", () => {
    // Runs the real Index path, which needs the actual model. The test only
    // executes when DOCGOV_E2E is exactly "1"; otherwise it is skipped so an
    // ordinary run never triggers the model download.
    const skipE2E = process.env["DOCGOV_E2E"] !== "1";
    it.skipIf(skipE2E)("indexes a corpus and persists sections", async () => {
        const repoRoot = newRepoRoot();
        writeFiles(repoRoot, {
            "docs/guide.md": "## Introduction\n\nThis is the introduction section of the guide. It covers the basic concepts and provides an overview of the system.\n",
        });
        const { sections } = await Index(repoRoot, noProgress);
        expect(sections).toBeGreaterThan(0);
        // Open the DB file directly and confirm at least one section row was persisted.
        const db = new DatabaseSync(dbPathOf(repoRoot));
        try {
            const countRow = db.prepare("SELECT COUNT(*) AS n FROM sections").get();
            expect(countRow.n).toBeGreaterThan(0);
        }
        finally {
            db.close();
        }
    });
});
@@ -0,0 +1,112 @@
1
+ // Package dedupcfg defines the Config types for the dedup subsystem.
2
+ // It is a leaf package (no imports from within the dedup tree) so that the
3
+ // indexer can import it without creating an import cycle with the top-level
4
+ // dedup facade.
5
+ //
6
+ // The YAML overlay (Default() merged with .docgov/dedup/config.yml) lives in
7
+ // the parent dedup package; this leaf only declares the shapes and defaults.
8
+ // Because that overlay deserializes raw YAML keys onto these shapes, every
9
+ // property name here keeps its EXACT YAML serialized spelling (snake_case),
10
+ // not the Go field's PascalCase identifier.
11
// headingBlacklisted reports whether heading contains any entry of
// heading_blacklist or heading_blacklist_extra as a case-insensitive substring.
//
// Entries of heading_blacklist are checked first, then those of
// heading_blacklist_extra; the first match ends the search. Either list may be
// absent (undefined or null) and is then treated as empty, so a partially
// populated config never throws here.
//
// It is the single matcher for the heading blacklist, shared by the
// canonical-selection path and the block-embedding gate so the two cannot
// drift apart. (Go: AnalyzerConfig.HeadingBlacklisted.)
//
// cfg:     analyzer config carrying the two optional string lists.
// heading: heading text to test.
// returns: true when any entry matches, false otherwise.
export function headingBlacklisted(cfg, heading) {
    const loweredHeading = heading.toLowerCase();
    const entries = [
        ...(cfg.heading_blacklist ?? []),
        ...(cfg.heading_blacklist_extra ?? []),
    ];
    return entries.some((entry) => loweredHeading.includes(entry.toLowerCase()));
}
31
// defaultConfig builds the v1 locked default configuration. (Go: Default().)
// The name is defaultConfig because `default` is a reserved word.
//
// Every call constructs new objects and arrays, so a caller may mutate the
// result without affecting later calls. Property names inside each section use
// the snake_case spelling of the YAML keys. Analyzer.heading_blacklist_extra is
// intentionally not set here.
export function defaultConfig() {
    const Markdown = {
        min_prose_words: 10,
        heading_token_min_len: 3,
        hidden_dir_prefix: ".",
        ignored_dirs: [
            ".git", "node_modules", "vendor", "dist", "build",
            ".next", ".cache", ".docgov", "dedup-poc", ".venv",
        ],
    };
    const Indexer = {
        embed_progress_threshold: 100,
        max_workers: 0,
        external_url_prefixes: ["http://", "https://", "mailto:"],
    };
    const Analyzer = {
        thresh_high: 0.93,
        thresh_maybe: 0.86,
        distinctive_abs_min: 3,
        distinctive_pct_of_headings: 0.03,
        universal_stopwords: [
            "the", "a", "an", "of", "and", "or",
            "to", "with", "for", "in", "on", "is",
            "are", "be", "by", "from", "as", "at",
        ],
        differentiators: [
            ["calendar days", "business days"],
            ["sync", "async"],
            ["create", "cancel"],
            ["source", "target"],
            ["inbound", "outbound"],
            ["success", "failure"],
            ["old flow", "new flow"],
            ["deprecated", "current"],
        ],
        path_priority: ["docs/concepts/", "docs/architecture/", "docs/design/"],
        heading_blacklist: ["related", "template rendering", "template sample"],
        path_blacklist: ["changelog", "migration", "deprecated", "old", "legacy", "temporary"],
    };
    const Report = {
        preview_chars: 280,
        preview_word_ratio: 0.6,
        wrap_cols: 72,
        separator: "---",
    };
    const Embedder = {
        batch_size: 32,
    };
    const Block = {
        min_words: 10,
        table_min_rows: 2,
        cosine_threshold: 0.95,
        multiplicity_cap: 5,
    };
    return { Markdown, Indexer, Analyzer, Report, Embedder, Block };
}
@@ -0,0 +1,70 @@
1
+ import { describe, it, expect } from "vitest";
2
+ import { defaultConfig, headingBlacklisted } from "./index.js";
3
// Pins the four default values of the Block section exactly. Per the original
// notes these drive the L5 block-level clustering layer, so any silent change
// to them would change which blocks are reported as duplicates.
describe("defaultConfig Block", () => {
    it("returns the pinned v1 block thresholds", () => {
        const { Block } = defaultConfig();
        const pinned = {
            min_words: 10,
            table_min_rows: 2,
            cosine_threshold: 0.95,
            multiplicity_cap: 5,
        };
        expect(Block).toEqual(pinned);
    });
});
17
// Pins the two default clustering thresholds to their exact values and checks
// the ordering thresh_maybe < thresh_high. A drifted threshold would re-bucket
// near-duplicates and change which sections cluster together.
describe("defaultConfig Analyzer thresholds", () => {
    it("pins thresh_high and thresh_maybe", () => {
        const { thresh_high: high, thresh_maybe: maybe } = defaultConfig().Analyzer;
        expect(high).toBe(0.93);
        expect(maybe).toBe(0.86);
        expect(maybe).toBeLessThan(high);
    });
});
29
// headingBlacklisted matches a heading against heading_blacklist and
// heading_blacklist_extra by case-insensitive substring. These cases pin that
// behavior for both lists, for a non-matching heading, for the empty heading,
// and for a config that has no heading_blacklist_extra at all.
describe("headingBlacklisted", () => {
    // A full analyzer-config shape; the assertions below depend only on the
    // two blacklist arrays.
    const analyzerCfg = {
        thresh_high: 0.93,
        thresh_maybe: 0.86,
        distinctive_abs_min: 3,
        distinctive_pct_of_headings: 0.03,
        universal_stopwords: [],
        differentiators: [],
        path_priority: [],
        heading_blacklist: ["related", "steps"],
        heading_blacklist_extra: ["examples"],
        path_blacklist: [],
    };
    const cases = [
        ["Steps", true], // whole heading, differs only in case
        ["Next Steps", true], // entry appears inside the heading
        ["Related", true], // entry from heading_blacklist
        ["Examples", true], // entry from heading_blacklist_extra
        ["Order Lifecycle", false], // nothing matches
        ["", false], // empty heading
    ];
    it.each(cases)("headingBlacklisted(%j) === %j", (input, expected) => {
        expect(headingBlacklisted(analyzerCfg, input)).toBe(expected);
    });
    // defaultConfig() leaves heading_blacklist_extra undefined; the matcher must
    // treat that as an empty list instead of throwing.
    it("treats absent heading_blacklist_extra as empty", () => {
        const { Analyzer: defaults } = defaultConfig();
        expect(defaults.heading_blacklist_extra).toBeUndefined();
        expect(headingBlacklisted(defaults, "Related")).toBe(true);
        expect(headingBlacklisted(defaults, "Order Lifecycle")).toBe(false);
    });
});
@@ -0,0 +1 @@
1
+ export { defaultConfig, headingBlacklisted } from "./config.js";
@@ -0,0 +1 @@
1
+ export {};
@@ -0,0 +1,9 @@
1
+ // Package deduptypes defines the pure-data types returned by the dedup
2
+ // analyzer. It is a leaf package (no imports from within the dedup subtree) so
3
+ // that the analyzer can import it without creating a cycle with the top-level
4
+ // dedup facade, which imports the analyzer.
5
+ //
6
+ // These are pure-data shapes (Go structs with no serialization tags): the
7
+ // field names below are the exact spellings used by indexdb when it persists
8
+ // and loads them, so they are kept verbatim.
9
+ export {};
@@ -0,0 +1,34 @@
1
+ import { describe, it, expect } from "vitest";
2
// Counterpart of Go's TestCluster_Shape. indexdb persists and reloads these
// values by their exact field names, which makes the shape the contract: a
// renamed or retyped field would silently break round-trip serialization.
// Building a value and reading each field back pins that contract.
describe("Cluster shape (indexdb persistence contract)", () => {
  it("exposes every BlockLocation and Cluster field with the expected names and types", () => {
    const location = {
      FilePath: "a.md",
      Heading: "H",
      StartLine: 1,
      EndLine: 3,
    };
    const cluster = {
      Kind: "prose",
      ContentHash: "h",
      Similarity: 0.97,
      Exact: true,
      Informational: false,
      Locations: [location],
    };

    // Scalar Cluster fields, checked in declaration order.
    const clusterExpectations = [
      ["Kind", "prose"],
      ["ContentHash", "h"],
      ["Similarity", 0.97],
      ["Exact", true],
      ["Informational", false],
    ];
    for (const [field, want] of clusterExpectations) {
      expect(cluster[field]).toBe(want);
    }

    expect(cluster.Locations).toHaveLength(1);

    // BlockLocation fields, read back from the stored element.
    const [stored] = cluster.Locations;
    const locationExpectations = [
      ["FilePath", "a.md"],
      ["Heading", "H"],
      ["StartLine", 1],
      ["EndLine", 3],
    ];
    for (const [field, want] of locationExpectations) {
      expect(stored[field]).toBe(want);
    }
  });
});
@@ -0,0 +1,23 @@
1
+ import { homedir } from "node:os";
2
+ import { join } from "node:path";
3
+ import { Model } from "./constants.js";
4
/**
 * cacheDir returns the resolved model cache directory in precedence order:
 *   1. explicit (the WithCacheDir option equivalent)
 *   2. DOCGOV_MODEL_CACHE env var
 *   3. ~/.cache/docgov/models/<sanitized-model-name>/
 *
 * The returned path is not guaranteed to exist; the caller creates it.
 *
 * @param {string} [explicit] - Caller-supplied cache directory. An empty
 *   string, `undefined` (argument omitted) or `null` all mean "not provided"
 *   and fall through to the next source.
 * @returns {string} The directory chosen by the precedence rules above.
 */
export function cacheDir(explicit) {
  // `== null`-style check on purpose: a JS caller that omits the argument
  // passes `undefined`, which must not be returned as if it were a path.
  if (explicit != null && explicit !== "") {
    return explicit;
  }
  const env = process.env.DOCGOV_MODEL_CACHE;
  // An empty env var (e.g. a stray `DOCGOV_MODEL_CACHE=`) must not shadow
  // the default location.
  if (env !== undefined && env !== "") {
    return env;
  }
  // Sanitize the model name: replace / with _ for filesystem safety.
  const sanitized = Model.replaceAll("/", "_");
  return join(homedir(), ".cache", "docgov", "models", sanitized);
}
@@ -0,0 +1,50 @@
1
+ import { homedir } from "node:os";
2
+ import { join } from "node:path";
3
+ import { afterEach, beforeEach, expect, test } from "vitest";
4
+ import { cacheDir } from "./cache.js";
5
+ import { Model } from "./constants.js";
6
// Each test starts with DOCGOV_MODEL_CACHE unset and the original value is put
// back afterwards, so tests neither leak into each other nor into the host env.
let envSnapshot;

beforeEach(() => {
  envSnapshot = process.env.DOCGOV_MODEL_CACHE;
  delete process.env.DOCGOV_MODEL_CACHE;
});

afterEach(() => {
  // Restore the captured value when there was one ...
  if (envSnapshot !== undefined) {
    process.env.DOCGOV_MODEL_CACHE = envSnapshot;
    return;
  }
  // ... otherwise make sure the variable is unset again.
  delete process.env.DOCGOV_MODEL_CACHE;
});
21
// WHY: tests hand in an explicit (temp) cache dir so CI never races on the
// shared host cache. That isolation only holds if the explicit option beats
// both the env var and the default.
test("explicit cache dir wins over env var and default", () => {
  const explicitDir = "/explicit/dir";
  process.env.DOCGOV_MODEL_CACHE = "/from/env";

  const resolved = cacheDir(explicitDir);

  expect(resolved).toBe(explicitDir);
});
28
// WHY: operators use DOCGOV_MODEL_CACHE to move the multi-hundred-MB model off
// the home partition, so it has to be honored whenever no explicit dir is set.
test("DOCGOV_MODEL_CACHE is used when no explicit dir is given", () => {
  const fromEnv = "/from/env";
  process.env.DOCGOV_MODEL_CACHE = fromEnv;

  const resolved = cacheDir("");

  expect(resolved).toBe(fromEnv);
});
34
// WHY: an empty env var must NOT shadow the default (Go checks `env != ""`).
// Otherwise a stray `DOCGOV_MODEL_CACHE=` in a shell would send the cache to
// the empty path.
test("empty DOCGOV_MODEL_CACHE falls through to the default", () => {
  process.env.DOCGOV_MODEL_CACHE = "";

  const sanitizedModel = Model.replaceAll("/", "_");
  const expectedDefault = join(homedir(), ".cache", "docgov", "models", sanitizedModel);

  expect(cacheDir("")).toBe(expectedDefault);
});
42
// WHY: the default path replaces "/" in the model name with "_". A surviving
// slash would nest a directory under models/ and break the has-onnx-file
// lookup, re-triggering the download on every run.
test("default path sanitizes the model name slash to underscore", () => {
  const defaultPath = cacheDir("");

  // Substrings the default path must contain.
  const mustContain = [
    "paraphrase-multilingual-mpnet-base-v2",
    "sentence-transformers_paraphrase-multilingual-mpnet-base-v2",
  ];

  expect(defaultPath).toContain(mustContain[0]);
  expect(defaultPath).not.toContain("sentence-transformers/");
  expect(defaultPath).toContain(mustContain[1]);
});