docsgov 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (159)
  1. package/README.md +242 -0
  2. package/dist/apispec/apispec.js +401 -0
  3. package/dist/apispec/apispec.test.js +444 -0
  4. package/dist/apispec/errors.js +17 -0
  5. package/dist/apispec/index.js +2 -0
  6. package/dist/check/doclinks.js +167 -0
  7. package/dist/check/index.js +8 -0
  8. package/dist/check/run.js +391 -0
  9. package/dist/check/run.test.js +513 -0
  10. package/dist/check/suggest.js +134 -0
  11. package/dist/check/suggest.test.js +92 -0
  12. package/dist/check/tokens.js +125 -0
  13. package/dist/cmd/main.js +330 -0
  14. package/dist/cmd/main.test.js +422 -0
  15. package/dist/codeq/cache.js +71 -0
  16. package/dist/codeq/cache.test.js +67 -0
  17. package/dist/codeq/errors.js +52 -0
  18. package/dist/codeq/grammars/tree-sitter-go.wasm +0 -0
  19. package/dist/codeq/grammars/tree-sitter-java.wasm +0 -0
  20. package/dist/codeq/grammars/tree-sitter-javascript.wasm +0 -0
  21. package/dist/codeq/grammars/tree-sitter-tsx.wasm +0 -0
  22. package/dist/codeq/grammars/tree-sitter-typescript.wasm +0 -0
  23. package/dist/codeq/index.js +11 -0
  24. package/dist/codeq/resolve.test.js +109 -0
  25. package/dist/codeq/resolver.js +128 -0
  26. package/dist/codeq/resolver.test.js +124 -0
  27. package/dist/codeq/resolvers/go.js +242 -0
  28. package/dist/codeq/resolvers/go.test.js +143 -0
  29. package/dist/codeq/resolvers/java.js +349 -0
  30. package/dist/codeq/resolvers/java.test.js +138 -0
  31. package/dist/codeq/resolvers/java_queries.js +63 -0
  32. package/dist/codeq/resolvers/javascript.js +412 -0
  33. package/dist/codeq/resolvers/javascript.test.js +125 -0
  34. package/dist/codeq/resolvers/javascript_queries.js +46 -0
  35. package/dist/codeq/resolvers/typescript.js +366 -0
  36. package/dist/codeq/resolvers/typescript.test.js +180 -0
  37. package/dist/codeq/resolvers/typescript_queries.js +78 -0
  38. package/dist/codeq/signature.js +50 -0
  39. package/dist/codeq/signature.test.js +50 -0
  40. package/dist/codeq/suggest.js +96 -0
  41. package/dist/codeq/treesitter.js +122 -0
  42. package/dist/codeq/treesitter.test.js +118 -0
  43. package/dist/config/config.js +74 -0
  44. package/dist/config/config.test.js +98 -0
  45. package/dist/config/fs.js +116 -0
  46. package/dist/config/glob.js +82 -0
  47. package/dist/config/glob.test.js +61 -0
  48. package/dist/config/index.js +4 -0
  49. package/dist/dedup/analyzer/analyzer.js +533 -0
  50. package/dist/dedup/analyzer/analyzer.test.js +530 -0
  51. package/dist/dedup/analyzer/canonical.js +74 -0
  52. package/dist/dedup/analyzer/canonical.test.js +70 -0
  53. package/dist/dedup/analyzer/cosine_clusters.js +169 -0
  54. package/dist/dedup/analyzer/cosine_clusters.test.js +131 -0
  55. package/dist/dedup/analyzer/distinctive.js +85 -0
  56. package/dist/dedup/analyzer/distinctive.test.js +49 -0
  57. package/dist/dedup/analyzer/exact_clusters.js +63 -0
  58. package/dist/dedup/analyzer/exact_clusters.test.js +81 -0
  59. package/dist/dedup/analyzer/index.js +14 -0
  60. package/dist/dedup/analyzer/multiplicity.js +110 -0
  61. package/dist/dedup/analyzer/multiplicity.test.js +123 -0
  62. package/dist/dedup/analyzer/order.js +22 -0
  63. package/dist/dedup/analyzer/partial_overlaps.js +65 -0
  64. package/dist/dedup/analyzer/partial_overlaps.test.js +161 -0
  65. package/dist/dedup/analyzer/preview.js +84 -0
  66. package/dist/dedup/analyzer/preview.test.js +46 -0
  67. package/dist/dedup/analyzer/safety.js +27 -0
  68. package/dist/dedup/analyzer/safety.test.js +39 -0
  69. package/dist/dedup/config.js +18 -0
  70. package/dist/dedup/configload.js +299 -0
  71. package/dist/dedup/configload.test.js +410 -0
  72. package/dist/dedup/dedup.index.test.js +203 -0
  73. package/dist/dedup/dedup.js +143 -0
  74. package/dist/dedup/dedup.test.js +212 -0
  75. package/dist/dedup/dedupcfg/config.js +112 -0
  76. package/dist/dedup/dedupcfg/config.test.js +70 -0
  77. package/dist/dedup/dedupcfg/index.js +1 -0
  78. package/dist/dedup/deduptypes/index.js +1 -0
  79. package/dist/dedup/deduptypes/types.js +9 -0
  80. package/dist/dedup/deduptypes/types.test.js +34 -0
  81. package/dist/dedup/embedder/cache.js +23 -0
  82. package/dist/dedup/embedder/cache.test.js +50 -0
  83. package/dist/dedup/embedder/constants.js +10 -0
  84. package/dist/dedup/embedder/embedder.js +76 -0
  85. package/dist/dedup/embedder/embedder.mock.test.js +128 -0
  86. package/dist/dedup/embedder/embedder.test.js +96 -0
  87. package/dist/dedup/embedder/errors.js +20 -0
  88. package/dist/dedup/embedder/errors.test.js +35 -0
  89. package/dist/dedup/embedder/index.js +4 -0
  90. package/dist/dedup/embedder/session.js +78 -0
  91. package/dist/dedup/embedder/session.test.js +172 -0
  92. package/dist/dedup/gitignore.js +97 -0
  93. package/dist/dedup/gitignore.test.js +98 -0
  94. package/dist/dedup/index.js +11 -0
  95. package/dist/dedup/indexdb/errors.js +48 -0
  96. package/dist/dedup/indexdb/index.js +6 -0
  97. package/dist/dedup/indexdb/indexdb.js +302 -0
  98. package/dist/dedup/indexdb/indexdb.test.js +739 -0
  99. package/dist/dedup/indexdb/load.js +110 -0
  100. package/dist/dedup/indexdb/migrations.js +58 -0
  101. package/dist/dedup/indexdb/schema.js +83 -0
  102. package/dist/dedup/indexer/index.js +9 -0
  103. package/dist/dedup/indexer/indexer.js +501 -0
  104. package/dist/dedup/indexer/indexer.test.js +510 -0
  105. package/dist/dedup/indexer/links.js +89 -0
  106. package/dist/dedup/mdsection/anchor.js +60 -0
  107. package/dist/dedup/mdsection/anchor.test.js +39 -0
  108. package/dist/dedup/mdsection/blocks.js +409 -0
  109. package/dist/dedup/mdsection/blocks.test.js +359 -0
  110. package/dist/dedup/mdsection/index.js +4 -0
  111. package/dist/dedup/mdsection/parse.js +21 -0
  112. package/dist/dedup/mdsection/section.js +234 -0
  113. package/dist/dedup/mdsection/section.test.js +221 -0
  114. package/dist/dedup/report/floatfmt.js +71 -0
  115. package/dist/dedup/report/floatfmt.test.js +42 -0
  116. package/dist/dedup/report/index.js +8 -0
  117. package/dist/dedup/report/quote.js +77 -0
  118. package/dist/dedup/report/quote.test.js +67 -0
  119. package/dist/dedup/report/text.js +251 -0
  120. package/dist/dedup/report/text.test.js +420 -0
  121. package/dist/dedup/report_types.js +8 -0
  122. package/dist/dedup/sectionid/index.js +1 -0
  123. package/dist/dedup/sectionid/sectionid.js +16 -0
  124. package/dist/dedup/sectionid/sectionid.test.js +49 -0
  125. package/dist/guard/api/errors.js +12 -0
  126. package/dist/guard/api/index.js +2 -0
  127. package/dist/guard/api/parser.js +81 -0
  128. package/dist/guard/api/parser.test.js +58 -0
  129. package/dist/guard/api/types.js +1 -0
  130. package/dist/guard/code/errors.js +16 -0
  131. package/dist/guard/code/index.js +2 -0
  132. package/dist/guard/code/parser.js +54 -0
  133. package/dist/guard/code/parser.test.js +111 -0
  134. package/dist/guard/code/types.js +6 -0
  135. package/dist/index.js +1 -0
  136. package/dist/index.test.js +5 -0
  137. package/dist/repo/boundary.js +92 -0
  138. package/dist/repo/boundary.test.js +65 -0
  139. package/dist/repo/errors.js +56 -0
  140. package/dist/repo/errors.test.js +85 -0
  141. package/dist/repo/exists.test.js +72 -0
  142. package/dist/repo/filename.js +46 -0
  143. package/dist/repo/filename.test.js +39 -0
  144. package/dist/repo/fs.js +53 -0
  145. package/dist/repo/index.js +7 -0
  146. package/dist/repo/overlay.js +36 -0
  147. package/dist/repo/overlay.test.js +80 -0
  148. package/dist/repo/repo.js +353 -0
  149. package/dist/repo/repo.test.js +255 -0
  150. package/dist/repo/testutil.js +27 -0
  151. package/dist/repo/write.test.js +125 -0
  152. package/dist/report/color.js +73 -0
  153. package/dist/report/index.js +1 -0
  154. package/dist/report/report.js +112 -0
  155. package/dist/report/report.test.js +368 -0
  156. package/dist/violation/index.js +1 -0
  157. package/dist/violation/types.js +22 -0
  158. package/dist/violation/types.test.js +70 -0
  159. package/package.json +48 -0
@@ -0,0 +1,530 @@
1
+ // Ported from the TestAnalyze_* cases in internal/dedup/analyzer/analyzer_test.go.
2
+ //
3
+ // These are the end-to-end spec for the layered pipeline: they pin which layer
4
+ // fires (L1 exact / L2 promotion / L3 cosine HIGH-vs-MAYBE / L4 differentiator /
5
+ // L5 blocks), how confidence and recommended action are derived, when groups and
6
+ // pairs are dropped, and that block-level overlaps are wired through. The
7
+ // per-primitive behavior (clusters, canonical rank, preview, distinctive tokens,
8
+ // differentiators, multiplicity, suppression) is covered by the sibling
9
+ // *.test.ts files; this file exercises the orchestrator that composes them.
10
+ import { describe, it, expect } from "vitest";
11
+ import { defaultConfig } from "../dedupcfg/index.js";
12
+ import { analyze } from "./analyzer.js";
13
+ const cfg = defaultConfig();
14
+ // makeSection builds a minimal Section (TS snake_case shape) mirroring the Go
15
+ // makeSection helper: start_line=1 (Go StartLine), end_line=10 (Go EndLine), prose_word_count=15.
16
+ function makeSection(id, filePath, heading, level, anchor, contentHash, rawContent, inbound) { // fixture builder: the eight positional args fill the per-test fields; every other field is a fixed constant
17
+ return {
18
+ id,
19
+ file_path: filePath,
20
+ heading,
21
+ heading_level: level,
22
+ anchor,
23
+ start_line: 1, // fixed: every fixture section spans lines 1-10
24
+ end_line: 10,
25
+ content_hash: contentHash,
26
+ raw_content: rawContent,
27
+ embed_text: heading, // the heading doubles as the embed text; rawContent is not used for it
28
+ prose_word_count: 15, // fixed constant, not derived from rawContent
29
+ has_table: false,
30
+ has_code: false,
31
+ inbound_count: inbound, // every call visible in this part of the file passes 0
32
+ };
33
+ }
34
+ // makeUnitVec2D returns the 2D unit vector in the direction (x, y); the analyzer
35
+ // treats embeddings as L2-normalized so the dot product equals cosine.
36
/**
 * Returns the 2D unit vector pointing in the direction (x, y).
 *
 * The norm is computed with Math.hypot instead of Math.sqrt(x * x + y * y):
 * squaring the components by hand overflows to Infinity for very large inputs
 * and underflows to 0 for very small ones, which made a non-zero direction
 * come back as [0, 0]. Math.hypot avoids that intermediate overflow/underflow.
 *
 * @param {number} x - First component of the direction.
 * @param {number} y - Second component of the direction.
 * @returns {number[]} [x / |v|, y / |v|], or [0, 0] when the norm is exactly 0.
 */
function makeUnitVec2D(x, y) {
  const norm = Math.hypot(x, y);
  // A zero vector has no direction; return it unchanged instead of dividing by 0.
  if (norm === 0) {
    return [0, 0];
  }
  return [x / norm, y / norm];
}
43
+ // emb builds the section-id -> vector map the analyzer consumes.
44
+ function emb(entries) { // entries: plain object whose own enumerable keys become the Map keys (always strings)
45
+ return new Map(Object.entries(entries)); // Object.entries throws on null/undefined, so pass {} for "no embeddings"
46
+ }
47
+ // L1 must union sections that share a content_hash (cross-file) into a HIGH
48
+ // group and mark the duplicate ExactMatch, with no MAYBE pair — exact identity
49
+ // is the strongest signal and short-circuits cosine.
50
+ describe("analyze L1 exact grouping", () => {
50
+ it("groups equal-hash cross-file sections as HIGH with ExactMatch and no MAYBE pairs", () => {
51
+ const hash = "abcdef1234567890";
52
+ const raw = "## Overview\nSome content here for testing.";
53
+ const sA = makeSection("id1", "docs/a.md", "Overview", 2, "overview", hash, raw, 0);
54
+ const sB = makeSection("id2", "docs/b.md", "Overview", 2, "overview", hash, raw, 0); // same as sA except id and file path
55
+ const v = makeUnitVec2D(1, 0);
56
+ const report = analyze([sA, sB], emb({ id1: v, id2: v.slice() }), null, null, cfg); // v.slice(): id2 gets its own copy of the vector; the two nulls are the block list and block embeddings
57
+ expect(report.HighGroups.length).toBeGreaterThan(0);
58
+ expect(report.MaybePairs).toHaveLength(0);
59
+ const grp = report.HighGroups[0];
60
+ expect(grp.Members.length).toBeGreaterThanOrEqual(1); // loose lower bound only; the exact member count is not pinned here
61
+ // L1 sets ExactMatch on the duplicate member.
62
+ expect(grp.Members.some((m) => m.ExactMatch)).toBe(true);
63
+ });
64
+ it("groups same-file equal-hash sections too (L1 does not skip same-file)", () => {
65
+ const hash = "samehash1234567890";
66
+ const sA = makeSection("id1", "docs/a.md", "Overview", 2, "overview", hash, "## Overview\nSome test content.", 0);
67
+ const sB = makeSection("id2", "docs/a.md", "Summary", 3, "summary", hash, "## Summary\nSome test content.", 0); // same file as sA; heading, level and anchor differ but content_hash is shared
68
+ const v = makeUnitVec2D(1, 0);
69
+ const report = analyze([sA, sB], emb({ id1: v, id2: v.slice() }), null, null, cfg);
70
+ expect(report.HighGroups.length).toBeGreaterThan(0);
71
+ });
72
+ });
74
+ // L3 turns a high-cosine cross-file pair into a HIGH group, but L3 SKIPS
75
+ // same-file pairs entirely (neither HIGH nor MAYBE), and a mid-band cosine
76
+ // without a shared distinctive heading token stays a MAYBE pair (never unioned).
77
+ describe("analyze L3 cosine grouping", () => {
78
+ it("groups a cross-file cosine>=thresh_high pair as HIGH", () => {
79
+ const raw = "## Order Lifecycle\nThe order transitions through states.";
80
+ const sA = makeSection("id1", "docs/a.md", "Order Lifecycle", 2, "order-lifecycle", "hash1", raw, 0);
81
+ const sB = makeSection("id2", "docs/b.md", "Order Lifecycle", 2, "order-lifecycle", "hash2", raw, 0); // different content_hash from sA, so the exact-hash layer cannot be what groups them
82
+ const v = makeUnitVec2D(0.6, 0.8); // identical directions -> cosine 1.0
83
+ const report = analyze([sA, sB], emb({ id1: v, id2: v.slice() }), null, null, cfg);
84
+ expect(report.HighGroups.length).toBeGreaterThan(0);
85
+ });
86
+ it("skips a same-file pair entirely (no HIGH group, no MAYBE pair)", () => {
87
+ const sA = makeSection("id1", "docs/a.md", "Order Lifecycle", 2, "order-lifecycle", "hash1", "## Order Lifecycle\nThe order transitions through states.", 0);
88
+ const sB = makeSection("id2", "docs/a.md", "Order States", 2, "order-states", "hash2", "## Order States\nThe order state machine details.", 0); // same file as sA, different hash
89
+ const v = makeUnitVec2D(0.6, 0.8);
90
+ const report = analyze([sA, sB], emb({ id1: v, id2: v.slice() }), null, null, cfg); // identical vectors again, so only the same-file rule can explain the empty result
91
+ expect(report.HighGroups).toHaveLength(0);
92
+ expect(report.MaybePairs).toHaveLength(0);
93
+ });
94
+ it("reports a mid-band cosine without shared distinctive token as a MAYBE pair, not a group", () => {
95
+ // Headings "Uploading Documents" / "Downloading Reports" share no distinctive
96
+ // token, so no L2 promotion; cosine ~0.90 sits in [thresh_maybe, thresh_high).
97
+ const sA = makeSection("id1", "docs/a.md", "Uploading Documents", 2, "uploading-documents", "hash1", "## Uploading Documents\nContent about uploading various files.", 0);
98
+ const sB = makeSection("id2", "docs/b.md", "Downloading Reports", 2, "downloading-reports", "hash2", "## Downloading Reports\nContent about downloading various files.", 0);
99
+ const theta = 0.451; // arccos(0.90)
100
+ const v1 = [1.0, 0.0];
101
+ const v2 = [Math.cos(theta), Math.sin(theta)]; // unit vector at angle theta from v1, so v1·v2 = cos(theta) ~= 0.90
102
+ const report = analyze([sA, sB], emb({ id1: v1, id2: v2 }), null, null, cfg);
103
+ expect(report.MaybePairs.length).toBeGreaterThan(0);
104
+ expect(report.HighGroups).toHaveLength(0);
105
+ });
106
+ });
107
+ // L4 differentiators NEVER demote the confidence tier — they only force the
108
+ // recommended action to manual_review. A cosine=1.0 sync/async pair must stay
109
+ // confidence="high" but action="manual_review".
110
+ describe("analyze L4 differentiator", () => {
111
+ it("keeps confidence high but forces action=manual_review when a differentiator is present", () => {
112
+ const sA = makeSection("id1", "docs/a.md", "Sync Processing", 2, "sync-processing", "hash1", "## Sync Processing\nThis section covers sync operations in detail.", 0);
113
+ const sB = makeSection("id2", "docs/b.md", "Async Processing", 2, "async-processing", "hash2", "## Async Processing\nThis section covers async operations in detail.", 0); // apart from id/path/hash, differs from sA only by "async" vs "sync"
114
+ const v = makeUnitVec2D(0.6, 0.8);
115
+ const report = analyze([sA, sB], emb({ id1: v, id2: v.slice() }), null, null, cfg); // same vector for both sections -> cosine 1.0
116
+ expect(report.HighGroups.length).toBeGreaterThan(0);
117
+ const grp = report.HighGroups[0];
118
+ expect(grp.Confidence).toBe("high");
119
+ expect(grp.Action).toBe("manual_review");
120
+ });
121
+ });
122
+ // A group whose every member is disqualified (here both headings are the
123
+ // blacklisted "Related") carries no actionable canonical, so it is dropped from
124
+ // the report entirely.
125
+ describe("analyze disqualified group filtering", () => {
126
+ it("drops a group where every member is disqualified", () => {
127
+ const hash = "disqhash12345678";
128
+ const raw = "## Related\nSee also other sections.";
129
+ const sA = makeSection("id1", "docs/a.md", "Related", 2, "related", hash, raw, 0);
130
+ const sB = makeSection("id2", "docs/b.md", "Related", 2, "related", hash, raw, 0); // same hash and same "Related" heading as sA, different file
131
+ const v = makeUnitVec2D(1, 0);
132
+ const report = analyze([sA, sB], emb({ id1: v, id2: v.slice() }), null, null, cfg);
133
+ expect(report.HighGroups).toHaveLength(0); // equal hashes and identical vectors, yet no group is reported
134
+ });
135
+ });
136
+ // Confidence/action mapping: an all-exact (or all-HIGH) group with no
137
+ // differentiator yields confidence="high" and action="replace_with_reference".
138
+ describe("analyze confidence mapping", () => {
139
+ it("maps an all-exact group to high + replace_with_reference", () => {
140
+ const hash = "highconfhash1234";
141
+ const raw = "## Lifecycle\nLong content here for canonical test.";
142
+ const sA = makeSection("id1", "docs/concepts/a.md", "Lifecycle", 2, "lifecycle", hash, raw, 0);
143
+ const sB = makeSection("id2", "docs/design/b.md", "Lifecycle", 2, "lifecycle", hash, raw, 0);
144
+ const v = makeUnitVec2D(1, 0);
145
+ const report = analyze([sA, sB], emb({ id1: v, id2: v.slice() }), null, null, cfg);
146
+ expect(report.HighGroups.length).toBeGreaterThan(0);
147
+ const grp = report.HighGroups[0]; // which section is elected canonical is not asserted in this test
148
+ expect(grp.Confidence).toBe("high");
149
+ expect(grp.Action).toBe("replace_with_reference");
150
+ });
151
+ });
152
+ // Member previews are pre-computed by the orchestrator (so the renderer never
153
+ // re-derives them): the canonical's Preview is non-empty and has the heading
154
+ // line stripped.
155
+ describe("analyze member preview", () => {
156
+ it("pre-computes the canonical member preview with the heading line stripped", () => {
157
+ const hash = "prevhash12345678";
158
+ const raw = "## Lifecycle\nThis is the detailed content of this lifecycle section.";
159
+ const sA = makeSection("id1", "docs/a.md", "Lifecycle", 2, "lifecycle", hash, raw, 0);
160
+ const sB = makeSection("id2", "docs/b.md", "Lifecycle", 2, "lifecycle", hash, raw, 0);
161
+ const v = makeUnitVec2D(1, 0);
162
+ const report = analyze([sA, sB], emb({ id1: v, id2: v.slice() }), null, null, cfg);
163
+ expect(report.HighGroups.length).toBeGreaterThan(0);
164
+ const grp = report.HighGroups[0];
165
+ expect(grp.Canonical.Preview).not.toBe(""); // only rules out the empty string; the preview text itself is not pinned
166
+ expect(grp.Canonical.Preview).not.toContain("## Lifecycle"); // the heading line of raw must not leak into the preview
167
+ });
168
+ });
169
+ // L5 wiring: a verbatim cross-file block surfaces as ONE exact PartialOverlap
170
+ // cluster and is NOT re-reported as a cosine cluster (exact-pass excludeHashes
171
+ // suppresses the duplicate). Passing null (Go: nil) blocks yields an empty PartialOverlaps
172
+ // without disturbing the L1-L4 output.
173
+ describe("analyze L5 partial overlaps", () => {
174
+ it("populates PartialOverlaps from a verbatim block and dedups exact-vs-cosine", () => {
175
+ const sA = makeSection("secA", "docs/a.md", "Overview", 2, "overview", "secHash12345678", "## Overview\nSome unique content here.", 0);
176
+ const sB = makeSection("secB", "docs/b.md", "Details", 2, "details", "differentHash1", "## Details\nCompletely different content.", 0);
177
+ const v = makeUnitVec2D(1, 0);
178
+ const vB = makeUnitVec2D(0, 1); // orthogonal -> no L3 match
179
+ const sectionEmb = emb({ secA: v, secB: vB });
180
+ const verbatimHash = "verbatim-block-hash-xyz";
181
+ const blocks = [ // mkBlock is not defined in this part of the file; presumably a fixture helper taking (sectionId, filePath, heading, kind, startLine, endLine, contentHash) — TODO confirm
182
+ mkBlock("secA", "docs/a.md", "Overview", "prose", 3, 6, verbatimHash),
183
+ mkBlock("secB", "docs/b.md", "Details", "prose", 4, 7, verbatimHash), // same hash as the secA block, different file
184
+ ];
185
+ // verbatimHash also has a block embedding: it would be a cosine candidate
186
+ // were it not added to excludeHashes by the exact pass.
187
+ const blockEmb = emb({ [verbatimHash]: makeUnitVec2D(0.5, 0.5) }); // block embeddings are keyed by content hash here, not by section id
188
+ const rep = analyze([sA, sB], sectionEmb, blocks, blockEmb, cfg);
189
+ expect(rep.PartialOverlaps).toHaveLength(1);
190
+ const cl = rep.PartialOverlaps[0];
191
+ expect(cl.Exact).toBe(true);
192
+ expect(cl.Kind).toBe("prose");
193
+ expect(cl.ContentHash).toBe(verbatimHash);
194
+ expect(cl.Locations).toHaveLength(2); // one location per block passed in above
195
+ // The verbatim block appears exactly once as exact, never as cosine.
196
+ const exactCount = rep.PartialOverlaps.filter((c) => c.ContentHash === verbatimHash && c.Exact).length;
197
+ const cosineCount = rep.PartialOverlaps.filter((c) => c.ContentHash === verbatimHash && !c.Exact).length;
198
+ expect(exactCount).toBe(1);
199
+ expect(cosineCount).toBe(0);
200
+ });
201
+ it("yields empty PartialOverlaps for nil blocks without disturbing L1-L4 output", () => {
202
+ const hash = "noBlockHash12345";
203
+ const raw = "## Lifecycle\nContent for lifecycle section testing.";
204
+ const sA = makeSection("id1", "docs/a.md", "Lifecycle", 2, "lifecycle", hash, raw, 0);
205
+ const sB = makeSection("id2", "docs/b.md", "Lifecycle", 2, "lifecycle", hash, raw, 0);
206
+ const v = makeUnitVec2D(1, 0);
207
+ const rep = analyze([sA, sB], emb({ id1: v, id2: v.slice() }), null, null, cfg); // null, null = no blocks and no block embeddings
208
+ expect(rep.PartialOverlaps).toHaveLength(0);
209
+ expect(rep.HighGroups.length).toBeGreaterThan(0); // the equal-hash sections are still grouped
210
+ expect(rep.MaybePairs).toHaveLength(0);
211
+ });
212
+ });
213
+ // Fewer than two sections cannot form a pair or group, so Analyze must
214
+ // short-circuit to a fully-empty report before doing any embedding work. Pinned
215
+ // because the rest of the pipeline assumes n>=2 (pair loops, union-find).
216
+ describe("analyze trivial input", () => {
217
+ // WHY: a single section can never duplicate anything; returning early avoids
218
+ // wasted work and guarantees an empty, well-formed report.
219
+ it("returns an empty report for fewer than two sections", () => {
220
+ const s = makeSection("only", "docs/a.md", "Solo", 2, "solo", "h", "## Solo\nx", 0);
221
+ const rep = analyze([s], emb({ only: makeUnitVec2D(1, 0) }), null, null, cfg);
222
+ expect(rep.HighGroups).toHaveLength(0); // toHaveLength also proves each field is present, not undefined
223
+ expect(rep.MaybePairs).toHaveLength(0);
224
+ expect(rep.PartialOverlaps).toHaveLength(0);
225
+ });
226
+ // WHY: zero sections is the degenerate boundary; it must not throw and must
227
+ // yield an empty report (same n<2 guard).
228
+ it("returns an empty report for zero sections", () => {
229
+ const rep = analyze([], new Map(), null, null, cfg); // empty section list with an empty embedding map
230
+ expect(rep.HighGroups).toHaveLength(0);
231
+ expect(rep.MaybePairs).toHaveLength(0);
232
+ expect(rep.PartialOverlaps).toHaveLength(0);
233
+ });
234
+ });
235
+ // L2 promotion is the layer that rescues a mid-band cosine pair from MAYBE to
236
+ // HIGH when the headings share a distinctive token. Without it, near-duplicate
237
+ // sections with telltale shared headings would be under-reported as mere pairs.
238
+ describe("analyze L2 heading promotion", () => {
239
+ // WHY: a [thresh_maybe, thresh_high) cosine WITH a shared distinctive heading
240
+ // token must be PROMOTED to a HIGH group (not left a MAYBE pair). This pins the
241
+ // promotion branch that the heading signal triggers.
242
+ it("promotes a mid-band cosine pair with a shared distinctive heading token to HIGH", () => {
243
+ // Three shared distinctive tokens (zebra, payment, apple) so the reason
244
+ // builder must SORT them lexicographically (apple, payment, zebra),
245
+ // exercising both directions of the string comparator.
246
+ const sA = makeSection("id1", "docs/a.md", "Zebra Payment Apple", 2, "zebra-payment-apple", "h1", "## Zebra Payment Apple\nReconcile incoming payments daily here.", 0);
247
+ const sB = makeSection("id2", "docs/b.md", "Zebra Payment Apple", 2, "zebra-payment-apple", "h2", "## Zebra Payment Apple\nReconcile the incoming payments daily here.", 0); // same heading as sA; hash and body differ, so this is not an exact-hash match
248
+ // cosine ~0.90 sits inside [0.86, 0.93); the shared distinctive tokens
249
+ // (freq 2 <= distinctive threshold 3) drive the promotion.
250
+ const theta = Math.acos(0.9); // exact arccos here; other tests in this file hard-code 0.451
251
+ const v1 = [1, 0];
252
+ const v2 = [Math.cos(theta), Math.sin(theta)]; // v1·v2 = cos(theta) = 0.9
253
+ const rep = analyze([sA, sB], emb({ id1: v1, id2: v2 }), null, null, cfg);
254
+ expect(rep.HighGroups.length).toBeGreaterThan(0);
255
+ expect(rep.MaybePairs).toHaveLength(0);
256
+ // The HIGH reason set records the shared distinctive heading tokens, sorted.
257
+ const grp = rep.HighGroups[0];
258
+ const allReasons = grp.Members.flatMap((m) => m.Reasons).join(" "); // flatten every member's Reasons into one searchable string
259
+ expect(allReasons).toContain("shared distinctive heading tokens: apple, payment, zebra");
260
+ });
261
+ });
262
+ // A custom heading_token_min_len of 0 forces the analyzer to fall back to the
263
+ // default minimum (3). Pinned because the fallback guards against a misconfigured
264
+ // length that would otherwise admit one- and two-letter heading tokens.
265
+ describe("analyze min-token-len fallback", () => {
266
+ // WHY: heading_token_min_len <= 0 must be coerced to 3, so the distinctive
267
+ // filter behaves identically to the default. We assert grouping still works,
268
+ // exercising the <=0 fallback path.
269
+ it("coerces a non-positive heading_token_min_len to the default", () => {
270
+ const custom = defaultConfig();
271
+ custom.Markdown.heading_token_min_len = 0; // assumes defaultConfig() returns a fresh object per call, so the shared cfg is untouched — TODO confirm
272
+ const hash = "minlenhash123456";
273
+ const raw = "## Reconciliation\nSome content for the fallback test here.";
274
+ const sA = makeSection("id1", "docs/a.md", "Reconciliation", 2, "reconciliation", hash, raw, 0);
275
+ const sB = makeSection("id2", "docs/b.md", "Reconciliation", 2, "reconciliation", hash, raw, 0);
276
+ const v = makeUnitVec2D(1, 0);
277
+ const rep = analyze([sA, sB], emb({ id1: v, id2: v.slice() }), null, null, custom); // the only test in this part of the file that passes a non-default config
278
+ expect(rep.HighGroups.length).toBeGreaterThan(0); // NOTE(review): both sections share a content_hash, so this may pass via exact matching alone — confirm it really depends on the min-len fallback
279
+ });
280
+ });
281
+ // L1 must union three sections that share a content_hash into ONE group; the
282
+ // third union sees an already-merged root, exercising the no-op union branch.
283
+ describe("analyze L1 three-way exact", () => {
284
+ // WHY: three equal-hash cross-file sections collapse into a single HIGH group,
285
+ // not three pairwise groups. This pins union-find idempotence (the third
286
+ // union(ra===rb) is a no-op) and the all-exact -> high confidence mapping.
287
+ it("collapses three equal-hash sections into one HIGH group", () => {
288
+ const hash = "triplehash123456";
289
+ const raw = "## Shared\nThe very same content repeated across three docs.";
290
+ const sA = makeSection("id1", "docs/a.md", "Shared", 2, "shared", hash, raw, 0);
291
+ const sB = makeSection("id2", "docs/b.md", "Shared", 2, "shared", hash, raw, 0);
292
+ const sC = makeSection("id3", "docs/c.md", "Shared", 2, "shared", hash, raw, 0); // three sections identical except for id and file path
293
+ const v = makeUnitVec2D(1, 0);
294
+ const rep = analyze([sA, sB, sC], emb({ id1: v, id2: v.slice(), id3: v.slice() }), null, null, cfg);
295
+ expect(rep.HighGroups).toHaveLength(1);
296
+ // canonical + 2 duplicate members = 3 sections in one group.
297
+ expect(rep.HighGroups[0].Members).toHaveLength(2); // Members excludes the canonical, hence 2 rather than 3
298
+ expect(rep.HighGroups[0].Confidence).toBe("high");
299
+ });
300
+ });
301
+ // A transitive HIGH group is built when A~B and B~C cross the high bar but A~C is
302
+ // below thresh_maybe (no recorded pair). The canonical's link to the far member
303
+ // must be reconstructed via the best transitive pair — this is the contract that
304
+ // keeps a distant group member's similarity/reason non-empty.
305
+ describe("analyze transitive group", () => {
306
+ // WHY: when the canonical has no DIRECT recorded pair with a transitively
307
+ // grouped member, buildGroup must fall back to the best pair among earlier
308
+ // members (findBestPairInfo) rather than emitting a 0-similarity bare member.
309
+ // A regression that dropped this fallback would mislabel real duplicates.
310
+ it("reconstructs a transitive member's similarity from the best intermediate pair", () => {
311
+ // A is the best canonical (docs/concepts/ has top path priority). B ranks
312
+ // before C (longer raw_content -> better NegLen) so sorted = [A, B, C].
313
+ // A·B = 0.95 (HIGH, unioned). B·C = 0.94 (HIGH, unioned). A·C = 0.80
314
+ // (< thresh_maybe, NOT recorded) -> C is only transitively in the group, with
315
+ // no direct (A,C) pair. Headings share no token (no L2 promotion noise).
316
+ const sA = makeSection("idA", "docs/concepts/a.md", "Alpha", 2, "alpha", "hA", "## Alpha\nAlpha content here.", 0);
317
+ const sB = makeSection("idB", "docs/guides/b.md", "Bravo", 2, "bravo", "hB", "## Bravo\nBravo content here padded longer to win NegLen tiebreak.", 0);
318
+ const sC = makeSection("idC", "docs/guides/c.md", "Charlie", 2, "charlie", "hC", "## Charlie\nShort.", 0);
319
+ const vA = [1, 0, 0]; // 3D vectors here, unlike the 2D fixtures used by the other tests
320
+ const vB = [0.95, Math.sqrt(1 - 0.95 * 0.95), 0]; // unit vector in the xy-plane with A·B = 0.95
321
+ const vC = [0.8, 0.5765, 0.166]; // |vC|~=1; A·C=0.80, B·C~=0.94
322
+ const rep = analyze([sA, sB, sC], emb({ idA: vA, idB: vB, idC: vC }), null, null, cfg);
323
+ expect(rep.HighGroups).toHaveLength(1);
324
+ const grp = rep.HighGroups[0];
325
+ // A wins canonical via path priority.
326
+ expect(grp.Canonical.SectionID).toBe("idA");
327
+ expect(grp.Members).toHaveLength(2);
328
+ // The transitive member (idC) has no direct (A,C) pair; buildGroup falls back
329
+ // to the best pair among earlier members and finds (B,C), so idC carries that
330
+ // pair's similarity (not the 0.0 placeholder).
331
+ const cMember = grp.Members.find((m) => m.SectionID === "idC");
332
+ expect(cMember.Similarity).toBeGreaterThan(0.9); // NOTE(review): if idC is missing, cMember is undefined and this throws a TypeError instead of failing an assertion
333
+ });
334
+ // WHY: when ALL three pairwise links are recorded and one is a plain MAYBE
335
+ // (mid-band, no shared token) yet the trio is still grouped via promotion/high,
336
+ // the group confidence must drop to "medium" (its weakest internal pair is not
337
+ // high/promoted). This pins the medium-tier mapping the renderer keys on.
338
+ it("downgrades a group with a non-high internal pair to medium confidence", () => {
339
+ // A·B = 0.94 (HIGH). B·C = 0.90 + shared "settlement" -> promoted (HIGH).
340
+ // A·C = 0.90, NO shared distinctive token -> plain MAYBE (recorded, internal).
341
+ const sA = makeSection("idA", "docs/a.md", "Payment Reconciliation", 2, "payment-reconciliation", "hA", "## Payment Reconciliation\nReconcile payments across the ledger.", 0);
342
+ const sB = makeSection("idB", "docs/b.md", "Payment Settlement", 2, "payment-settlement", "hB", "## Payment Settlement\nSettle payments across the ledger.", 0);
343
+ const sC = makeSection("idC", "docs/c.md", "Settlement Workflow", 2, "settlement-workflow", "hC", "## Settlement Workflow\nWorkflow that drives settlement.", 0);
344
+ // vA=(1,0,0); A·B=0.94; A·C=0.90; B·C=0.90.
345
+ const vA = [1, 0, 0];
346
+ const vB = [0.94, Math.sqrt(1 - 0.94 * 0.94), 0]; // (0.94, 0.34117..., 0)
347
+ // Solve vC: c1=0.90; B·C=0.94*0.90 + 0.34117*c2 = 0.90 -> c2=0.15828;
348
+ // c3=sqrt(1 - 0.90^2 - 0.15828^2).
349
+ const c1 = 0.9;
350
+ const c2 = (0.9 - 0.94 * 0.9) / Math.sqrt(1 - 0.94 * 0.94); // (target B·C - vB[0]*c1) / vB[1]
351
+ const c3 = Math.sqrt(1 - c1 * c1 - c2 * c2); // remaining component so that |vC| = 1
352
+ const vC = [c1, c2, c3];
353
+ const rep = analyze([sA, sB, sC], emb({ idA: vA, idB: vB, idC: vC }), null, null, cfg);
354
+ expect(rep.HighGroups).toHaveLength(1);
355
+ expect(rep.HighGroups[0].Confidence).toBe("medium");
356
+ expect(rep.HighGroups[0].Action).toBe("manual_review");
357
+ // The plain-MAYBE (A,C) pair is internal to the group, so it is NOT also
358
+ // emitted as a standalone MAYBE pair.
359
+ expect(rep.MaybePairs).toHaveLength(0);
360
+ });
361
+ });
362
+ // Possible (MAYBE) pairs are filtered: a pair whose BOTH endpoints are
363
+ // disqualified carries no actionable canonical and must be dropped before it
364
+ // reaches the report. Pinned because surfacing an un-actionable pair wastes
365
+ // reviewer time.
366
+ describe("analyze MAYBE pair filtering", () => {
367
+ // WHY: both sections sit on blacklisted (deprecated) paths, so even a genuine
368
+ // mid-band cosine pair has no canonical worth keeping -> dropped.
369
+ it("drops a MAYBE pair when both endpoints are disqualified", () => {
370
+ const sA = makeSection("id1", "docs/deprecated/a.md", "Uploading Documents", 2, "uploading-documents", "h1", "## Uploading Documents\nContent about uploading various files here.", 0);
371
+ const sB = makeSection("id2", "docs/deprecated/b.md", "Downloading Reports", 2, "downloading-reports", "h2", "## Downloading Reports\nContent about downloading various files here.", 0);
372
+ const theta = 0.451; // arccos(0.90) -> mid-band, no shared distinctive token
373
+ const v1 = [1, 0];
374
+ const v2 = [Math.cos(theta), Math.sin(theta)];
375
+ const rep = analyze([sA, sB], emb({ id1: v1, id2: v2 }), null, null, cfg);
376
+ expect(rep.MaybePairs).toHaveLength(0);
377
+ expect(rep.HighGroups).toHaveLength(0); // the pair is dropped outright, not moved into a group
378
+ });
379
+ // WHY: when a surviving MAYBE pair's second endpoint (index j) ranks BETTER
380
+ // than the first (index i), the canonical must be j, not i. This pins the
381
+ // canonical-by-rank else branch in buildPair; choosing the wrong canonical
382
+ // would point reviewers at the worse section to keep.
383
+ it("elects the higher-priority endpoint as the pair canonical even when it is index j", () => {
384
+ // Index 0 (worse: plain path, deep heading); index 1 (better: docs/concepts/).
385
+ const sWorse = makeSection("id1", "docs/misc/a.md", "Uploading Documents", 4, "uploading-documents", "h1", "## Uploading Documents\nContent about uploading various files here.", 0); // heading level 4
386
+ const sBetter = makeSection("id2", "docs/concepts/b.md", "Downloading Reports", 2, "downloading-reports", "h2", "## Downloading Reports\nContent about downloading various files here.", 0); // heading level 2
387
+ const theta = 0.451;
388
+ const v1 = [1, 0];
389
+ const v2 = [Math.cos(theta), Math.sin(theta)];
390
+ const rep = analyze([sWorse, sBetter], emb({ id1: v1, id2: v2 }), null, null, cfg); // the worse section is deliberately passed first
391
+ expect(rep.MaybePairs).toHaveLength(1);
392
+ // The better-ranked section (index j = id2) is elected canonical.
393
+ expect(rep.MaybePairs[0].Canonical.SectionID).toBe("id2");
394
+ expect(rep.MaybePairs[0].Candidate.SectionID).toBe("id1");
395
+ });
396
+ });
397
// Report ordering is part of the contract: HIGH groups are sorted by
// (-size, -best_similarity) so the largest/strongest duplicates come first, and
// MAYBE pairs are sorted by descending similarity. Pinned because a renderer
// that trusts the report order would otherwise show duplicates in a
// meaningless sequence.
describe("analyze output ordering", () => {
  // WHY: two HIGH groups of different sizes must come out largest-first. Losing
  // the size ordering would bury the most impactful duplicate group beneath a
  // smaller one.
  it("orders HIGH groups largest-first", () => {
    // Group X: two sections sharing hashX. Group Y: three sections sharing hashY.
    const bodyX = "## Topic X\nContent X repeated verbatim across two files here.";
    const bodyY = "## Topic Y\nContent Y repeated verbatim across three files here.";
    const sections = [
      makeSection("x1", "docs/x1.md", "Topic X", 2, "topic-x", "hashX", bodyX, 0),
      makeSection("x2", "docs/x2.md", "Topic X", 2, "topic-x", "hashX", bodyX, 0),
      makeSection("y1", "docs/y1.md", "Topic Y", 2, "topic-y", "hashY", bodyY, 0),
      makeSection("y2", "docs/y2.md", "Topic Y", 2, "topic-y", "hashY", bodyY, 0),
      makeSection("y3", "docs/y3.md", "Topic Y", 2, "topic-y", "hashY", bodyY, 0),
    ];
    // X sections embed along dimension 0 and Y sections along dimension 1. The
    // axes are orthogonal, so the two exact groups cannot be merged through
    // cosine similarity. Every section gets its own array instance.
    const alongDim0 = () => [1, 0];
    const alongDim1 = () => [0, 1];
    const vectors = emb({
      x1: alongDim0(),
      x2: alongDim0(),
      y1: alongDim1(),
      y2: alongDim1(),
      y3: alongDim1(),
    });

    const report = analyze(sections, vectors, null, null, cfg);

    expect(report.HighGroups).toHaveLength(2);
    const [first, second] = [report.HighGroups[0], report.HighGroups[1]];
    // Members excludes the canonical: the 3-section group (Y) has 2 members and
    // leads; the 2-section group (X) has 1 member and follows.
    expect(first.Members).toHaveLength(2);
    expect(second.Members).toHaveLength(1);
    expect(first.Canonical.Heading).toBe("Topic Y");
  });

  // WHY: two surviving MAYBE pairs with different similarities must be listed
  // most-similar-first. Reviewers scan from the top, so the strongest candidate
  // has to lead.
  it("orders MAYBE pairs by descending similarity", () => {
    // Pair 1 has cosine ~0.91, pair 2 has cosine ~0.88. Neither pair shares a
    // distinctive heading token, so both remain MAYBE (no promotion, no union).
    const sections = [
      makeSection("p1a", "docs/p1a.md", "Uploading Documents", 2, "uploading-documents", "h1a", "## Uploading Documents\nContent about uploading various files here.", 0),
      makeSection("p1b", "docs/p1b.md", "Downloading Reports", 2, "downloading-reports", "h1b", "## Downloading Reports\nContent about downloading various files here.", 0),
      makeSection("p2a", "docs/p2a.md", "Sending Emails", 2, "sending-emails", "h2a", "## Sending Emails\nContent about sending various messages here.", 0),
      makeSection("p2b", "docs/p2b.md", "Receiving Faxes", 2, "receiving-faxes", "h2b", "## Receiving Faxes\nContent about receiving various messages here.", 0),
    ];
    // Each pair occupies its own 2D plane of the 4D embedding space (pair 1 in
    // dims 0-1, pair 2 in dims 2-3). Cross-pair dot products are therefore 0,
    // and the only mid-band cosines are the two within-pair ones.
    const strongerAngle = Math.acos(0.91);
    const weakerAngle = Math.acos(0.88);
    const vectors = emb({
      p1a: [1, 0, 0, 0],
      p1b: [Math.cos(strongerAngle), Math.sin(strongerAngle), 0, 0],
      p2a: [0, 0, 1, 0],
      p2b: [0, 0, Math.cos(weakerAngle), Math.sin(weakerAngle)],
    });

    const report = analyze(sections, vectors, null, null, cfg);

    expect(report.MaybePairs).toHaveLength(2);
    // Descending similarity: the ~0.91 pair precedes the ~0.88 pair.
    const leading = report.MaybePairs[0].Similarity;
    const trailing = report.MaybePairs[1].Similarity;
    expect(leading).toBeGreaterThan(trailing);
  });

  // WHY: when the FIRST endpoint (index i) is the better canonical, it must be
  // kept as canonical. Together with the "j is better" case in the MAYBE pair
  // filtering suite, this pins both arms of the canonical-by-rank choice.
  it("keeps index i as the pair canonical when it ranks better", () => {
    const angle = 0.451;
    // Index 0 sits under docs/concepts/ with a shallow heading, so it should
    // outrank index 1 (plain path, deep heading).
    const higherRanked = makeSection("id1", "docs/concepts/a.md", "Uploading Documents", 2, "uploading-documents", "h1", "## Uploading Documents\nContent about uploading various files here.", 0);
    const lowerRanked = makeSection("id2", "docs/misc/b.md", "Downloading Reports", 4, "downloading-reports", "h2", "## Downloading Reports\nContent about downloading various files here.", 0);
    const vectors = emb({
      id1: [1, 0],
      id2: [Math.cos(angle), Math.sin(angle)],
    });

    const report = analyze([higherRanked, lowerRanked], vectors, null, null, cfg);

    expect(report.MaybePairs).toHaveLength(1);
    const pair = report.MaybePairs[0];
    expect(pair.Canonical.SectionID).toBe("id1");
    expect(pair.Candidate.SectionID).toBe("id2");
  });
});
464
// A missing or too-short embedding must be scored as cosine 0 (never NaN and
// never a throw), so a section without a usable vector simply forms no pairs.
// Pinned because a bad embedding has to degrade gracefully instead of
// corrupting the whole report.
describe("analyze missing/short embeddings", () => {
  // WHY: a section with NO entry in the embedding map should score 0 against
  // everything (below the MAYBE threshold), so no pair is produced and the
  // analyzer does not crash on the absent vector.
  it("treats a section with no embedding as cosine 0 (no pair)", () => {
    const sections = [
      makeSection("id1", "docs/a.md", "Alpha", 2, "alpha", "h1", "## Alpha\nContent for the missing-embedding test here.", 0),
      makeSection("id2", "docs/b.md", "Beta", 2, "beta", "h2", "## Beta\nContent for the missing-embedding test here.", 0),
    ];
    // id2 is deliberately left out of the embedding map.
    const vectors = emb({ id1: makeUnitVec2D(1, 0) });

    const report = analyze(sections, vectors, null, null, cfg);

    expect(report.HighGroups).toHaveLength(0);
    expect(report.MaybePairs).toHaveLength(0);
  });

  // WHY: an embedding shorter than its counterpart must also score 0, which
  // guards against a ragged vector producing a spuriously high score from a
  // truncated dot product.
  it("treats a length-mismatched embedding as cosine 0 (no pair)", () => {
    const sections = [
      makeSection("id1", "docs/a.md", "Alpha", 2, "alpha", "h1", "## Alpha\nContent for the short-embedding test here.", 0),
      makeSection("id2", "docs/b.md", "Beta", 2, "beta", "h2", "## Beta\nContent for the short-embedding test here.", 0),
    ];
    // id2 carries an empty vector (length 0) against id1's length-2 vector.
    const vectors = emb({ id1: [1, 0], id2: [] });

    const report = analyze(sections, vectors, null, null, cfg);

    expect(report.HighGroups).toHaveLength(0);
    expect(report.MaybePairs).toHaveLength(0);
  });
});
490
// L5 section->group wiring: when a HIGH group exists AND blocks are supplied,
// the section-to-group map has to be populated so that block overlaps already
// covered by that group are suppressed. Pinned because an empty map would let
// L5 report blocks a second time after L1-L4 already captured them.
describe("analyze L5 secGroup population", () => {
  // WHY: each of the two grouped sections contributes the same verbatim block.
  // Since both blocks belong to sections of the SAME HIGH group, the block
  // cluster counts as already known and must not be re-reported by L5.
  it("suppresses a block cluster whose sections are all in one HIGH group", () => {
    const sharedSectionHash = "l5grouphash12345";
    const sharedBody = "## Lifecycle\nThe shared lifecycle content used in both docs.";
    const sections = [
      makeSection("secA", "docs/a.md", "Lifecycle", 2, "lifecycle", sharedSectionHash, sharedBody, 0),
      makeSection("secB", "docs/b.md", "Lifecycle", 2, "lifecycle", sharedSectionHash, sharedBody, 0),
    ];
    const unit = makeUnitVec2D(1, 0);
    // One verbatim block, present in both grouped sections.
    const sharedBlockHash = "covered-block-hash";
    const blocks = [
      mkBlock("secA", "docs/a.md", "Lifecycle", "prose", 3, 6, sharedBlockHash),
      mkBlock("secB", "docs/b.md", "Lifecycle", "prose", 3, 6, sharedBlockHash),
    ];
    const vectors = emb({ secA: unit, secB: unit.slice() });

    const report = analyze(sections, vectors, blocks, null, cfg);

    expect(report.HighGroups.length).toBeGreaterThan(0);
    // Both block locations map to the same HIGH group, so nothing is left over.
    expect(report.PartialOverlaps).toHaveLength(0);
  });
});
516
/**
 * mkBlock assembles a minimal block record for the L5 wiring tests.
 *
 * Only the identifying fields are taken from the arguments; Index, Text and
 * TableRows are fixed to 0, "" and 0 respectively.
 *
 * @param {string} sectionID - ID of the section that owns the block.
 * @param {string} filePath - Path of the file containing the block.
 * @param {string} heading - Heading of the owning section.
 * @param {string} kind - Block kind (the tests pass "prose").
 * @param {number} startLine - First line of the block.
 * @param {number} endLine - Last line of the block.
 * @param {string} contentHash - Hash identifying the block's content.
 * @returns {object} A record with the ten fields, in declaration order.
 */
function mkBlock(sectionID, filePath, heading, kind, startLine, endLine, contentHash) {
  // Built in three parts and merged in order so the key order stays
  // SectionID..TableRows.
  const owner = { SectionID: sectionID, FilePath: filePath, Heading: heading, Index: 0 };
  const extent = { Kind: kind, StartLine: startLine, EndLine: endLine };
  const payload = { ContentHash: contentHash, Text: "", TableRows: 0 };
  return { ...owner, ...extent, ...payload };
}
@@ -0,0 +1,74 @@
1
+ // Ported from internal/dedup/analyzer/canonical.go.
2
+ //
3
+ // Canonical-section selection: each duplicate group elects the section with the
4
+ // lowest rank tuple as its canonical reference. The tuple ordering is the
5
+ // behavior contract — drifting any field's comparison re-elects a different
6
+ // canonical and changes the recommended edits, so the order is pinned exactly.
7
+ import { headingBlacklisted } from "../dedupcfg/index.js";
8
/**
 * isDisqualified reports whether a section is excluded by the configured
 * blacklists.
 *
 * The heading is checked first through the shared headingBlacklisted matcher.
 * Only if that does not match is the file path lower-cased and tested for
 * containing any lower-cased entry of cfg.path_blacklist as a substring.
 *
 * @param {object} s - Section; reads s.heading and s.file_path.
 * @param {object} cfg - Config; reads cfg.path_blacklist and is passed on to
 *   headingBlacklisted.
 * @returns {boolean} true when the heading or the path is blacklisted.
 */
export function isDisqualified(s, cfg) {
  if (!headingBlacklisted(cfg, s.heading)) {
    const loweredPath = s.file_path.toLowerCase();
    for (const entry of cfg.path_blacklist) {
      const needle = entry.toLowerCase();
      if (loweredPath.includes(needle)) {
        return true;
      }
    }
    return false;
  }
  return true;
}
24
/**
 * canonicalRank builds the rank tuple used to elect a group's canonical
 * section. A lower tuple, as ordered by lessRank, is a better canonical.
 *
 * Fields are filled in tuple order:
 *   Disqualified  1 if isDisqualified(s, cfg), else 0
 *   PathPriority  pathPriorityRank of the file path against cfg.path_priority
 *   NegInbound    negated inbound count (more inbound links rank better)
 *   HeadingLevel  the section's heading level
 *   NegLen        negated UTF-8 BYTE length of raw_content, mirroring Go's
 *                 -len(raw_content) rather than the UTF-16 length, so the
 *                 longest-content tiebreak matches the Go implementation
 *   SectionID     the section id
 *
 * @param {object} s - Section; reads file_path, inbound_count, heading_level,
 *   raw_content and id (plus whatever isDisqualified reads).
 * @param {object} cfg - Config; reads cfg.path_priority and is passed on to
 *   isDisqualified.
 * @returns {object} The rank tuple described above.
 */
export function canonicalRank(s, cfg) {
  const rank = { Disqualified: 0 };
  if (isDisqualified(s, cfg)) {
    rank.Disqualified = 1;
  }
  rank.PathPriority = pathPriorityRank(s.file_path, cfg.path_priority);
  rank.NegInbound = -s.inbound_count;
  rank.HeadingLevel = s.heading_level;
  const contentBytes = Buffer.byteLength(s.raw_content, "utf8");
  rank.NegLen = -contentBytes;
  rank.SectionID = s.id;
  return rank;
}
42
// Numeric rank fields in comparison order. The order IS the behavior contract:
// changing it re-elects a different canonical.
const RANK_NUMERIC_FIELDS = Object.freeze([
  "Disqualified",
  "PathPriority",
  "NegInbound",
  "HeadingLevel",
  "NegLen",
]);

/**
 * lessRank returns true if rank tuple a is strictly better (lower) than b.
 *
 * The numeric fields are compared in tuple order; the first field that differs
 * decides. If all of them are equal, SectionID is the final tiebreak.
 *
 * The SectionID tiebreak compares the UTF-8 byte sequences of the two ids. JS
 * `<` on strings orders by UTF-16 code units, which disagrees with Go's
 * byte-wise string ordering for ids that mix astral characters with
 * high-BMP ones (e.g. U+FFFF vs U+10000). Comparing bytes keeps the elected
 * canonical identical to the Go implementation this file was ported from.
 * For pure-ASCII ids the result is the same as with `<`.
 *
 * @param {object} a - Rank tuple as produced by canonicalRank.
 * @param {object} b - Rank tuple as produced by canonicalRank.
 * @returns {boolean} true when a sorts strictly before b.
 */
export function lessRank(a, b) {
  for (const field of RANK_NUMERIC_FIELDS) {
    if (a[field] !== b[field]) {
      return a[field] < b[field];
    }
  }
  const idA = a.SectionID;
  const idB = b.SectionID;
  if (typeof idA !== "string" || typeof idB !== "string") {
    // Non-string ids cannot be byte-encoded; keep the plain JS comparison.
    return idA < idB;
  }
  if (idA === idB) {
    return false;
  }
  return Buffer.compare(Buffer.from(idA, "utf8"), Buffer.from(idB, "utf8")) < 0;
}
61
/**
 * pathPriorityRank maps a file path to its position in the priority list.
 *
 * Both the path and each prefix are lower-cased before the prefix test. The
 * index of the first matching prefix is returned, so a lower result means a
 * higher priority. A path that matches no prefix ranks after every entry.
 *
 * @param {string} filePath - Path of the section's file.
 * @param {string[]} pathPriority - Path prefixes, highest priority first.
 * @returns {number} Index of the first matching prefix, or pathPriority.length
 *   when none matches.
 */
export function pathPriorityRank(filePath, pathPriority) {
  const loweredPath = filePath.toLowerCase();
  const firstMatch = pathPriority.findIndex((candidate) =>
    loweredPath.startsWith(candidate.toLowerCase()),
  );
  return firstMatch === -1 ? pathPriority.length : firstMatch;
}