docsgov 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (159) hide show
  1. package/README.md +242 -0
  2. package/dist/apispec/apispec.js +401 -0
  3. package/dist/apispec/apispec.test.js +444 -0
  4. package/dist/apispec/errors.js +17 -0
  5. package/dist/apispec/index.js +2 -0
  6. package/dist/check/doclinks.js +167 -0
  7. package/dist/check/index.js +8 -0
  8. package/dist/check/run.js +391 -0
  9. package/dist/check/run.test.js +513 -0
  10. package/dist/check/suggest.js +134 -0
  11. package/dist/check/suggest.test.js +92 -0
  12. package/dist/check/tokens.js +125 -0
  13. package/dist/cmd/main.js +330 -0
  14. package/dist/cmd/main.test.js +422 -0
  15. package/dist/codeq/cache.js +71 -0
  16. package/dist/codeq/cache.test.js +67 -0
  17. package/dist/codeq/errors.js +52 -0
  18. package/dist/codeq/grammars/tree-sitter-go.wasm +0 -0
  19. package/dist/codeq/grammars/tree-sitter-java.wasm +0 -0
  20. package/dist/codeq/grammars/tree-sitter-javascript.wasm +0 -0
  21. package/dist/codeq/grammars/tree-sitter-tsx.wasm +0 -0
  22. package/dist/codeq/grammars/tree-sitter-typescript.wasm +0 -0
  23. package/dist/codeq/index.js +11 -0
  24. package/dist/codeq/resolve.test.js +109 -0
  25. package/dist/codeq/resolver.js +128 -0
  26. package/dist/codeq/resolver.test.js +124 -0
  27. package/dist/codeq/resolvers/go.js +242 -0
  28. package/dist/codeq/resolvers/go.test.js +143 -0
  29. package/dist/codeq/resolvers/java.js +349 -0
  30. package/dist/codeq/resolvers/java.test.js +138 -0
  31. package/dist/codeq/resolvers/java_queries.js +63 -0
  32. package/dist/codeq/resolvers/javascript.js +412 -0
  33. package/dist/codeq/resolvers/javascript.test.js +125 -0
  34. package/dist/codeq/resolvers/javascript_queries.js +46 -0
  35. package/dist/codeq/resolvers/typescript.js +366 -0
  36. package/dist/codeq/resolvers/typescript.test.js +180 -0
  37. package/dist/codeq/resolvers/typescript_queries.js +78 -0
  38. package/dist/codeq/signature.js +50 -0
  39. package/dist/codeq/signature.test.js +50 -0
  40. package/dist/codeq/suggest.js +96 -0
  41. package/dist/codeq/treesitter.js +122 -0
  42. package/dist/codeq/treesitter.test.js +118 -0
  43. package/dist/config/config.js +74 -0
  44. package/dist/config/config.test.js +98 -0
  45. package/dist/config/fs.js +116 -0
  46. package/dist/config/glob.js +82 -0
  47. package/dist/config/glob.test.js +61 -0
  48. package/dist/config/index.js +4 -0
  49. package/dist/dedup/analyzer/analyzer.js +533 -0
  50. package/dist/dedup/analyzer/analyzer.test.js +530 -0
  51. package/dist/dedup/analyzer/canonical.js +74 -0
  52. package/dist/dedup/analyzer/canonical.test.js +70 -0
  53. package/dist/dedup/analyzer/cosine_clusters.js +169 -0
  54. package/dist/dedup/analyzer/cosine_clusters.test.js +131 -0
  55. package/dist/dedup/analyzer/distinctive.js +85 -0
  56. package/dist/dedup/analyzer/distinctive.test.js +49 -0
  57. package/dist/dedup/analyzer/exact_clusters.js +63 -0
  58. package/dist/dedup/analyzer/exact_clusters.test.js +81 -0
  59. package/dist/dedup/analyzer/index.js +14 -0
  60. package/dist/dedup/analyzer/multiplicity.js +110 -0
  61. package/dist/dedup/analyzer/multiplicity.test.js +123 -0
  62. package/dist/dedup/analyzer/order.js +22 -0
  63. package/dist/dedup/analyzer/partial_overlaps.js +65 -0
  64. package/dist/dedup/analyzer/partial_overlaps.test.js +161 -0
  65. package/dist/dedup/analyzer/preview.js +84 -0
  66. package/dist/dedup/analyzer/preview.test.js +46 -0
  67. package/dist/dedup/analyzer/safety.js +27 -0
  68. package/dist/dedup/analyzer/safety.test.js +39 -0
  69. package/dist/dedup/config.js +18 -0
  70. package/dist/dedup/configload.js +299 -0
  71. package/dist/dedup/configload.test.js +410 -0
  72. package/dist/dedup/dedup.index.test.js +203 -0
  73. package/dist/dedup/dedup.js +143 -0
  74. package/dist/dedup/dedup.test.js +212 -0
  75. package/dist/dedup/dedupcfg/config.js +112 -0
  76. package/dist/dedup/dedupcfg/config.test.js +70 -0
  77. package/dist/dedup/dedupcfg/index.js +1 -0
  78. package/dist/dedup/deduptypes/index.js +1 -0
  79. package/dist/dedup/deduptypes/types.js +9 -0
  80. package/dist/dedup/deduptypes/types.test.js +34 -0
  81. package/dist/dedup/embedder/cache.js +23 -0
  82. package/dist/dedup/embedder/cache.test.js +50 -0
  83. package/dist/dedup/embedder/constants.js +10 -0
  84. package/dist/dedup/embedder/embedder.js +76 -0
  85. package/dist/dedup/embedder/embedder.mock.test.js +128 -0
  86. package/dist/dedup/embedder/embedder.test.js +96 -0
  87. package/dist/dedup/embedder/errors.js +20 -0
  88. package/dist/dedup/embedder/errors.test.js +35 -0
  89. package/dist/dedup/embedder/index.js +4 -0
  90. package/dist/dedup/embedder/session.js +78 -0
  91. package/dist/dedup/embedder/session.test.js +172 -0
  92. package/dist/dedup/gitignore.js +97 -0
  93. package/dist/dedup/gitignore.test.js +98 -0
  94. package/dist/dedup/index.js +11 -0
  95. package/dist/dedup/indexdb/errors.js +48 -0
  96. package/dist/dedup/indexdb/index.js +6 -0
  97. package/dist/dedup/indexdb/indexdb.js +302 -0
  98. package/dist/dedup/indexdb/indexdb.test.js +739 -0
  99. package/dist/dedup/indexdb/load.js +110 -0
  100. package/dist/dedup/indexdb/migrations.js +58 -0
  101. package/dist/dedup/indexdb/schema.js +83 -0
  102. package/dist/dedup/indexer/index.js +9 -0
  103. package/dist/dedup/indexer/indexer.js +501 -0
  104. package/dist/dedup/indexer/indexer.test.js +510 -0
  105. package/dist/dedup/indexer/links.js +89 -0
  106. package/dist/dedup/mdsection/anchor.js +60 -0
  107. package/dist/dedup/mdsection/anchor.test.js +39 -0
  108. package/dist/dedup/mdsection/blocks.js +409 -0
  109. package/dist/dedup/mdsection/blocks.test.js +359 -0
  110. package/dist/dedup/mdsection/index.js +4 -0
  111. package/dist/dedup/mdsection/parse.js +21 -0
  112. package/dist/dedup/mdsection/section.js +234 -0
  113. package/dist/dedup/mdsection/section.test.js +221 -0
  114. package/dist/dedup/report/floatfmt.js +71 -0
  115. package/dist/dedup/report/floatfmt.test.js +42 -0
  116. package/dist/dedup/report/index.js +8 -0
  117. package/dist/dedup/report/quote.js +77 -0
  118. package/dist/dedup/report/quote.test.js +67 -0
  119. package/dist/dedup/report/text.js +251 -0
  120. package/dist/dedup/report/text.test.js +420 -0
  121. package/dist/dedup/report_types.js +8 -0
  122. package/dist/dedup/sectionid/index.js +1 -0
  123. package/dist/dedup/sectionid/sectionid.js +16 -0
  124. package/dist/dedup/sectionid/sectionid.test.js +49 -0
  125. package/dist/guard/api/errors.js +12 -0
  126. package/dist/guard/api/index.js +2 -0
  127. package/dist/guard/api/parser.js +81 -0
  128. package/dist/guard/api/parser.test.js +58 -0
  129. package/dist/guard/api/types.js +1 -0
  130. package/dist/guard/code/errors.js +16 -0
  131. package/dist/guard/code/index.js +2 -0
  132. package/dist/guard/code/parser.js +54 -0
  133. package/dist/guard/code/parser.test.js +111 -0
  134. package/dist/guard/code/types.js +6 -0
  135. package/dist/index.js +1 -0
  136. package/dist/index.test.js +5 -0
  137. package/dist/repo/boundary.js +92 -0
  138. package/dist/repo/boundary.test.js +65 -0
  139. package/dist/repo/errors.js +56 -0
  140. package/dist/repo/errors.test.js +85 -0
  141. package/dist/repo/exists.test.js +72 -0
  142. package/dist/repo/filename.js +46 -0
  143. package/dist/repo/filename.test.js +39 -0
  144. package/dist/repo/fs.js +53 -0
  145. package/dist/repo/index.js +7 -0
  146. package/dist/repo/overlay.js +36 -0
  147. package/dist/repo/overlay.test.js +80 -0
  148. package/dist/repo/repo.js +353 -0
  149. package/dist/repo/repo.test.js +255 -0
  150. package/dist/repo/testutil.js +27 -0
  151. package/dist/repo/write.test.js +125 -0
  152. package/dist/report/color.js +73 -0
  153. package/dist/report/index.js +1 -0
  154. package/dist/report/report.js +112 -0
  155. package/dist/report/report.test.js +368 -0
  156. package/dist/violation/index.js +1 -0
  157. package/dist/violation/types.js +22 -0
  158. package/dist/violation/types.test.js +70 -0
  159. package/package.json +48 -0
@@ -0,0 +1,70 @@
1
+ import { describe, it, expect } from "vitest";
2
+ import { defaultConfig } from "../dedupcfg/index.js";
3
+ import { canonicalRank, isDisqualified, lessRank } from "./canonical.js";
4
// makeSection assembles the smallest Section record (TS snake_case field names)
// the rank tests need. Only the eight arguments vary; line numbers, word count
// and the table/code flags are fixed, and embed_text simply mirrors the heading.
function makeSection(id, filePath, heading, level, anchor, contentHash, rawContent, inbound) {
    // Where the section lives.
    const position = {
        id,
        file_path: filePath,
        heading,
        heading_level: level,
        anchor,
        start_line: 1,
        end_line: 10,
    };
    // What the section contains.
    const payload = {
        content_hash: contentHash,
        raw_content: rawContent,
        embed_text: heading,
        prose_word_count: 15,
        has_table: false,
        has_code: false,
    };
    return { ...position, ...payload, inbound_count: inbound };
}
23
// Being disqualified is the leading field of the canonical rank, so such a
// section loses the election no matter how good its other fields are. The three
// cases below cover both blacklist axes (heading and path) plus a section that
// matches neither, so the check can neither over-fire nor under-fire unnoticed.
describe("isDisqualified", () => {
    const cfg = defaultConfig().Analyzer;
    // The fixtures differ only in path, heading and anchor; the raw content is
    // always the level-2 heading line followed by one sentence.
    const sectionAt = (filePath, heading, anchor) => makeSection("id1", filePath, heading, 2, anchor, "hash1", `## ${heading}\nSome content.`, 0);
    it("disqualifies a blacklisted heading ('Related')", () => {
        const verdict = isDisqualified(sectionAt("docs/concepts/foo.md", "Related", "related"), cfg);
        expect(verdict).toBe(true);
    });
    it("disqualifies a blacklisted path ('changelog')", () => {
        const verdict = isDisqualified(sectionAt("docs/changelog/foo.md", "Overview", "overview"), cfg);
        expect(verdict).toBe(true);
    });
    it("does not disqualify a clean section", () => {
        const verdict = isDisqualified(sectionAt("docs/concepts/foo.md", "Order Lifecycle", "order-lifecycle"), cfg);
        expect(verdict).toBe(false);
    });
});
42
// The canonical-rank tuple ordering decides WHICH section becomes the reference
// every other duplicate points to. Each test pins one field's contribution so
// the election cannot silently re-order. lessRank encodes the lower-is-better
// comparison the orchestrator uses to pick the winner, so every assertion has
// the form lessRank(rank(expectedWinner), rank(expectedLoser)) === true.
describe("canonicalRank + lessRank", () => {
    const cfg = defaultConfig().Analyzer;
    it("ranks a better path (docs/concepts) lower than docs/other", () => {
        const conceptSec = makeSection("id1", "docs/concepts/foo.md", "Lifecycle", 2, "lifecycle", "hash1", "content", 0);
        const otherSec = makeSection("id2", "docs/other/foo.md", "Lifecycle", 2, "lifecycle", "hash2", "content", 0);
        expect(lessRank(canonicalRank(conceptSec, cfg), canonicalRank(otherSec, cfg))).toBe(true);
    });
    it("ranks higher inbound count lower (more incoming links wins)", () => {
        const highInbound = makeSection("id1", "docs/concepts/foo.md", "Lifecycle", 2, "lifecycle", "hash1", "content", 5);
        const lowInbound = makeSection("id2", "docs/concepts/bar.md", "Lifecycle", 2, "lifecycle", "hash2", "content", 0);
        expect(lessRank(canonicalRank(highInbound, cfg), canonicalRank(lowInbound, cfg))).toBe(true);
    });
    it("ranks a disqualified section above (worse than) a qualified one", () => {
        const disq = makeSection("id1", "docs/concepts/foo.md", "Related", 2, "related", "hash1", "content", 0);
        const notDisq = makeSection("id2", "docs/concepts/foo.md", "Overview", 2, "overview", "hash2", "content", 0);
        expect(lessRank(canonicalRank(notDisq, cfg), canonicalRank(disq, cfg))).toBe(true);
    });
    it("breaks ties on longer raw_content (NegLen), measured in UTF-8 bytes", () => {
        // WHY: NegLen is -byteLength so the longer section wins.
        // Baseline: with pure-ASCII content, more characters means more bytes.
        const longer = makeSection("id1", "docs/concepts/a.md", "Lifecycle", 2, "lifecycle", "h1", "abcde", 0);
        const shorter = makeSection("id2", "docs/concepts/b.md", "Lifecycle", 2, "lifecycle", "h2", "ab", 0);
        expect(lessRank(canonicalRank(longer, cfg), canonicalRank(shorter, cfg))).toBe(true);
        // Pin the byte-length semantics: a 1-char multibyte string must out-rank
        // a shorter ASCII one. "日" is ONE UTF-16 code unit but THREE UTF-8 bytes,
        // while "ab" is two of each, so the two length measures disagree on the
        // winner and a regression to String#length would fail this assertion.
        const multibyte = makeSection("id3", "docs/concepts/a.md", "Lifecycle", 2, "lifecycle", "h3", "日", 0);
        expect(lessRank(canonicalRank(multibyte, cfg), canonicalRank(shorter, cfg))).toBe(true);
    });
});
@@ -0,0 +1,169 @@
1
+ // Ported from internal/dedup/analyzer/cosine_clusters.go.
2
+ //
3
+ // L5-cosine clustering over prose blocks. The cosine math, the >=threshold gate,
4
+ // the same-file-pair skip, and the min-qualifying-cosine-per-cluster rule are all
5
+ // behavior-load-bearing: they decide which near-copy blocks cluster together and
6
+ // what Similarity is reported, so they are reproduced exactly (including the
7
+ // float32->float64 dot-product accumulation and the union-find component merge).
8
+ import { cmpStr, cmpNum } from "./order.js";
9
/**
 * Returns the dot product of `a` and `b`, summed over the first `a.length`
 * positions. When both vectors are L2-normalized this value is their cosine
 * similarity.
 *
 * The Go original (analyzer.go:dotProduct) sums float32 elements into a float64
 * accumulator. JS numbers are already float64 and the vectors are plain
 * number[] taken from the same embeddings, so the accumulation matches.
 *
 * Returns 0 when `a` is empty or `b` has fewer elements than `a`.
 */
function dotProduct(a, b) {
    const len = a.length;
    if (len === 0) {
        return 0;
    }
    if (b.length < len) {
        return 0;
    }
    let acc = 0;
    let k = 0;
    while (k < len) {
        acc += a[k] * b[k];
        k += 1;
    }
    return acc;
}
28
/**
 * cosineClusters performs L5-cosine clustering over prose blocks only.
 *
 * A block is a candidate when its ContentHash has a vector in blockEmb AND is
 * NOT in excludeHashes (hashes already reported as exact clusters). Blocks with
 * no vector (e.g. tables) are therefore never considered.
 *
 * Every candidate pair (i<j) from different FilePaths is scored with
 * dotProduct(vi, vj); pairs scoring >= cfg.Block.cosine_threshold are joined
 * with union-find, so membership is transitive.
 *
 * For each resulting component of >=2 blocks a Cluster is emitted:
 *   - Exact:false, Kind:"prose", ContentHash:"" (members differ)
 *   - Similarity = min qualifying pairwise cosine within the cluster
 *   - Informational:false
 *   - Locations sorted by (FilePath, StartLine)
 *
 * Clusters are sorted by first location (FilePath, StartLine) for determinism.
 *
 * @param blocks iterable of block records (FilePath, Heading, StartLine, EndLine, ContentHash are read)
 * @param blockEmb Map from ContentHash to embedding vector
 * @param excludeHashes Set of ContentHash values to skip
 * @param cfg config object; only cfg.Block.cosine_threshold is read
 * @returns array of Cluster objects (empty when fewer than two candidates exist)
 */
export function cosineClusters(blocks, blockEmb, excludeHashes, cfg) {
    const threshold = cfg.Block.cosine_threshold;
    const cands = [];
    for (const b of blocks) {
        if (excludeHashes.has(b.ContentHash)) {
            continue;
        }
        const v = blockEmb.get(b.ContentHash);
        if (v === undefined) {
            continue;
        }
        cands.push({ block: b, vec: v });
    }
    const n = cands.length;
    if (n < 2) {
        return [];
    }
    // Union-find over candidate indices; find() shortens paths as it walks.
    const parent = Array.from({ length: n }, (_, i) => i);
    const find = (x) => {
        while (parent[x] !== x) {
            parent[x] = parent[parent[x]];
            x = parent[x];
        }
        return x;
    };
    const union = (a, b) => {
        const ra = find(a);
        const rb = find(b);
        if (ra !== rb) {
            parent[ra] = rb;
        }
    };
    // Qualifying edges are kept so the per-cluster minimum can be computed once
    // the components are final.
    const edges = [];
    for (let i = 0; i < n; i++) {
        for (let j = i + 1; j < n; j++) {
            // Skip same-file pairs.
            if (cands[i].block.FilePath === cands[j].block.FilePath) {
                continue;
            }
            const cos = dotProduct(cands[i].vec, cands[j].vec);
            if (cos >= threshold) {
                union(i, j);
                edges.push({ i, j, cos });
            }
        }
    }
    // Assign each candidate to its root component.
    const compMembers = new Map();
    for (let i = 0; i < n; i++) {
        const root = find(i);
        const list = compMembers.get(root);
        if (list) {
            list.push(i);
        }
        else {
            compMembers.set(root, [i]);
        }
    }
    // Minimum qualifying cosine per component. A component with no entry has no
    // recorded edge. Absence is tracked explicitly instead of using an
    // out-of-range sentinel: floating-point rounding can push the dot product of
    // two (near-)identical normalized vectors slightly above 1.0, and a
    // "value > 1.0 means no edge" test would then misreport such a cluster's
    // Similarity as the bare threshold.
    const compMinCos = new Map();
    for (const e of edges) {
        const root = find(e.i); // both ends share a root after union
        const seen = compMinCos.get(root);
        if (seen === undefined || e.cos < seen) {
            compMinCos.set(root, e.cos);
        }
    }
    // Build clusters for components with >=2 members.
    const clusters = [];
    for (const [root, idxs] of compMembers) {
        if (idxs.length < 2) {
            continue;
        }
        const locs = idxs.map((idx) => {
            const b = cands[idx].block;
            return {
                FilePath: b.FilePath,
                Heading: b.Heading,
                StartLine: b.StartLine,
                EndLine: b.EndLine,
            };
        });
        // Sort locations by (FilePath, StartLine).
        locs.sort((a, b) => {
            const c = cmpStr(a.FilePath, b.FilePath);
            if (c !== 0) {
                return c;
            }
            return cmpNum(a.StartLine, b.StartLine);
        });
        // A multi-member component always has at least one edge; the threshold
        // fallback is purely defensive.
        const recorded = compMinCos.get(root);
        const minCos = recorded === undefined ? threshold : recorded;
        clusters.push({
            Kind: "prose",
            ContentHash: "",
            Similarity: minCos,
            Exact: false,
            Informational: false,
            Locations: locs,
        });
    }
    // Sort clusters by first location (FilePath, StartLine) for determinism.
    // Every cluster built above has >=2 locations, so index 0 always exists.
    clusters.sort((a, b) => {
        const c = cmpStr(a.Locations[0].FilePath, b.Locations[0].FilePath);
        if (c !== 0) {
            return c;
        }
        return cmpNum(a.Locations[0].StartLine, b.Locations[0].StartLine);
    });
    return clusters;
}
@@ -0,0 +1,131 @@
1
+ import { describe, it, expect } from "vitest";
2
+ import { defaultConfig } from "../dedupcfg/index.js";
3
+ import { cosineClusters } from "./cosine_clusters.js";
4
// Builds the smallest BlockRecord the cosine pass can consume. Path, kind, line
// range and hash come from the caller; SectionID, Index, Text and TableRows are
// fixed placeholder values.
function block(filePath, heading, kind, startLine, endLine, contentHash) {
    const record = { SectionID: "", FilePath: filePath, Heading: heading, Index: 0, Kind: kind };
    record.StartLine = startLine;
    record.EndLine = endLine;
    record.ContentHash = contentHash;
    record.Text = "";
    record.TableRows = 0;
    return record;
}
20
// Returns a shallow copy of v so two hashes can map to equal-valued but
// distinct vector objects.
function copyVec(v) {
    const duplicate = v.slice(0);
    return duplicate;
}
23
// cosineClusters is the L5-cosine pass. The exact set of rules it enforces is
// the contract these tests pin, because each rule prevents a specific class
// of false positive/negative in near-duplicate detection:
//   (a) cross-file pairs at cosine >= threshold cluster (the whole point),
//   (b) below-threshold pairs do NOT cluster (no spurious merges),
//   (c) tables (no embedding) never cluster (cosine is prose-only),
//   (d) same-file pairs are skipped (intra-file structure, not cross-doc dup),
//   (e) excludeHashes (already an exact cluster) are not re-reported,
//   (f) output order is deterministic (report/index consume it).
describe("cosineClusters", () => {
    const cfg = defaultConfig(); // Block.cosine_threshold = 0.95
    // Build the fixture once per test; reused for the determinism check.
    function fixture() {
        const vA = [1.0, 0.0];
        const vB = [Math.cos(0.2257), Math.sin(0.2257)]; // cosine with vA ~ 0.975 (> 0.95)
        const vC = [0.0, 1.0]; // cosine 0 with vA and ~0.224 with vB (< threshold)
        const vSameFile = [0.0, -1.0]; // cosine <= 0 with every vector from another file
        // vE points away from vA/vB (cosine <= 0 with both) so the e/f pair forms
        // its OWN cluster. A vector near vA would be pulled into the a/b cluster
        // by union-find, leaving a single cluster and nothing to order.
        const vE = [-1.0, 0.0];
        const hashA = "prose-hash-a";
        const hashBx = "prose-hash-bx";
        const hashC = "prose-hash-c";
        const hashTable = "table-hash";
        const hashG1 = "prose-g1";
        const hashG2 = "prose-g2";
        const hashExcl = "prose-excl";
        const hashE = "prose-hash-e";
        const hashF = "prose-hash-f";
        const blocks = [
            // (a) cross-file near-copy pair
            block("docs/a.md", "Intro", "prose", 1, 5, hashA),
            block("docs/b.md", "Overview", "prose", 2, 6, hashBx),
            // (b) below-threshold
            block("docs/c.md", "Other", "prose", 1, 3, hashC),
            // (c) table block (absent from blockEmb)
            block("docs/a.md", "Table", "table", 6, 10, hashTable),
            // (d) same-file pair in docs/g.md only
            block("docs/g.md", "G1", "prose", 1, 5, hashG1),
            block("docs/g.md", "G2", "prose", 6, 10, hashG2),
            // (e) excluded hash
            block("docs/d.md", "Dup", "prose", 1, 5, hashExcl),
            // (f) second, independent cross-file pair
            block("docs/e.md", "Appendix", "prose", 1, 4, hashE),
            block("docs/f.md", "Appendix", "prose", 1, 4, hashF),
        ];
        const blockEmb = new Map([
            [hashA, vA],
            [hashBx, vB],
            [hashC, vC],
            [hashG1, vSameFile],
            [hashG2, copyVec(vSameFile)],
            [hashExcl, vA], // would pair with hashA but is excluded
            [hashE, vE],
            [hashF, copyVec(vE)],
            // hashTable intentionally absent
        ]);
        const excludeHashes = new Set([hashExcl]);
        return { blocks, blockEmb, excludeHashes };
    }
    it("applies all six clustering rules (a-f)", () => {
        const { blocks, blockEmb, excludeHashes } = fixture();
        const clusters = cosineClusters(blocks, blockEmb, excludeHashes, cfg);
        const allLocations = clusters.flatMap((cl) => cl.Locations);
        for (const loc of allLocations) {
            // (c) tables never cluster.
            expect(loc.StartLine === 6 && loc.FilePath === "docs/a.md" && loc.EndLine === 10).toBe(false);
            // (d) same-file-only docs/g.md blocks never appear.
            expect(loc.FilePath).not.toBe("docs/g.md");
            // (e) excluded hash (docs/d.md) never appears.
            expect(loc.FilePath).not.toBe("docs/d.md");
            // (b) below-threshold block (docs/c.md) never appears.
            expect(loc.FilePath).not.toBe("docs/c.md");
        }
        // (a) a cluster with docs/a.md:1 and docs/b.md exists.
        const abCluster = clusters.find((cl) => {
            const hasA = cl.Locations.some((l) => l.FilePath === "docs/a.md" && l.StartLine === 1);
            const hasB = cl.Locations.some((l) => l.FilePath === "docs/b.md");
            return hasA && hasB;
        });
        expect(abCluster).toBeDefined();
        expect(abCluster.Exact).toBe(false);
        expect(abCluster.ContentHash).toBe("");
        expect(abCluster.Kind).toBe("prose");
        expect(abCluster.Similarity).toBeGreaterThanOrEqual(cfg.Block.cosine_threshold);
        // The a/b cluster holds exactly its two members, sorted by (FilePath, StartLine).
        expect(abCluster.Locations).toEqual([
            { FilePath: "docs/a.md", Heading: "Intro", StartLine: 1, EndLine: 5 },
            { FilePath: "docs/b.md", Heading: "Overview", StartLine: 2, EndLine: 6 },
        ]);
        // (a) again: the independent e/f pair forms a second cluster of its own.
        const efCluster = clusters.find((cl) => cl.Locations.some((l) => l.FilePath === "docs/e.md"));
        expect(efCluster).toBeDefined();
        expect(efCluster.Locations.map((l) => l.FilePath)).toEqual(["docs/e.md", "docs/f.md"]);
        expect(efCluster.Similarity).toBeGreaterThanOrEqual(cfg.Block.cosine_threshold);
        // (f) exactly two clusters, ordered by their first location.
        expect(clusters.length).toBe(2);
        expect(clusters[0]).toBe(abCluster);
        expect(clusters[1]).toBe(efCluster);
    });
    it("(f) is deterministic across repeated calls", () => {
        const { blocks, blockEmb, excludeHashes } = fixture();
        const c1 = cosineClusters(blocks, blockEmb, excludeHashes, cfg);
        const c2 = cosineClusters(blocks, blockEmb, excludeHashes, cfg);
        expect(c1.length).toBe(c2.length);
        for (let i = 0; i < c1.length; i++) {
            expect(c1[i].Kind).toBe(c2[i].Kind);
            expect(c1[i].Similarity).toBe(c2[i].Similarity);
            expect(c1[i].Locations).toEqual(c2[i].Locations);
        }
    });
});
@@ -0,0 +1,85 @@
1
+ // Ported from internal/dedup/analyzer/distinctive.go.
2
+ //
3
+ // Distinctive-heading-token filtering for L2 promotion: a token is "distinctive"
4
+ // when it is rare across the corpus of headings. Two headings sharing a
5
+ // distinctive token get promoted from MAYBE to HIGH, so the frequency threshold
6
+ // and tokenization are behavior-load-bearing and pinned to the Go output.
7
/**
 * Tokenizer pattern for headings: matches maximal runs of ASCII letters and
 * CJK characters (Unified Ideographs block, U+4E00-U+9FFF).
 *
 * Mirrors the Go headingTokenRE = `[A-Za-z\x{4E00}-\x{9FFF}]+` (Python POC's
 * _HEADING_TOKEN_RE = re.compile(r"[A-Za-z一-鿿]+")). The `u` flag makes the
 * 一-鿿 range a single code-point class, matching Go's rune semantics.
 *
 * The `g` flag is what makes String.prototype.match return every run rather
 * than only the first; that is how headingTokensOf uses this pattern.
 */
const headingTokenRE = /[A-Za-z一-鿿]+/gu;
14
/**
 * buildDistinctiveFilter returns the set of tokens that count as "distinctive"
 * for the supplied headings.
 *
 * Each heading contributes at most one count per token (headingTokensOf
 * returns a set, which already excludes stopwords and tokens shorter than
 * minTokenLen). A token is kept when its count is
 *   <= max(distinctive_abs_min, trunc(distinctive_pct_of_headings * number_of_headings)).
 * An empty heading list yields an empty set.
 *
 * @param headings iterable of heading strings
 * @param cfg reads universal_stopwords, distinctive_abs_min, distinctive_pct_of_headings
 * @param minTokenLen minimum rune length (default 3)
 */
export function buildDistinctiveFilter(headings, cfg, minTokenLen = 3) {
    const stopSet = new Set(cfg.universal_stopwords);
    const counts = new Map();
    let headingCount = 0;
    for (const heading of headings) {
        headingCount += 1;
        for (const tok of headingTokensOf(heading, minTokenLen, stopSet)) {
            const soFar = counts.get(tok) ?? 0;
            counts.set(tok, soFar + 1);
        }
    }
    if (headingCount === 0) {
        return new Set();
    }
    const absLimit = cfg.distinctive_abs_min;
    // Math.trunc reproduces Go's int(float64(total) * pct), which truncates toward zero.
    const pctLimit = Math.trunc(headingCount * cfg.distinctive_pct_of_headings);
    const limit = pctLimit > absLimit ? pctLimit : absLimit;
    const rare = [...counts]
        .filter(([, count]) => count <= limit)
        .map(([tok]) => tok);
    return new Set(rare);
}
51
/**
 * distinctiveTokensOf tokenizes `heading` (same stopword and length rules as
 * the filter builder) and keeps only the tokens that are members of
 * `distinctiveFilter`.
 *
 * @param heading heading text to tokenize
 * @param distinctiveFilter set of distinctive tokens; only `.has` is used
 * @param cfg reads universal_stopwords
 * @param minTokenLen minimum rune length (default 3)
 * @returns a new Set, in tokenization order
 */
export function distinctiveTokensOf(heading, distinctiveFilter, cfg, minTokenLen = 3) {
    const stopSet = new Set(cfg.universal_stopwords);
    const candidates = headingTokensOf(heading, minTokenLen, stopSet);
    return new Set([...candidates].filter((tok) => distinctiveFilter.has(tok)));
}
67
/**
 * headingTokensOf turns a heading into its set of tokens: the heading is
 * lower-cased, split into [A-Za-z一-鿿]+ runs, and a run is discarded when it is
 * shorter than minLen (counted in runes) or is a stopword.
 *
 * @returns a Set of the surviving runs, in order of first appearance
 */
function headingTokensOf(heading, minLen, stopwords) {
    const kept = new Set();
    const runs = heading.toLowerCase().match(headingTokenRE);
    if (runs === null) {
        return kept;
    }
    for (const run of runs) {
        // Array.from iterates by code point, matching Go's len([]rune(t))
        // rather than the UTF-16 length.
        const runeCount = Array.from(run).length;
        if (runeCount < minLen || stopwords.has(run)) {
            continue;
        }
        kept.add(run);
    }
    return kept;
}
@@ -0,0 +1,49 @@
1
+ import { describe, it, expect } from "vitest";
2
+ import { defaultConfig } from "../dedupcfg/index.js";
3
+ import { buildDistinctiveFilter, distinctiveTokensOf } from "./distinctive.js";
4
// Two headings that share a rare token are promoted from MAYBE to HIGH (L2
// promotion). Which tokens qualify is decided by three gates — the frequency
// threshold, the stopword drop and the minimum-length drop — so a drift in any
// of them would over- or under-promote pairs. One test per gate, each stating
// WHY the gate exists.
describe("buildDistinctiveFilter", () => {
    const cfg = defaultConfig().Analyzer;
    const filterFor = (headings) => buildDistinctiveFilter(headings, cfg);
    it("returns rare tokens (freq <= threshold)", () => {
        // 5 headings -> threshold = max(3, trunc(0.03*5)) = max(3, 0) = 3.
        // "lifecycle" appears once -> distinctive.
        const filter = filterFor([
            "Order Lifecycle",
            "Order Management",
            "Payment Processing",
            "Payment Gateway",
            "Authentication Flow",
        ]);
        expect(filter.size).toBeGreaterThan(0);
        expect(filter.has("lifecycle")).toBe(true);
    });
    it("drops universal stopwords", () => {
        // WHY: stopwords are common-by-design; counting them would falsely promote
        // pairs that merely both say "the".
        const filter = filterFor(["The Overview", "A Summary", "An Introduction"]);
        ["the", "a", "an"].forEach((stop) => {
            expect(filter.has(stop)).toBe(false);
        });
    });
    it("drops tokens shorter than the min token length (3)", () => {
        // WHY: 2-char tokens like "go" are too generic to be a reliable signal.
        const filter = filterFor(["Go API", "Go CLI", "Go SDK"]);
        expect(filter.has("go")).toBe(false);
    });
});
40
describe("distinctiveTokensOf", () => {
    const cfg = defaultConfig().Analyzer;
    it("returns only the heading tokens that are in the distinctive filter", () => {
        const corpus = ["Order Lifecycle", "Payment Processing", "Order Management"];
        const distinctive = buildDistinctiveFilter(corpus, cfg);
        // "lifecycle" is distinctive (freq=1 <= threshold=3) and present in heading.
        const found = distinctiveTokensOf("Order Lifecycle", distinctive, cfg).has("lifecycle");
        expect(found).toBe(true);
    });
});
@@ -0,0 +1,63 @@
1
+ // Ported from internal/dedup/analyzer/exact_clusters.go.
2
+ //
3
+ // L5-exact clustering: buckets blocks by ContentHash and emits a Cluster for
4
+ // every hash that appears verbatim across >=2 distinct files. Blocks duplicated
5
+ // only within one file are skipped (mirrors L3's same-file skip).
6
+ import { cmpStr, cmpNum } from "./order.js";
7
/**
 * exactClusters groups blocks by ContentHash and emits one Cluster for every
 * hash whose blocks come from at least two different FilePaths. A hash that
 * occurs in a single file only, however many times, produces nothing.
 *
 * Each emitted Cluster has:
 *   - Exact: true, Similarity: 1.0, Informational: false
 *   - ContentHash from the group; Kind from the first block seen with that hash
 *   - Locations sorted by (FilePath, StartLine)
 *
 * The returned array is sorted by ContentHash for deterministic output.
 */
export function exactClusters(blocks) {
    // Pass 1: ContentHash -> { kind, locations[] } in input order.
    const byHash = new Map();
    for (const blk of blocks) {
        const hash = blk.ContentHash;
        if (!byHash.has(hash)) {
            byHash.set(hash, { kind: blk.Kind, locations: [] });
        }
        byHash.get(hash).locations.push({
            FilePath: blk.FilePath,
            Heading: blk.Heading,
            StartLine: blk.StartLine,
            EndLine: blk.EndLine,
        });
    }
    // Pass 2: keep only the groups that span two or more files.
    const byFileThenLine = (x, y) => {
        const fileOrder = cmpStr(x.FilePath, y.FilePath);
        return fileOrder !== 0 ? fileOrder : cmpNum(x.StartLine, y.StartLine);
    };
    const found = [];
    byHash.forEach((group, hash) => {
        const files = new Set(group.locations.map((loc) => loc.FilePath));
        if (files.size < 2) {
            return;
        }
        // Sort a copy so the group's own array keeps its input order.
        const ordered = [...group.locations].sort(byFileThenLine);
        found.push({
            Kind: group.kind,
            ContentHash: hash,
            Similarity: 1.0,
            Exact: true,
            Informational: false,
            Locations: ordered,
        });
    });
    found.sort((x, y) => cmpStr(x.ContentHash, y.ContentHash));
    return found;
}
@@ -0,0 +1,81 @@
1
+ import { describe, it, expect } from "vitest";
2
+ import { exactClusters } from "./exact_clusters.js";
3
/**
 * Test fixture: assembles a BlockRecord-shaped object for the exactClusters
 * tests. Only the six arguments vary; SectionID, Index, Text and TableRows
 * are fixed placeholder values.
 */
function makeBlock(filePath, heading, contentHash, kind, startLine, endLine) {
    const record = {
        SectionID: "",
        FilePath: filePath,
        Heading: heading,
        Index: 0,
        Kind: kind,
        StartLine: startLine,
        EndLine: endLine,
        ContentHash: contentHash,
        Text: "",
        TableRows: 0,
    };
    return record;
}
18
// Contract under test for the L5-exact pass:
//   - a block repeated VERBATIM in >=2 files is reported as a cluster;
//   - a block repeated only inside a single file is not (that is intra-file
//     structure, not cross-doc duplication);
//   - output order is deterministic (clusters by ContentHash, locations by
//     FilePath then StartLine), since the report and the dedup index consume
//     that order.
describe("exactClusters", () => {
    it("clusters cross-file verbatim blocks, skips same-file-only duplicates, and sorts deterministically", () => {
        const hashes = {
            prose: "prose-hash-aaaa",
            table: "table-hash-bbbb",
            sameFile: "same-file-hash-c",
        };
        const input = [
            // Prose shared by docs/a.md and docs/b.md.
            makeBlock("docs/a.md", "Overview", hashes.prose, "prose", 5, 10),
            makeBlock("docs/b.md", "Summary", hashes.prose, "prose", 12, 17),
            // Prose repeated inside docs/a.md only; must not become a cluster.
            makeBlock("docs/a.md", "Details", hashes.sameFile, "prose", 20, 25),
            makeBlock("docs/a.md", "Appendix", hashes.sameFile, "prose", 30, 35),
            // Table shared by docs/a.md and docs/c.md.
            makeBlock("docs/a.md", "Data", hashes.table, "table", 40, 50),
            makeBlock("docs/c.md", "Data", hashes.table, "table", 3, 13),
        ];

        const result = exactClusters(input);

        // The same-file-only hash is absent, leaving exactly prose + table.
        expect(result.some((cluster) => cluster.ContentHash === hashes.sameFile)).toBe(false);
        expect(result).toHaveLength(2);

        // Ordered by ContentHash: "prose-…" sorts before "table-…".
        const [proseCluster, tableCluster] = result;
        expect(proseCluster.ContentHash).toBe(hashes.prose);
        expect(tableCluster.ContentHash).toBe(hashes.table);

        // Prose cluster: fixed flags plus both locations, a.md:5 ahead of b.md:12.
        expect(proseCluster.Exact).toBe(true);
        expect(proseCluster.Similarity).toBe(1.0);
        expect(proseCluster.Kind).toBe("prose");
        expect(proseCluster.Informational).toBe(false);
        expect(proseCluster.Locations).toHaveLength(2);
        expect(proseCluster.Locations[0]).toMatchObject({ FilePath: "docs/a.md", StartLine: 5 });
        expect(proseCluster.Locations[1]).toMatchObject({ FilePath: "docs/b.md", StartLine: 12 });

        // Table cluster: a.md ahead of c.md.
        expect(tableCluster.Exact).toBe(true);
        expect(tableCluster.Kind).toBe("table");
        expect(tableCluster.Locations).toHaveLength(2);
        expect(tableCluster.Locations[0].FilePath).toBe("docs/a.md");
        expect(tableCluster.Locations[1].FilePath).toBe("docs/c.md");
    });

    it("emits all locations of a {A,A,B} bucket (2 distinct files qualifies)", () => {
        const sharedHash = "shared-hash-xxxx";
        const result = exactClusters([
            makeBlock("docs/a.md", "S1", sharedHash, "prose", 1, 5),
            makeBlock("docs/a.md", "S2", sharedHash, "prose", 10, 14),
            makeBlock("docs/b.md", "S3", sharedHash, "prose", 2, 6),
        ]);

        expect(result).toHaveLength(1);
        const locations = result[0].Locations;
        expect(locations).toHaveLength(3);
        // Expected order: a.md:1, a.md:10, then b.md:2.
        expect(locations[0]).toMatchObject({ FilePath: "docs/a.md", StartLine: 1 });
        expect(locations[1]).toMatchObject({ FilePath: "docs/a.md", StartLine: 10 });
        expect(locations[2].FilePath).toBe("docs/b.md");
    });
});
@@ -0,0 +1,14 @@
1
+ // Barrel for the analyzer package: the layered duplicate-detection algorithm.
2
+ //
3
+ // The public surface is the `analyze` orchestrator plus the canonical-selection,
4
+ // distinctive-token, differentiator, and preview primitives it composes (which the
5
+ // Go original exports for white-box tests and for reuse by the dedup facade), along with the clustering, multiplicity, and partial-overlap helpers re-exported below.
6
+ export { analyze } from "./analyzer.js";
7
+ export { canonicalRank, isDisqualified, lessRank, pathPriorityRank, } from "./canonical.js";
8
+ export { buildDistinctiveFilter, distinctiveTokensOf } from "./distinctive.js";
9
+ export { findDifferentiators } from "./safety.js";
10
+ export { computePreview } from "./preview.js";
11
+ export { exactClusters } from "./exact_clusters.js";
12
+ export { cosineClusters } from "./cosine_clusters.js";
13
+ export { applyMultiplicity, suppressKnownGroups } from "./multiplicity.js";
14
+ export { partialOverlaps } from "./partial_overlaps.js";