docsgov 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (159) hide show
  1. package/README.md +242 -0
  2. package/dist/apispec/apispec.js +401 -0
  3. package/dist/apispec/apispec.test.js +444 -0
  4. package/dist/apispec/errors.js +17 -0
  5. package/dist/apispec/index.js +2 -0
  6. package/dist/check/doclinks.js +167 -0
  7. package/dist/check/index.js +8 -0
  8. package/dist/check/run.js +391 -0
  9. package/dist/check/run.test.js +513 -0
  10. package/dist/check/suggest.js +134 -0
  11. package/dist/check/suggest.test.js +92 -0
  12. package/dist/check/tokens.js +125 -0
  13. package/dist/cmd/main.js +330 -0
  14. package/dist/cmd/main.test.js +422 -0
  15. package/dist/codeq/cache.js +71 -0
  16. package/dist/codeq/cache.test.js +67 -0
  17. package/dist/codeq/errors.js +52 -0
  18. package/dist/codeq/grammars/tree-sitter-go.wasm +0 -0
  19. package/dist/codeq/grammars/tree-sitter-java.wasm +0 -0
  20. package/dist/codeq/grammars/tree-sitter-javascript.wasm +0 -0
  21. package/dist/codeq/grammars/tree-sitter-tsx.wasm +0 -0
  22. package/dist/codeq/grammars/tree-sitter-typescript.wasm +0 -0
  23. package/dist/codeq/index.js +11 -0
  24. package/dist/codeq/resolve.test.js +109 -0
  25. package/dist/codeq/resolver.js +128 -0
  26. package/dist/codeq/resolver.test.js +124 -0
  27. package/dist/codeq/resolvers/go.js +242 -0
  28. package/dist/codeq/resolvers/go.test.js +143 -0
  29. package/dist/codeq/resolvers/java.js +349 -0
  30. package/dist/codeq/resolvers/java.test.js +138 -0
  31. package/dist/codeq/resolvers/java_queries.js +63 -0
  32. package/dist/codeq/resolvers/javascript.js +412 -0
  33. package/dist/codeq/resolvers/javascript.test.js +125 -0
  34. package/dist/codeq/resolvers/javascript_queries.js +46 -0
  35. package/dist/codeq/resolvers/typescript.js +366 -0
  36. package/dist/codeq/resolvers/typescript.test.js +180 -0
  37. package/dist/codeq/resolvers/typescript_queries.js +78 -0
  38. package/dist/codeq/signature.js +50 -0
  39. package/dist/codeq/signature.test.js +50 -0
  40. package/dist/codeq/suggest.js +96 -0
  41. package/dist/codeq/treesitter.js +122 -0
  42. package/dist/codeq/treesitter.test.js +118 -0
  43. package/dist/config/config.js +74 -0
  44. package/dist/config/config.test.js +98 -0
  45. package/dist/config/fs.js +116 -0
  46. package/dist/config/glob.js +82 -0
  47. package/dist/config/glob.test.js +61 -0
  48. package/dist/config/index.js +4 -0
  49. package/dist/dedup/analyzer/analyzer.js +533 -0
  50. package/dist/dedup/analyzer/analyzer.test.js +530 -0
  51. package/dist/dedup/analyzer/canonical.js +74 -0
  52. package/dist/dedup/analyzer/canonical.test.js +70 -0
  53. package/dist/dedup/analyzer/cosine_clusters.js +169 -0
  54. package/dist/dedup/analyzer/cosine_clusters.test.js +131 -0
  55. package/dist/dedup/analyzer/distinctive.js +85 -0
  56. package/dist/dedup/analyzer/distinctive.test.js +49 -0
  57. package/dist/dedup/analyzer/exact_clusters.js +63 -0
  58. package/dist/dedup/analyzer/exact_clusters.test.js +81 -0
  59. package/dist/dedup/analyzer/index.js +14 -0
  60. package/dist/dedup/analyzer/multiplicity.js +110 -0
  61. package/dist/dedup/analyzer/multiplicity.test.js +123 -0
  62. package/dist/dedup/analyzer/order.js +22 -0
  63. package/dist/dedup/analyzer/partial_overlaps.js +65 -0
  64. package/dist/dedup/analyzer/partial_overlaps.test.js +161 -0
  65. package/dist/dedup/analyzer/preview.js +84 -0
  66. package/dist/dedup/analyzer/preview.test.js +46 -0
  67. package/dist/dedup/analyzer/safety.js +27 -0
  68. package/dist/dedup/analyzer/safety.test.js +39 -0
  69. package/dist/dedup/config.js +18 -0
  70. package/dist/dedup/configload.js +299 -0
  71. package/dist/dedup/configload.test.js +410 -0
  72. package/dist/dedup/dedup.index.test.js +203 -0
  73. package/dist/dedup/dedup.js +143 -0
  74. package/dist/dedup/dedup.test.js +212 -0
  75. package/dist/dedup/dedupcfg/config.js +112 -0
  76. package/dist/dedup/dedupcfg/config.test.js +70 -0
  77. package/dist/dedup/dedupcfg/index.js +1 -0
  78. package/dist/dedup/deduptypes/index.js +1 -0
  79. package/dist/dedup/deduptypes/types.js +9 -0
  80. package/dist/dedup/deduptypes/types.test.js +34 -0
  81. package/dist/dedup/embedder/cache.js +23 -0
  82. package/dist/dedup/embedder/cache.test.js +50 -0
  83. package/dist/dedup/embedder/constants.js +10 -0
  84. package/dist/dedup/embedder/embedder.js +76 -0
  85. package/dist/dedup/embedder/embedder.mock.test.js +128 -0
  86. package/dist/dedup/embedder/embedder.test.js +96 -0
  87. package/dist/dedup/embedder/errors.js +20 -0
  88. package/dist/dedup/embedder/errors.test.js +35 -0
  89. package/dist/dedup/embedder/index.js +4 -0
  90. package/dist/dedup/embedder/session.js +78 -0
  91. package/dist/dedup/embedder/session.test.js +172 -0
  92. package/dist/dedup/gitignore.js +97 -0
  93. package/dist/dedup/gitignore.test.js +98 -0
  94. package/dist/dedup/index.js +11 -0
  95. package/dist/dedup/indexdb/errors.js +48 -0
  96. package/dist/dedup/indexdb/index.js +6 -0
  97. package/dist/dedup/indexdb/indexdb.js +302 -0
  98. package/dist/dedup/indexdb/indexdb.test.js +739 -0
  99. package/dist/dedup/indexdb/load.js +110 -0
  100. package/dist/dedup/indexdb/migrations.js +58 -0
  101. package/dist/dedup/indexdb/schema.js +83 -0
  102. package/dist/dedup/indexer/index.js +9 -0
  103. package/dist/dedup/indexer/indexer.js +501 -0
  104. package/dist/dedup/indexer/indexer.test.js +510 -0
  105. package/dist/dedup/indexer/links.js +89 -0
  106. package/dist/dedup/mdsection/anchor.js +60 -0
  107. package/dist/dedup/mdsection/anchor.test.js +39 -0
  108. package/dist/dedup/mdsection/blocks.js +409 -0
  109. package/dist/dedup/mdsection/blocks.test.js +359 -0
  110. package/dist/dedup/mdsection/index.js +4 -0
  111. package/dist/dedup/mdsection/parse.js +21 -0
  112. package/dist/dedup/mdsection/section.js +234 -0
  113. package/dist/dedup/mdsection/section.test.js +221 -0
  114. package/dist/dedup/report/floatfmt.js +71 -0
  115. package/dist/dedup/report/floatfmt.test.js +42 -0
  116. package/dist/dedup/report/index.js +8 -0
  117. package/dist/dedup/report/quote.js +77 -0
  118. package/dist/dedup/report/quote.test.js +67 -0
  119. package/dist/dedup/report/text.js +251 -0
  120. package/dist/dedup/report/text.test.js +420 -0
  121. package/dist/dedup/report_types.js +8 -0
  122. package/dist/dedup/sectionid/index.js +1 -0
  123. package/dist/dedup/sectionid/sectionid.js +16 -0
  124. package/dist/dedup/sectionid/sectionid.test.js +49 -0
  125. package/dist/guard/api/errors.js +12 -0
  126. package/dist/guard/api/index.js +2 -0
  127. package/dist/guard/api/parser.js +81 -0
  128. package/dist/guard/api/parser.test.js +58 -0
  129. package/dist/guard/api/types.js +1 -0
  130. package/dist/guard/code/errors.js +16 -0
  131. package/dist/guard/code/index.js +2 -0
  132. package/dist/guard/code/parser.js +54 -0
  133. package/dist/guard/code/parser.test.js +111 -0
  134. package/dist/guard/code/types.js +6 -0
  135. package/dist/index.js +1 -0
  136. package/dist/index.test.js +5 -0
  137. package/dist/repo/boundary.js +92 -0
  138. package/dist/repo/boundary.test.js +65 -0
  139. package/dist/repo/errors.js +56 -0
  140. package/dist/repo/errors.test.js +85 -0
  141. package/dist/repo/exists.test.js +72 -0
  142. package/dist/repo/filename.js +46 -0
  143. package/dist/repo/filename.test.js +39 -0
  144. package/dist/repo/fs.js +53 -0
  145. package/dist/repo/index.js +7 -0
  146. package/dist/repo/overlay.js +36 -0
  147. package/dist/repo/overlay.test.js +80 -0
  148. package/dist/repo/repo.js +353 -0
  149. package/dist/repo/repo.test.js +255 -0
  150. package/dist/repo/testutil.js +27 -0
  151. package/dist/repo/write.test.js +125 -0
  152. package/dist/report/color.js +73 -0
  153. package/dist/report/index.js +1 -0
  154. package/dist/report/report.js +112 -0
  155. package/dist/report/report.test.js +368 -0
  156. package/dist/violation/index.js +1 -0
  157. package/dist/violation/types.js +22 -0
  158. package/dist/violation/types.test.js +70 -0
  159. package/package.json +48 -0
@@ -0,0 +1,251 @@
1
+ // Renders a dedup Report as plain text. Ported from
2
+ // internal/dedup/report/text.go.
3
+ //
4
+ // Render returns the structured Report as a string. The Go signature was
5
+ // Render(w io.Writer, r, cfg) error, accumulating the first write error; here
6
+ // we build the output in memory and return it (matching the existing TS report
7
+ // port in ../../report, which also returns a string). The only Go caller wrote
8
+ // to os.Stdout, which the TS CLI does as process.stdout.write(Render(...)). The
9
+ // in-memory builder never fails, so there is no error return.
10
+ //
11
+ // The output format is byte-matched to the Go binary's output (and ultimately
12
+ // the Python POC's dedup.py output): the golden-test oracle compares it
13
+ // byte-for-byte. No color/styling is involved — this is plain text, so there is
14
+ // no TTY-dependent path to guard.
15
+ import { QuoteHeading } from "./quote.js";
16
+ import { formatFloat2 } from "./floatfmt.js";
17
/**
 * Render returns the duplicate-detection report as plain text. Mirrors Go's
 * report.Render, except that the output is built in memory and returned.
 *
 * Sections are written in a fixed order: high-confidence groups, possible
 * pairs, then partial overlaps. The configured separator (followed by a blank
 * line) is written between consecutive stanzas and between two non-empty
 * sections.
 *
 * @param r   Report carrying the HighGroups, MaybePairs and PartialOverlaps
 *            lists. A list that is missing or null is treated as empty, so a
 *            report with an unset field renders the same as one with [].
 * @param cfg Formatting config. `separator` is read here; the whole object is
 *            handed to the Printer, which reads `wrap_cols` for previews.
 * @returns   The rendered text. When all three lists are empty this is the
 *            single line "No duplicate documentation concepts found.\n".
 */
export function Render(r, cfg) {
    // Normalise absent lists up front so every `.length` / index access below
    // is safe; well-formed reports pass through untouched.
    const highGroups = r.HighGroups ?? [];
    const maybePairs = r.MaybePairs ?? [];
    const partialOverlaps = r.PartialOverlaps ?? [];
    if (highGroups.length === 0 &&
        maybePairs.length === 0 &&
        partialOverlaps.length === 0) {
        return "No duplicate documentation concepts found.\n";
    }
    const p = new Printer(cfg);
    // Header. The group count is always printed (even when it is 0); the pairs
    // count only when there is at least one pair.
    p.printf(`Duplicate documentation concepts found: ${highGroups.length}\n`);
    if (maybePairs.length > 0) {
        p.printf(`Possible pairs needing review: ${maybePairs.length}\n`);
    }
    p.print("\n");
    const sep = cfg.separator + "\n\n";
    // High-confidence groups.
    for (let gi = 0; gi < highGroups.length; gi++) {
        if (gi > 0) {
            p.print(sep);
        }
        p.renderGroup(gi + 1, highGroups[gi]);
    }
    // Possible pairs.
    if (maybePairs.length > 0) {
        if (highGroups.length > 0) {
            p.print(sep);
        }
        p.print("Possible pairs (manual review)\n\n");
        for (let pi = 0; pi < maybePairs.length; pi++) {
            if (pi > 0) {
                p.print(sep);
            }
            p.renderPair(pi + 1, maybePairs[pi]);
        }
    }
    // Partial overlaps (L5 block-level findings).
    if (partialOverlaps.length > 0) {
        if (highGroups.length > 0 || maybePairs.length > 0) {
            p.print(sep);
        }
        p.print("Partial overlaps (manual review)\n\n");
        for (let ci = 0; ci < partialOverlaps.length; ci++) {
            if (ci > 0) {
                p.print(sep);
            }
            p.renderCluster(ci + 1, partialOverlaps[ci]);
        }
    }
    return p.out;
}
70
// Printer is the in-memory sink for the rendered report: each render* method
// appends its stanza to `out`, and the caller reads `out` when it is done.
// (Go: printer over an io.Writer.)
//
// NOTE(review): some inline notes below mention fixed indent widths (e.g. "14
// spaces") while the literals in this view carry a single leading space —
// confirm the intended widths against the golden output before editing any
// literal.
class Printer {
    out = "";
    cfg;
    constructor(cfg) {
        this.cfg = cfg;
    }
    // print appends s verbatim.
    print(s) {
        this.out += s;
    }
    // printf receives an already-interpolated string, so it appends exactly
    // like print.
    printf(s) {
        this.print(s);
    }
    // renderGroup writes one high-confidence group: title, confidence/action,
    // the canonical candidate, every duplicate member, and the suggested
    // replacement that points back at the canonical section.
    renderGroup(n, grp) {
        const canon = grp.Canonical;
        this.printf(`[Group ${n}] ${canon.Heading}\n`);
        this.printf(`Confidence: ${grp.Confidence}\n`);
        this.printf(`Recommended action: ${grp.Action}\n\n`);
        this.print("Canonical candidate:\n");
        this.printf(` ${canon.FilePath}#${canon.Anchor}\n`);
        this.printf(` (lines ${canon.StartLine}-${canon.EndLine})\n`);
        // The inbound-links line only appears for a positive count.
        if (canon.InboundCount > 0) {
            this.printf(` inbound links: ${canon.InboundCount}\n`);
        }
        this.emitPreview(canon.Preview, " ");
        this.print("\n");
        this.print("Duplicate sections:\n");
        for (const [idx, member] of grp.Members.entries()) {
            this.printf(` ${idx + 1}. ${member.FilePath}#${member.Anchor}\n`);
            this.printf(` Heading: ${member.Heading}\n`);
            this.printf(` Similarity: ${formatFloat2(member.Similarity)}\n`);
            if (member.ExactMatch) {
                this.print(" (prose is byte-identical to canonical after normalization)\n");
            }
            this.print(" Reason:\n");
            for (const why of member.Reasons) {
                this.printf(` - ${why}\n`);
            }
            this.emitPreview(member.Preview, " ");
        }
        this.print("\n");
        this.print("Suggested replacement:\n");
        this.printf(` Keep the canonical section in ${canon.FilePath}.\n`);
        this.print(" Replace duplicated sections with short summaries and links to:\n");
        this.printf(` ${canon.FilePath}#${canon.Anchor}\n\n`);
    }
    // renderPair writes one "maybe" pair: the similarity, then the canonical
    // and candidate sides (location, quoted heading, preview), then reasons.
    renderPair(n, pair) {
        const { Canonical: canon, Candidate: cand } = pair;
        this.printf(`[Pair ${n}] similarity ${formatFloat2(pair.Similarity)}\n`);
        // Canonical side; the heading goes through QuoteHeading.
        this.printf(` canonical? ${canon.FilePath}#${canon.Anchor}\n`);
        this.printf(` (${QuoteHeading(canon.Heading)})\n`);
        this.emitPreview(canon.Preview, " ");
        // Candidate side, same layout.
        this.printf(` candidate ${cand.FilePath}#${cand.Anchor}\n`);
        this.printf(` (${QuoteHeading(cand.Heading)})\n`);
        this.emitPreview(cand.Preview, " ");
        for (const why of pair.Reasons) {
            this.printf(` - ${why}\n`);
        }
        this.print("\n");
    }
    // renderCluster writes one partial-overlap cluster: a header that says
    // either "identical (verbatim)" or the cosine value, an optional
    // informational tag, and one location + heading pair per occurrence.
    renderCluster(n, cl) {
        const verdict = cl.Exact
            ? "identical (verbatim)"
            : `cosine=${formatFloat2(cl.Similarity)}`;
        this.printf(`[Cluster ${n}] ${cl.Kind} ${verdict}\n`);
        if (cl.Informational) {
            // Count distinct files, not locations: one file may appear twice.
            const distinctFiles = new Set(cl.Locations.map((loc) => loc.FilePath));
            this.printf(` appears in ${distinctFiles.size} files — likely intentional; consider a shared snippet\n`);
        }
        for (const where of cl.Locations) {
            this.printf(` ${where.FilePath} (lines ${where.StartLine}-${where.EndLine})\n`);
            this.printf(` Heading: ${where.Heading}\n`);
        }
        this.print("\n");
    }
    // emitPreview writes the preview, word-wrapped at cfg.wrap_cols. The first
    // line starts with indent + "Preview: "; continuation lines start with
    // indent + " ". An empty preview yields the "(no prose content)" line.
    emitPreview(preview, indent) {
        if (preview === "") {
            this.printf(`${indent}Preview: (no prose content)\n`);
            return;
        }
        const firstPrefix = indent + "Preview: ";
        const restPrefix = indent + " ";
        for (const wrapped of wordWrap(preview, this.cfg.wrap_cols, firstPrefix, restPrefix)) {
            this.printf(`${wrapped}\n`);
        }
    }
}
171
// wordWrap greedily packs the whitespace-separated words of text into lines of
// at most `width` columns and returns those lines (without trailing newlines).
// The first line starts with initialIndent, every later line with
// subseqIndent; both count toward the width. Intended to match Python's
// textwrap.wrap and the Go port's behaviour.
//
// Width is measured in UTF-8 BYTES (via byteLen), as Go's len() does, not in
// code points or UTF-16 units, so multi-byte text breaks at the same places.
// A word is never split: a line always takes at least one word, even if that
// word alone exceeds the width. Text with no words yields an empty array.
export function wordWrap(text, width, initialIndent, subseqIndent) {
    const tokens = fields(text);
    if (tokens.length === 0) {
        return [];
    }
    const wrapped = [];
    let indentBytes = byteLen(initialIndent);
    let line = initialIndent;
    let lineBytes = indentBytes;
    tokens.forEach((token, idx) => {
        const tokenBytes = byteLen(token);
        // A line "has content" once anything beyond its indent was appended.
        const hasContent = lineBytes > indentBytes;
        // Every word but the very first is costed with one separating space.
        const needed = idx > 0 ? tokenBytes + 1 : tokenBytes;
        if (hasContent && lineBytes + needed > width) {
            // Word does not fit: close this line, open the next with the word.
            wrapped.push(line);
            indentBytes = byteLen(subseqIndent);
            line = subseqIndent + token;
            lineBytes = indentBytes + tokenBytes;
            return;
        }
        if (hasContent) {
            line += " ";
            lineBytes += 1;
        }
        line += token;
        lineBytes += tokenBytes;
    });
    if (line.length > 0) {
        wrapped.push(line);
    }
    return wrapped;
}
216
// byteLen reports how many bytes s occupies when encoded as UTF-8 — the same
// number Go's len(string) yields for the equivalent string.
function byteLen(s) {
    const utf8Bytes = Buffer.byteLength(s, "utf8");
    return utf8Bytes;
}
220
// goSpace holds the code points Go's unicode.IsSpace accepts. JS's regex \s is
// not a drop-in substitute: \s lacks U+0085 (NEL) and includes U+FEFF
// (ZWNBSP/BOM). Using this set keeps tokenization aligned with Go's
// strings.Fields.
const goSpace = new Set([
    // ASCII: tab, LF, VT, FF, CR, space.
    0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x20,
    // Latin-1: NEL and no-break space.
    0x85, 0xa0,
    // Ogham space mark.
    0x1680,
    // U+2000..U+200A (en quad through hair space).
    0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005,
    0x2006, 0x2007, 0x2008, 0x2009, 0x200a,
    // Line/paragraph separators, narrow NBSP, medium math space, ideographic space.
    0x2028, 0x2029, 0x202f, 0x205f, 0x3000,
]);
// fields returns the maximal runs of non-goSpace characters in s, in order,
// never producing an empty token (the contract of Go's strings.Fields). The
// string is walked by code point, so a surrogate pair is looked up as one
// character rather than as two UTF-16 halves.
function fields(s) {
    const tokens = [];
    let pending = "";
    for (const glyph of s) {
        if (!goSpace.has(glyph.codePointAt(0))) {
            pending += glyph;
            continue;
        }
        // Separator: flush the word collected so far, if any.
        if (pending.length > 0) {
            tokens.push(pending);
            pending = "";
        }
    }
    if (pending.length > 0) {
        tokens.push(pending);
    }
    return tokens;
}
@@ -0,0 +1,420 @@
1
+ // Behavior-encoding tests for the dedup report renderer, ported from
2
+ // internal/dedup/report/text_test.go (and quote.go's locked rules).
3
+ //
4
+ // WHY this output is pinned so tightly: Render's text is compared BYTE-FOR-BYTE
5
+ // by the dedup golden-test oracle against the Go binary's output. A drift in
6
+ // section ordering, separators, indentation, the %.2f similarity rounding, the
7
+ // QuoteHeading repr, or the wrap math would silently break oracle parity. Each
8
+ // test below encodes one such invariant and asserts the exact string, not a
9
+ // loose contains() where the Go test was loose only because Go could not easily
10
+ // assert the whole stanza.
11
+ import { describe, it, expect } from "vitest";
12
+ import { Render, QuoteHeading } from "./index.js";
13
+ import { Default } from "../config.js";
14
+ const cfg = Default().Report;
15
// emptyReport builds a fresh Report whose three lists are all empty — the
// counterpart of Go's zero value &dedup.Report{} — used to drive the
// "no findings" path.
function emptyReport() {
    const report = {
        HighGroups: [],
        MaybePairs: [],
        PartialOverlaps: [],
    };
    return report;
}
20
// makeSimpleMember builds a section fixture (same name as the Go fixture). The
// caller chooses id, file path, anchor, heading and the exact-match flag;
// everything else is fixed: lines 1-10, no inbound links, similarity 0.95, one
// cosine reason and a one-sentence preview.
function makeSimpleMember(id, fp, anchor, heading, exact) {
    const member = {};
    member.SectionID = id;
    member.FilePath = fp;
    member.Anchor = anchor;
    member.Heading = heading;
    member.StartLine = 1;
    member.EndLine = 10;
    member.InboundCount = 0;
    member.Similarity = 0.95;
    member.Reasons = ["high semantic similarity (cosine=0.950)"];
    member.ExactMatch = exact;
    member.Preview = "This is the preview content of the section.";
    return member;
}
36
// makeSimpleHighReport builds a Report with exactly one high-confidence group:
// a canonical section (similarity 1.0) plus a single exact-match duplicate.
// No pairs and no partial overlaps.
function makeSimpleHighReport() {
    const canonical = {
        ...makeSimpleMember("id1", "docs/concepts/a.md", "lifecycle", "Order Lifecycle", false),
        Similarity: 1.0,
    };
    const duplicate = makeSimpleMember("id2", "docs/design/b.md", "lifecycle", "Order Lifecycle", true);
    const group = {
        Canonical: canonical,
        Members: [duplicate],
        Confidence: "high",
        Action: "replace_with_reference",
    };
    return { HighGroups: [group], MaybePairs: [], PartialOverlaps: [] };
}
53
// makeSimpleMaybeReport builds a Report with exactly one "maybe" pair: a
// canonical section (similarity 1.0) and a candidate at 0.89, with one cosine
// reason. No high-confidence groups and no partial overlaps.
function makeSimpleMaybeReport() {
    const canonical = {
        ...makeSimpleMember("id1", "docs/concepts/a.md", "lifecycle", "Order Lifecycle", false),
        Similarity: 1.0,
    };
    const candidate = {
        ...makeSimpleMember("id2", "docs/design/b.md", "lifecycle-2", "Order Lifecycle Guide", false),
        Similarity: 0.89,
    };
    const pair = {
        Canonical: canonical,
        Candidate: candidate,
        Similarity: 0.89,
        Reasons: ["possible semantic similarity (cosine=0.890)"],
    };
    return { HighGroups: [], MaybePairs: [pair], PartialOverlaps: [] };
}
71
// makeReportWithGroupsAndPairs builds a Report holding nGroups high-confidence
// groups followed by nPairs "maybe" pairs (no partial overlaps). Section ids
// get a per-index suffix starting at 'a' ('a' + i), as in the Go fixture.
function makeReportWithGroupsAndPairs(nGroups, nPairs) {
    const firstLetter = "a".charCodeAt(0);
    const suffixFor = (index) => String.fromCharCode(firstLetter + index);
    // Overrides the fixture's default similarity and hands the member back.
    const withSimilarity = (member, similarity) => {
        member.Similarity = similarity;
        return member;
    };
    const report = { HighGroups: [], MaybePairs: [], PartialOverlaps: [] };
    for (let g = 0; g < nGroups; g++) {
        const tag = suffixFor(g);
        const canonical = withSimilarity(makeSimpleMember("can" + tag, "docs/concepts/file.md", "section", "Section Heading", false), 1.0);
        const duplicate = makeSimpleMember("mem" + tag, "docs/design/file.md", "section", "Section Heading", false);
        report.HighGroups.push({
            Canonical: canonical,
            Members: [duplicate],
            Confidence: "high",
            Action: "replace_with_reference",
        });
    }
    for (let k = 0; k < nPairs; k++) {
        const tag = suffixFor(k);
        const canonical = withSimilarity(makeSimpleMember("pcan" + tag, "docs/concepts/file.md", "pair-section", "Pair Section", false), 1.0);
        const candidate = withSimilarity(makeSimpleMember("pcand" + tag, "docs/design/file.md", "pair-section", "Pair Section Variant", false), 0.89);
        report.MaybePairs.push({
            Canonical: canonical,
            Candidate: candidate,
            Similarity: 0.89,
            Reasons: ["possible semantic similarity (cosine=0.890)"],
        });
    }
    return report;
}
102
+ // ---------- QuoteHeading ----------
103
+ // WHY: the heading repr is embedded in pair stanzas and is part of the
104
+ // byte-exact output. Each case locks one escaping rule from quote.go's contract.
105
+ describe("QuoteHeading", () => {
106
+ it("ascii with no specials is wrapped in single quotes only", () => {
107
+ // No escapes -> just surrounding quotes; a regression that over-escaped
108
+ // would corrupt every plain heading in the report.
109
+ expect(QuoteHeading("Order Lifecycle")).toBe("'Order Lifecycle'");
110
+ });
111
+ it("escapes an embedded single quote", () => {
112
+ // The wrapping quote is single-quote, so an inner ' must be backslashed or
113
+ // the repr would be unbalanced.
114
+ expect(QuoteHeading("It's a heading")).toBe("'It\\'s a heading'");
115
+ });
116
+ it("escapes backslashes", () => {
117
+ // A literal backslash must double so the repr round-trips like Python's.
118
+ expect(QuoteHeading("Path\\To\\File")).toBe("'Path\\\\To\\\\File'");
119
+ });
120
+ it("escapes control chars below 0x20 (tab -> \\t)", () => {
121
+ // Control chars are escaped so a stray tab cannot disturb the fixed-column
122
+ // layout of the stanza.
123
+ expect(QuoteHeading("Head\ting")).toBe("'Head\\ting'");
124
+ });
125
+ it("escapes DEL (0x7f) as \\x7f", () => {
126
+ // DEL is the one >= 0x20 ASCII char that is still escaped.
127
+ expect(QuoteHeading("Head\x7fing")).toBe("'Head\\x7fing'");
128
+ });
129
+ it("passes CJK through literally, NOT as \\uXXXX", () => {
130
+ // The locked rule (plan §366): non-ASCII bytes are verbatim so CJK docs read
131
+ // naturally. Escaping them would diverge from the Go/POC output byte-for-byte.
132
+ expect(QuoteHeading("補貨單流程")).toBe("'補貨單流程'");
133
+ });
134
+ });
135
+ // ---------- Render: no findings ----------
136
+ describe("Render no findings", () => {
137
+ it("returns the exact single-line no-findings message", () => {
138
+ // A clean doc tree must produce this exact line (and nothing else) so the
139
+ // CLI's empty-report path stays diffable and the exit code logic upstream
140
+ // sees no groups.
141
+ expect(Render(emptyReport(), cfg)).toBe("No duplicate documentation concepts found.\n");
142
+ });
143
+ });
144
+ // ---------- Render: header counts ----------
145
+ describe("Render header", () => {
146
+ it("renders both the group count and the pairs count", () => {
147
+ const out = Render(makeReportWithGroupsAndPairs(2, 3), cfg);
148
+ expect(out).toContain("Duplicate documentation concepts found: 2");
149
+ expect(out).toContain("Possible pairs needing review: 3");
150
+ });
151
+ it("omits the pairs line when there are zero pairs", () => {
152
+ // WHY: the pairs count line is conditional; emitting "needing review: 0"
153
+ // would be noise and would not match Go.
154
+ const out = Render(makeReportWithGroupsAndPairs(1, 0), cfg);
155
+ expect(out).not.toContain("Possible pairs needing review");
156
+ });
157
+ });
158
+ // ---------- Render: group stanza ----------
159
+ describe("Render group stanza", () => {
160
+ it("contains every labelled section of a high-confidence group", () => {
161
+ const out = Render(makeSimpleHighReport(), cfg);
162
+ for (const c of [
163
+ "[Group 1]",
164
+ "Confidence: high",
165
+ "Recommended action: replace_with_reference",
166
+ "Canonical candidate:",
167
+ "Duplicate sections:",
168
+ "Suggested replacement:",
169
+ ]) {
170
+ expect(out).toContain(c);
171
+ }
172
+ });
173
+ it("separates consecutive groups with the configured separator", () => {
174
+ // The separator delimits stanzas; without it two groups would visually merge
175
+ // and the oracle diff would fail.
176
+ const out = Render(makeReportWithGroupsAndPairs(2, 0), cfg);
177
+ expect(out).toContain("\n---\n");
178
+ });
179
+ it("renders the full group stanza byte-for-byte", () => {
180
+ // The contains() checks above mirror the Go test's looseness, but the oracle
181
+ // is byte-exact — this pins the WHOLE stanza so any indentation / line-order
182
+ // / similarity-format regression fails here, not silently in the golden run.
183
+ const out = Render(makeSimpleHighReport(), cfg);
184
+ const want = "Duplicate documentation concepts found: 1\n" +
185
+ "\n" +
186
+ "[Group 1] Order Lifecycle\n" +
187
+ "Confidence: high\n" +
188
+ "Recommended action: replace_with_reference\n" +
189
+ "\n" +
190
+ "Canonical candidate:\n" +
191
+ " docs/concepts/a.md#lifecycle\n" +
192
+ " (lines 1-10)\n" +
193
+ " Preview: This is the preview content of the section.\n" +
194
+ "\n" +
195
+ "Duplicate sections:\n" +
196
+ " 1. docs/design/b.md#lifecycle\n" +
197
+ " Heading: Order Lifecycle\n" +
198
+ " Similarity: 0.95\n" +
199
+ " (prose is byte-identical to canonical after normalization)\n" +
200
+ " Reason:\n" +
201
+ " - high semantic similarity (cosine=0.950)\n" +
202
+ " Preview: This is the preview content of the section.\n" +
203
+ "\n" +
204
+ "Suggested replacement:\n" +
205
+ " Keep the canonical section in docs/concepts/a.md.\n" +
206
+ " Replace duplicated sections with short summaries and links to:\n" +
207
+ " docs/concepts/a.md#lifecycle\n" +
208
+ "\n";
209
+ expect(out).toBe(want);
210
+ });
211
+ it("emits the exact-match note when ExactMatch is set", () => {
212
+ const out = Render(makeSimpleHighReport(), cfg);
213
+ expect(out).toContain("prose is byte-identical to canonical after normalization");
214
+ });
215
+ it("never prints 'inbound links: 0' for a zero-inbound canonical", () => {
216
+ // The inbound-links line is conditional on a positive count; printing 0
217
+ // would diverge from Go.
218
+ const out = Render(makeSimpleHighReport(), cfg);
219
+ expect(out).not.toContain("inbound links: 0");
220
+ });
221
+ });
222
+ // ---------- Render: pair stanza ----------
223
+ describe("Render pair stanza", () => {
224
+ it("indents the heading repr line exactly 14 spaces", () => {
225
+ // The locked format is " ({heading!r})" with 14 leading spaces;
226
+ // this column count is part of the byte-exact contract.
227
+ const out = Render(makeSimpleMaybeReport(), cfg);
228
+ const lines = out.split("\n");
229
+ let found14 = false;
230
+ for (const line of lines) {
231
+ if (line.startsWith(" (") && line.endsWith(")")) {
232
+ found14 = true;
233
+ let spaces = 0;
234
+ for (const ch of line) {
235
+ if (ch === " ") {
236
+ spaces++;
237
+ }
238
+ else {
239
+ break;
240
+ }
241
+ }
242
+ expect(spaces).toBe(14);
243
+ }
244
+ }
245
+ expect(found14).toBe(true);
246
+ });
247
+ it("renders the heading repr via QuoteHeading byte-equal (CJK + quote cases)", () => {
248
+ // WHY: a typo in the ` (%s)` format would slip through an
249
+ // indentation-only check. Using a CJK heading and a quote-containing heading
250
+ // makes QuoteHeading's output non-trivially distinct from the raw heading.
251
+ const canonical = makeSimpleMember("id1", "docs/a.md", "lifecycle", "補貨單流程", false);
252
+ canonical.Similarity = 1.0;
253
+ const candidate = makeSimpleMember("id2", "docs/b.md", "flow", "It's the flow", false);
254
+ candidate.Similarity = 0.89;
255
+ const r = {
256
+ HighGroups: [],
257
+ MaybePairs: [
258
+ {
259
+ Canonical: canonical,
260
+ Candidate: candidate,
261
+ Similarity: 0.89,
262
+ Reasons: ["possible semantic similarity (cosine=0.890)"],
263
+ },
264
+ ],
265
+ PartialOverlaps: [],
266
+ };
267
+ const out = Render(r, cfg);
268
+ expect(out).toContain(" (" + QuoteHeading("補貨單流程") + ")");
269
+ expect(out).toContain(" (" + QuoteHeading("It's the flow") + ")");
270
+ });
271
+ it("renders the full pair stanza byte-for-byte", () => {
272
+ // Pins the whole MAYBE-pair output, including the 14-space heading indent,
273
+ // the 4-space preview indent, the "canonical?" / "candidate" labels, the
274
+ // %.2f similarity (0.89), and the reason bullet.
275
+ const out = Render(makeSimpleMaybeReport(), cfg);
276
+ const want = "Duplicate documentation concepts found: 0\n" +
277
+ "Possible pairs needing review: 1\n" +
278
+ "\n" +
279
+ "Possible pairs (manual review)\n" +
280
+ "\n" +
281
+ "[Pair 1] similarity 0.89\n" +
282
+ " canonical? docs/concepts/a.md#lifecycle\n" +
283
+ " ('Order Lifecycle')\n" +
284
+ " Preview: This is the preview content of the section.\n" +
285
+ " candidate docs/design/b.md#lifecycle-2\n" +
286
+ " ('Order Lifecycle Guide')\n" +
287
+ " Preview: This is the preview content of the section.\n" +
288
+ " - possible semantic similarity (cosine=0.890)\n" +
289
+ "\n";
290
+ expect(out).toBe(want);
291
+ });
292
+ });
293
+ // ---------- Render: PartialOverlaps (L5 clusters) ----------
294
+ describe("Render partial overlaps", () => {
295
+ it("renders an exact cluster's kind and both locations, with no cosine", () => {
296
+ const r = {
297
+ HighGroups: [],
298
+ MaybePairs: [],
299
+ PartialOverlaps: [
300
+ {
301
+ Kind: "prose",
302
+ ContentHash: "abc123",
303
+ Similarity: 1.0,
304
+ Exact: true,
305
+ Informational: false,
306
+ Locations: [
307
+ { FilePath: "docs/a.md", Heading: "Background", StartLine: 5, EndLine: 12 },
308
+ { FilePath: "docs/b.md", Heading: "Intro", StartLine: 20, EndLine: 27 },
309
+ ],
310
+ },
311
+ ],
312
+ };
313
+ const out = Render(r, cfg);
314
+ expect(out).toContain("Partial overlaps");
315
+ expect(out).toContain("prose");
316
+ expect(out).toContain("docs/a.md");
317
+ expect(out).toContain("docs/b.md");
318
+ expect(out).toContain("5");
319
+ expect(out).toContain("12");
320
+ expect(out).toContain("20");
321
+ expect(out).toContain("27");
322
+ // Exact cluster must NOT render a cosine value.
323
+ expect(out).not.toContain("cosine=");
324
+ // Non-informational cluster must NOT show the intentional tag.
325
+ expect(out).not.toContain("likely intentional");
326
+ });
327
+ it("renders the full exact cluster stanza byte-for-byte", () => {
328
+ // Pins the "identical (verbatim)" header and the per-location lines so a
329
+ // header-format or line-range regression fails here.
330
+ const r = {
331
+ HighGroups: [],
332
+ MaybePairs: [],
333
+ PartialOverlaps: [
334
+ {
335
+ Kind: "prose",
336
+ ContentHash: "abc123",
337
+ Similarity: 1.0,
338
+ Exact: true,
339
+ Informational: false,
340
+ Locations: [
341
+ { FilePath: "docs/a.md", Heading: "Background", StartLine: 5, EndLine: 12 },
342
+ { FilePath: "docs/b.md", Heading: "Intro", StartLine: 20, EndLine: 27 },
343
+ ],
344
+ },
345
+ ],
346
+ };
347
+ const out = Render(r, cfg);
348
+ const want = "Duplicate documentation concepts found: 0\n" +
349
+ "\n" +
350
+ "Partial overlaps (manual review)\n" +
351
+ "\n" +
352
+ "[Cluster 1] prose identical (verbatim)\n" +
353
+ " docs/a.md (lines 5-12)\n" +
354
+ " Heading: Background\n" +
355
+ " docs/b.md (lines 20-27)\n" +
356
+ " Heading: Intro\n" +
357
+ "\n";
358
+ expect(out).toBe(want);
359
+ });
360
+ it("renders a cosine cluster's similarity value as cosine=0.96", () => {
361
+ // WHY: 0.96 is a %.2f format check on a non-exact cluster header.
362
+ const r = {
363
+ HighGroups: [],
364
+ MaybePairs: [],
365
+ PartialOverlaps: [
366
+ {
367
+ Kind: "prose",
368
+ ContentHash: "",
369
+ Similarity: 0.96,
370
+ Exact: false,
371
+ Informational: false,
372
+ Locations: [
373
+ { FilePath: "docs/x.md", Heading: "Sec X", StartLine: 1, EndLine: 5 },
374
+ { FilePath: "docs/y.md", Heading: "Sec Y", StartLine: 10, EndLine: 14 },
375
+ ],
376
+ },
377
+ ],
378
+ };
379
+ const out = Render(r, cfg);
380
+ expect(out).toContain("cosine=0.96");
381
+ });
382
+ it("renders the informational tag with the distinct file count", () => {
383
+ // The boilerplate tag counts DISTINCT files (a Set), here 6, and includes
384
+ // the shared-snippet hint. The em dash is part of the byte-exact string.
385
+ const cl = {
386
+ Kind: "prose",
387
+ ContentHash: "def456",
388
+ Similarity: 1.0,
389
+ Exact: true,
390
+ Informational: true,
391
+ Locations: [
392
+ { FilePath: "docs/a.md", Heading: "Note", StartLine: 3, EndLine: 8 },
393
+ { FilePath: "docs/b.md", Heading: "Note", StartLine: 3, EndLine: 8 },
394
+ { FilePath: "docs/c.md", Heading: "Note", StartLine: 3, EndLine: 8 },
395
+ { FilePath: "docs/d.md", Heading: "Note", StartLine: 3, EndLine: 8 },
396
+ { FilePath: "docs/e.md", Heading: "Note", StartLine: 3, EndLine: 8 },
397
+ { FilePath: "docs/f.md", Heading: "Note", StartLine: 3, EndLine: 8 },
398
+ ],
399
+ };
400
+ const out = Render({ HighGroups: [], MaybePairs: [], PartialOverlaps: [cl] }, cfg);
401
+ expect(out).toContain("likely intentional");
402
+ expect(out).toContain("6 files");
403
+ expect(out).toContain("consider a shared snippet");
404
+ // Pin the exact informational line including the em dash.
405
+ expect(out).toContain(" appears in 6 files — likely intentional; consider a shared snippet\n");
406
+ });
407
+ it("treats an empty PartialOverlaps slice as byte-identical to none", () => {
408
+ // WHY: the section header is gated on length > 0; an explicit empty slice
409
+ // must not emit a spurious "Partial overlaps" header. The Go test compared a
410
+ // base report (no field set) against one with an explicit []; both empty
411
+ // here, so the output must be identical.
412
+ const base = makeReportWithGroupsAndPairs(1, 1);
413
+ const withEmpty = {
414
+ HighGroups: base.HighGroups,
415
+ MaybePairs: base.MaybePairs,
416
+ PartialOverlaps: [],
417
+ };
418
+ expect(Render(withEmpty, cfg)).toBe(Render(base, cfg));
419
+ });
420
+ });
@@ -0,0 +1,8 @@
1
+ // Top-level result/report types for the dedup facade.
2
+ //
3
+ // Ported from internal/dedup/report_types.go. The Go file re-declared these as
4
+ // type aliases (`type Report = deduptypes.Report`) to break the facade ↔
5
+ // analyzer import cycle; the leaf deduptypes package owns the real definitions.
6
+ // In TS the same shapes are re-exported from ../deduptypes so callers (cmd, the
7
+ // dedup/report subpackage) can import them from the package root.
8
+ export {};
@@ -0,0 +1 @@
1
+ export { derive } from "./sectionid.js";