docsgov 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (159) hide show
  1. package/README.md +242 -0
  2. package/dist/apispec/apispec.js +401 -0
  3. package/dist/apispec/apispec.test.js +444 -0
  4. package/dist/apispec/errors.js +17 -0
  5. package/dist/apispec/index.js +2 -0
  6. package/dist/check/doclinks.js +167 -0
  7. package/dist/check/index.js +8 -0
  8. package/dist/check/run.js +391 -0
  9. package/dist/check/run.test.js +513 -0
  10. package/dist/check/suggest.js +134 -0
  11. package/dist/check/suggest.test.js +92 -0
  12. package/dist/check/tokens.js +125 -0
  13. package/dist/cmd/main.js +330 -0
  14. package/dist/cmd/main.test.js +422 -0
  15. package/dist/codeq/cache.js +71 -0
  16. package/dist/codeq/cache.test.js +67 -0
  17. package/dist/codeq/errors.js +52 -0
  18. package/dist/codeq/grammars/tree-sitter-go.wasm +0 -0
  19. package/dist/codeq/grammars/tree-sitter-java.wasm +0 -0
  20. package/dist/codeq/grammars/tree-sitter-javascript.wasm +0 -0
  21. package/dist/codeq/grammars/tree-sitter-tsx.wasm +0 -0
  22. package/dist/codeq/grammars/tree-sitter-typescript.wasm +0 -0
  23. package/dist/codeq/index.js +11 -0
  24. package/dist/codeq/resolve.test.js +109 -0
  25. package/dist/codeq/resolver.js +128 -0
  26. package/dist/codeq/resolver.test.js +124 -0
  27. package/dist/codeq/resolvers/go.js +242 -0
  28. package/dist/codeq/resolvers/go.test.js +143 -0
  29. package/dist/codeq/resolvers/java.js +349 -0
  30. package/dist/codeq/resolvers/java.test.js +138 -0
  31. package/dist/codeq/resolvers/java_queries.js +63 -0
  32. package/dist/codeq/resolvers/javascript.js +412 -0
  33. package/dist/codeq/resolvers/javascript.test.js +125 -0
  34. package/dist/codeq/resolvers/javascript_queries.js +46 -0
  35. package/dist/codeq/resolvers/typescript.js +366 -0
  36. package/dist/codeq/resolvers/typescript.test.js +180 -0
  37. package/dist/codeq/resolvers/typescript_queries.js +78 -0
  38. package/dist/codeq/signature.js +50 -0
  39. package/dist/codeq/signature.test.js +50 -0
  40. package/dist/codeq/suggest.js +96 -0
  41. package/dist/codeq/treesitter.js +122 -0
  42. package/dist/codeq/treesitter.test.js +118 -0
  43. package/dist/config/config.js +74 -0
  44. package/dist/config/config.test.js +98 -0
  45. package/dist/config/fs.js +116 -0
  46. package/dist/config/glob.js +82 -0
  47. package/dist/config/glob.test.js +61 -0
  48. package/dist/config/index.js +4 -0
  49. package/dist/dedup/analyzer/analyzer.js +533 -0
  50. package/dist/dedup/analyzer/analyzer.test.js +530 -0
  51. package/dist/dedup/analyzer/canonical.js +74 -0
  52. package/dist/dedup/analyzer/canonical.test.js +70 -0
  53. package/dist/dedup/analyzer/cosine_clusters.js +169 -0
  54. package/dist/dedup/analyzer/cosine_clusters.test.js +131 -0
  55. package/dist/dedup/analyzer/distinctive.js +85 -0
  56. package/dist/dedup/analyzer/distinctive.test.js +49 -0
  57. package/dist/dedup/analyzer/exact_clusters.js +63 -0
  58. package/dist/dedup/analyzer/exact_clusters.test.js +81 -0
  59. package/dist/dedup/analyzer/index.js +14 -0
  60. package/dist/dedup/analyzer/multiplicity.js +110 -0
  61. package/dist/dedup/analyzer/multiplicity.test.js +123 -0
  62. package/dist/dedup/analyzer/order.js +22 -0
  63. package/dist/dedup/analyzer/partial_overlaps.js +65 -0
  64. package/dist/dedup/analyzer/partial_overlaps.test.js +161 -0
  65. package/dist/dedup/analyzer/preview.js +84 -0
  66. package/dist/dedup/analyzer/preview.test.js +46 -0
  67. package/dist/dedup/analyzer/safety.js +27 -0
  68. package/dist/dedup/analyzer/safety.test.js +39 -0
  69. package/dist/dedup/config.js +18 -0
  70. package/dist/dedup/configload.js +299 -0
  71. package/dist/dedup/configload.test.js +410 -0
  72. package/dist/dedup/dedup.index.test.js +203 -0
  73. package/dist/dedup/dedup.js +143 -0
  74. package/dist/dedup/dedup.test.js +212 -0
  75. package/dist/dedup/dedupcfg/config.js +112 -0
  76. package/dist/dedup/dedupcfg/config.test.js +70 -0
  77. package/dist/dedup/dedupcfg/index.js +1 -0
  78. package/dist/dedup/deduptypes/index.js +1 -0
  79. package/dist/dedup/deduptypes/types.js +9 -0
  80. package/dist/dedup/deduptypes/types.test.js +34 -0
  81. package/dist/dedup/embedder/cache.js +23 -0
  82. package/dist/dedup/embedder/cache.test.js +50 -0
  83. package/dist/dedup/embedder/constants.js +10 -0
  84. package/dist/dedup/embedder/embedder.js +76 -0
  85. package/dist/dedup/embedder/embedder.mock.test.js +128 -0
  86. package/dist/dedup/embedder/embedder.test.js +96 -0
  87. package/dist/dedup/embedder/errors.js +20 -0
  88. package/dist/dedup/embedder/errors.test.js +35 -0
  89. package/dist/dedup/embedder/index.js +4 -0
  90. package/dist/dedup/embedder/session.js +78 -0
  91. package/dist/dedup/embedder/session.test.js +172 -0
  92. package/dist/dedup/gitignore.js +97 -0
  93. package/dist/dedup/gitignore.test.js +98 -0
  94. package/dist/dedup/index.js +11 -0
  95. package/dist/dedup/indexdb/errors.js +48 -0
  96. package/dist/dedup/indexdb/index.js +6 -0
  97. package/dist/dedup/indexdb/indexdb.js +302 -0
  98. package/dist/dedup/indexdb/indexdb.test.js +739 -0
  99. package/dist/dedup/indexdb/load.js +110 -0
  100. package/dist/dedup/indexdb/migrations.js +58 -0
  101. package/dist/dedup/indexdb/schema.js +83 -0
  102. package/dist/dedup/indexer/index.js +9 -0
  103. package/dist/dedup/indexer/indexer.js +501 -0
  104. package/dist/dedup/indexer/indexer.test.js +510 -0
  105. package/dist/dedup/indexer/links.js +89 -0
  106. package/dist/dedup/mdsection/anchor.js +60 -0
  107. package/dist/dedup/mdsection/anchor.test.js +39 -0
  108. package/dist/dedup/mdsection/blocks.js +409 -0
  109. package/dist/dedup/mdsection/blocks.test.js +359 -0
  110. package/dist/dedup/mdsection/index.js +4 -0
  111. package/dist/dedup/mdsection/parse.js +21 -0
  112. package/dist/dedup/mdsection/section.js +234 -0
  113. package/dist/dedup/mdsection/section.test.js +221 -0
  114. package/dist/dedup/report/floatfmt.js +71 -0
  115. package/dist/dedup/report/floatfmt.test.js +42 -0
  116. package/dist/dedup/report/index.js +8 -0
  117. package/dist/dedup/report/quote.js +77 -0
  118. package/dist/dedup/report/quote.test.js +67 -0
  119. package/dist/dedup/report/text.js +251 -0
  120. package/dist/dedup/report/text.test.js +420 -0
  121. package/dist/dedup/report_types.js +8 -0
  122. package/dist/dedup/sectionid/index.js +1 -0
  123. package/dist/dedup/sectionid/sectionid.js +16 -0
  124. package/dist/dedup/sectionid/sectionid.test.js +49 -0
  125. package/dist/guard/api/errors.js +12 -0
  126. package/dist/guard/api/index.js +2 -0
  127. package/dist/guard/api/parser.js +81 -0
  128. package/dist/guard/api/parser.test.js +58 -0
  129. package/dist/guard/api/types.js +1 -0
  130. package/dist/guard/code/errors.js +16 -0
  131. package/dist/guard/code/index.js +2 -0
  132. package/dist/guard/code/parser.js +54 -0
  133. package/dist/guard/code/parser.test.js +111 -0
  134. package/dist/guard/code/types.js +6 -0
  135. package/dist/index.js +1 -0
  136. package/dist/index.test.js +5 -0
  137. package/dist/repo/boundary.js +92 -0
  138. package/dist/repo/boundary.test.js +65 -0
  139. package/dist/repo/errors.js +56 -0
  140. package/dist/repo/errors.test.js +85 -0
  141. package/dist/repo/exists.test.js +72 -0
  142. package/dist/repo/filename.js +46 -0
  143. package/dist/repo/filename.test.js +39 -0
  144. package/dist/repo/fs.js +53 -0
  145. package/dist/repo/index.js +7 -0
  146. package/dist/repo/overlay.js +36 -0
  147. package/dist/repo/overlay.test.js +80 -0
  148. package/dist/repo/repo.js +353 -0
  149. package/dist/repo/repo.test.js +255 -0
  150. package/dist/repo/testutil.js +27 -0
  151. package/dist/repo/write.test.js +125 -0
  152. package/dist/report/color.js +73 -0
  153. package/dist/report/index.js +1 -0
  154. package/dist/report/report.js +112 -0
  155. package/dist/report/report.test.js +368 -0
  156. package/dist/violation/index.js +1 -0
  157. package/dist/violation/types.js +22 -0
  158. package/dist/violation/types.test.js +70 -0
  159. package/package.json +48 -0
@@ -0,0 +1,221 @@
1
+ import { readFileSync } from "node:fs";
2
+ import { fileURLToPath } from "node:url";
3
+ import { createHash } from "node:crypto";
4
+ import { describe, expect, it } from "vitest";
5
+ import { extract, extractFromFile, extractFromFileWithBlocks, } from "./index.js";
6
// Directory of this test file; used as a prefix for everything under testdata/.
const here = fileURLToPath(new URL(".", import.meta.url));
// Path of the shared markdown fixture most suites in this file run against.
const fixtureFile = `${here}testdata/fixture.md`;

/** Reads the shared fixture from disk as UTF-8 text. */
function loadFixture() {
    const markdown = readFileSync(fixtureFile, "utf8");
    return markdown;
}

/** Runs extract over the shared fixture, passing its on-disk path as file path. */
function extractFixture() {
    const markdown = loadFixture();
    return extract(fixtureFile, markdown);
}
14
describe("extract — section splitting", () => {
    // Clustering attributes each duplicated line to exactly one section, which
    // only holds if no two sections of the same file claim the same line.
    it("produces non-overlapping [start_line, end_line) ranges", () => {
        const sections = extractFixture();
        sections.forEach((a, i) => {
            sections.forEach((b, j) => {
                // Visit each unordered pair once, and only pairs from one file.
                if (j <= i || a.file_path !== b.file_path) {
                    return;
                }
                const overlap = a.start_line < b.end_line && b.start_line < a.end_line;
                expect(overlap, `sections ${i} and ${j} overlap`).toBe(false);
            });
        });
    });
    // Contiguity half of exclusive ownership: in the all-eligible fixture the
    // section spans must add up to every line from the first heading to EOF.
    it("covers every line from the first heading to EOF (all-eligible fixture)", () => {
        const allEligibleFile = `${here}testdata/all_eligible.md`;
        const data = readFileSync(allEligibleFile, "utf8");
        const sections = extract(allEligibleFile, data);
        // Trailing newlines are stripped so they do not count as extra lines.
        const lines = data.replace(/\n+$/, "").split("\n");
        // 1-indexed line of the first "#" line; 0 when no such line exists.
        const firstHeadingLine = lines.findIndex((line) => line.startsWith("#")) + 1;
        const expected = lines.length - firstHeadingLine + 1;
        const got = sections.reduce((sum, s) => sum + (s.end_line - s.start_line), 0);
        expect(got).toBe(expected);
    });
    // A code-only section has no prose to embed, so the eligibility gate must
    // keep it out of the extracted sections.
    it("excludes sections below the prose-word threshold (Code Only)", () => {
        const codeOnly = extractFixture().find((s) => s.heading === "Code Only");
        expect(codeOnly).toBeUndefined();
    });
    // The fixture is all H2; any other level means the heading was mis-parsed.
    it("records the heading level", () => {
        extractFixture().forEach((s) => {
            expect(s.heading_level, s.heading).toBe(2);
        });
    });
    // Structural invariants: lines are 1-indexed and a section spans at least
    // its own heading line.
    it("uses 1-indexed lines with end_line > start_line", () => {
        extractFixture().forEach((s) => {
            expect(s.start_line, s.heading).toBeGreaterThan(0);
            expect(s.end_line, s.heading).toBeGreaterThan(s.start_line);
        });
    });
    // raw_content has to open with the heading line so the stored record can be
    // matched back to, and re-displayed from, the on-disk text.
    it("starts raw_content with the heading line", () => {
        extractFixture().forEach((s) => {
            const opensWithHeading = s.raw_content.startsWith(`## ${s.heading}`);
            expect(opensWithHeading, s.heading).toBe(true);
        });
    });
});
80
describe("extract — anchors and IDs", () => {
    // Same-named headings get GitHub-style collision suffixes, assigned in
    // document order, so each one stays addressable as its own section.
    it("suffixes duplicate-heading anchors in order", () => {
        const installations = extractFixture().filter((s) => s.heading === "Installation");
        const anchors = installations.map((s) => s.anchor);
        expect(anchors).toEqual(["installation", "installation-1"]);
    });
    // The anchor feeds the section ID, so the two suffixed anchors must not
    // collapse onto one ID.
    it("derives distinct IDs from collision-suffixed anchors", () => {
        const ids = [];
        for (const s of extractFixture()) {
            if (s.heading === "Installation") {
                ids.push(s.id);
            }
        }
        expect(ids).toHaveLength(2);
        const [first, second] = ids;
        expect(first).not.toBe(second);
    });
    // Guards against an ASCII-only slug rule: a CJK heading must keep a
    // non-empty anchor, still get a 16-char ID, and pass the prose gate.
    it("handles a CJK heading (non-empty anchor, 16-char ID, eligible)", () => {
        const cjk = extractFixture().find((x) => x.heading === "補貨單流程");
        expect(cjk, "CJK section").toBeDefined();
        expect(cjk.anchor).not.toBe("");
        expect(cjk.id).toHaveLength(16);
        expect(cjk.prose_word_count).toBeGreaterThanOrEqual(10);
    });
});
110
describe("extract — derived fields", () => {
    // The table linearization format ("h=v, h=v; …") is the locked embed
    // representation; clustering compares these strings, so the shape matters.
    it("linearizes a table into embed_text and sets has_table", () => {
        const s = extractFixture().find((x) => x.heading === "Table Section");
        expect(s, "Table Section").toBeDefined();
        expect(s.has_table).toBe(true);
        expect(s.embed_text).toContain("header a=val1a");
    });
    // has_code must stay false for prose-only sections; a stray true would mean
    // an inline span was misclassified as a code block.
    it("leaves has_code false for a prose-only section", () => {
        const s = extractFixture().find((x) => x.heading === "Installation");
        // Assert presence first so a missing section fails with a labelled
        // assertion instead of a TypeError on the property access below.
        expect(s, "Installation").toBeDefined();
        expect(s.has_code).toBe(false);
    });
    // prose_word_count gates eligibility, so a section known to exceed the
    // threshold must report >= 10.
    it("counts prose words above the gate for the first Installation", () => {
        const s = extractFixture().find((x) => x.heading === "Installation" && x.anchor === "installation");
        // Same guard as above: fail clearly when the section is absent.
        expect(s, "first Installation").toBeDefined();
        expect(s.prose_word_count).toBeGreaterThanOrEqual(10);
    });
    // List items are flattened to prose and counted; otherwise a list-only
    // section would be wrongly excluded from the dedup pool.
    it("flattens list items into counted prose", () => {
        const s = extractFixture().find((x) => x.heading === "List Section");
        expect(s, "List Section").toBeDefined();
        expect(s.prose_word_count).toBeGreaterThanOrEqual(10);
    });
    // content_hash is sha256(embed_text); pinning the formula catches drift in
    // how the hash is computed, which would re-key every stored section.
    it("sets content_hash = sha256(embed_text)", () => {
        for (const s of extractFixture()) {
            expect(s.content_hash).not.toBe("");
            const want = createHash("sha256").update(s.embed_text).digest("hex");
            expect(s.content_hash, s.heading).toBe(want);
        }
    });
});
148
describe("extract — file path handling", () => {
    // file_path must come back exactly as the caller passed it (it also feeds
    // the ID hash), so no normalization may be applied.
    it("stores file_path verbatim", () => {
        extractFixture().forEach((s) => {
            expect(s.file_path).toBe(fixtureFile);
        });
    });
    // extractFromFile is the same parse sourced from disk, so it has to agree
    // with extract run on the file's content.
    it("extractFromFile equals extract on the same content", () => {
        const fromDisk = extractFromFile(fixtureFile);
        const fromMemory = extract(fixtureFile, loadFixture());
        const idsOf = (list) => list.map((s) => s.id);
        expect(fromDisk.length).toBe(fromMemory.length);
        expect(idsOf(fromDisk)).toEqual(idsOf(fromMemory));
    });
});
165
describe("extract — golden output", () => {
    // The golden file pins the full extraction (IDs, hashes, line ranges, embed
    // text) against the Go implementation's output. These values persist in the
    // index, so any divergence in boundaries, slugs or normalization must fail.
    it("matches the Go golden JSON exactly", () => {
        const sections = extractFixture();
        // The golden was generated with file_path = "testdata/fixture.md", and
        // its IDs hash that relative path, so extraction is re-run with the
        // relative path for the exact comparison.
        const relSections = extract("testdata/fixture.md", loadFixture());
        const goldenJson = readFileSync(`${here}testdata/sections.golden.json`, "utf8");
        const want = JSON.parse(goldenJson);
        expect(relSections).toEqual(want);
        // Sanity: the absolute-path run yields the same headings in the same
        // order; only path-derived fields are expected to differ.
        const headingsOf = (list) => list.map((s) => s.heading);
        expect(headingsOf(sections)).toEqual(headingsOf(want));
    });
});
183
describe("extractFromFileWithBlocks", () => {
    // The sections half of the blocks API must match extract exactly, or two
    // callers would disagree about what is in the index.
    it("returns sections identical to extract", () => {
        const want = extract(fixtureFile, loadFixture());
        const got = extractFromFileWithBlocks(fixtureFile).sections;
        expect(got.length).toBe(want.length);
        for (const [i, section] of want.entries()) {
            expect(got[i]).toEqual(section);
        }
    });
    // Block records are produced before the eligibility gate: a table-only
    // section (ineligible as a section) still yields a "table" record, while a
    // code-only section yields nothing.
    it("emits blocks for ineligible sections and skips code/html", () => {
        const fixture = `${here}testdata/allsections.md`;
        const { blocks } = extractFromFileWithBlocks(fixture);
        // Group records by heading + kind, NUL-separated.
        const byKey = new Map();
        for (const br of blocks) {
            const k = `${br.Heading}\u0000${br.Kind}`;
            if (!byKey.has(k)) {
                byKey.set(k, []);
            }
            byKey.get(k).push(br);
        }
        const recordsFor = (key) => byKey.get(key) ?? [];
        const tableBlocks = recordsFor("Table Only Section\u0000table");
        expect(tableBlocks.length, "table-only section table record").toBeGreaterThan(0);
        const [firstTable] = tableBlocks;
        expect(firstTable.SectionID).not.toBe("");
        expect(firstTable.FilePath).toBe(fixture);
        expect(firstTable.Heading).toBe("Table Only Section");
        const proseBlocks = recordsFor("Prose Section\u0000prose");
        expect(proseBlocks.length, "prose section prose record").toBeGreaterThan(0);
        proseBlocks.forEach((br) => {
            expect(br.SectionID).not.toBe("");
        });
        // Code-only section yields no records at all.
        const hasCodeOnly = blocks.some((br) => br.Heading === "Code Only Section");
        expect(hasCodeOnly).toBe(false);
        // List inside an eligible section folds to "prose".
        expect(recordsFor("Eligible With List\u0000prose").length).toBeGreaterThan(0);
    });
});
@@ -0,0 +1,71 @@
1
+ // formatFloat2 reproduces Go's fmt "%.2f" / strconv.FormatFloat(v, 'f', 2, 64)
2
+ // byte-for-byte: the correctly-rounded 2-decimal form of the EXACT float64
3
+ // value, using round-half-to-even on ties.
4
+ //
5
+ // WHY a custom formatter (not Number.prototype.toFixed): JS toFixed breaks
6
+ // exact ties by picking the larger value instead of rounding half-to-even.
7
+ // e.g. Go's %.2f renders 0.125 as "0.12" (the exact double 0.125 lands on a
8
+ // half and ties to even), while JS toFixed yields "0.13". The dedup similarity
9
+ // scores are printed with %.2f and checked byte-for-byte by the golden oracle,
10
+ // so the rounding must match Go exactly.
11
+ //
12
+ // Algorithm: recover the exact rational value (num/den) of the double from its
13
+ // IEEE-754 bits via BigInt, compute round(value * 100) with round-half-to-even,
14
+ // then place the decimal point. Verified against Go's %.2f across 5000+ random
15
+ // values in [0,1) plus tie edge cases (all match).
16
/**
 * formatFloat2 returns x formatted like Go's "%.2f".
 *
 * The exact binary value of the double is rounded to hundredths with
 * round-half-to-even, so the result never depends on a decimal approximation
 * of x.
 *
 * @param {number} x - value to format.
 * @returns {string} two-decimal rendering, or "NaN" / "+Inf" / "-Inf" for
 *   non-finite input.
 * @throws {TypeError} if x is not a number primitive.
 */
export function formatFloat2(x) {
    if (typeof x !== "number") {
        // Without this guard undefined/null/strings fell into the non-finite
        // branch below and were silently rendered as "-Inf" or "+Inf".
        throw new TypeError(`formatFloat2: expected a number, got ${typeof x}`);
    }
    if (Number.isNaN(x)) {
        return "NaN";
    }
    if (!Number.isFinite(x)) {
        // Mirror Go's tokens rather than emitting "Infinity".
        return x > 0 ? "+Inf" : "-Inf";
    }
    // Negative zero keeps its sign, hence the Object.is check.
    const sign = x < 0 || Object.is(x, -0) ? "-" : "";
    const { num, den } = exactRational(Math.abs(x));
    // hundredths = round(num/den * 100), half-to-even, in exact integer math.
    const scaled = num * 100n;
    let hundredths = scaled / den;
    const twiceRemainder = (scaled % den) * 2n;
    const aboveHalf = twiceRemainder > den;
    const tieOnOdd = twiceRemainder === den && hundredths % 2n === 1n;
    if (aboveHalf || tieOnOdd) {
        hundredths += 1n;
    }
    // At least three digits so there is always one integer digit before the point.
    const digits = hundredths.toString().padStart(3, "0");
    return `${sign}${digits.slice(0, -2)}.${digits.slice(-2)}`;
}

/**
 * exactRational returns the exact value of a non-negative finite double as
 * num/den (both BigInt, den a power of two), decoded from its IEEE-754 bits.
 *
 * @param {number} x - non-negative finite double.
 * @returns {{num: bigint, den: bigint}} exact fraction equal to x.
 */
function exactRational(x) {
    const view = new DataView(new ArrayBuffer(8));
    view.setFloat64(0, x);
    const bits = view.getBigUint64(0);
    const biasedExp = Number(bits >> 52n) & 0x7ff;
    let mant = bits & ((1n << 52n) - 1n);
    let e;
    if (biasedExp === 0) {
        // Zero or subnormal: no implicit leading bit, fixed exponent.
        e = -1074;
    }
    else {
        mant |= 1n << 52n; // implicit leading bit
        e = biasedExp - 1075;
    }
    // value = mant * 2^e
    if (e >= 0) {
        return { num: mant << BigInt(e), den: 1n };
    }
    return { num: mant, den: 1n << BigInt(-e) };
}
@@ -0,0 +1,42 @@
1
+ // Pins formatFloat2 to Go's fmt "%.2f" semantics.
2
+ //
3
+ // WHY this matters and is NOT covered by the renderer tests: the dedup
4
+ // similarity scores are printed with %.2f and the golden oracle checks them
5
+ // byte-for-byte. JS Number.prototype.toFixed rounds differently from Go on
6
+ // half-way binary values (e.g. 0.125 -> "0.13" in JS but "0.12" in Go, because
7
+ // the exact double 0.125 is a tie and Go rounds half-to-even). The renderer
8
+ // fixtures only use values that happen to round the same either way (0.89,
9
+ // 0.95, 0.96), so a naive toFixed would pass them yet still corrupt a real
10
+ // cosine score that lands on a tie. These cases lock the Go behavior directly.
11
+ import { describe, it, expect } from "vitest";
12
+ import { formatFloat2 } from "./floatfmt.js";
13
describe("formatFloat2 matches Go %.2f", () => {
    it("formats the dedup similarity values used in fixtures", () => {
        expect(formatFloat2(0.89)).toBe("0.89");
        expect(formatFloat2(0.95)).toBe("0.95");
        expect(formatFloat2(0.96)).toBe("0.96");
        expect(formatFloat2(1.0)).toBe("1.00");
        expect(formatFloat2(0)).toBe("0.00");
    });
    it("rounds half-to-even on exact-tie doubles (diverges from JS toFixed)", () => {
        // 0.125 is exactly representable and is a tie at the 2nd decimal; Go rounds
        // to the even digit (0.12). JS (0.125).toFixed(2) === "0.13" — the bug this
        // formatter exists to avoid.
        expect(formatFloat2(0.125)).toBe("0.12");
        // 0.375 is exact and ties up to even (0.38).
        expect(formatFloat2(0.375)).toBe("0.38");
        // 2.675 is NOT exactly 2.675 in binary (slightly less), so it rounds DOWN to
        // 2.67 — matching Go and demonstrating exact-value (not decimal) rounding.
        expect(formatFloat2(2.675)).toBe("2.67");
    });
    it("rounds non-tie values to nearest", () => {
        expect(formatFloat2(0.135)).toBe("0.14");
        expect(formatFloat2(0.025)).toBe("0.03");
        expect(formatFloat2(0.005000001)).toBe("0.01");
        expect(formatFloat2(0.004999999)).toBe("0.00");
    });
    it("pads to two fraction digits and a leading integer digit", () => {
        expect(formatFloat2(0.5)).toBe("0.50");
        expect(formatFloat2(3)).toBe("3.00");
    });
    // The sign is handled separately from the magnitude (including the
    // Object.is check for -0), so it needs its own cases.
    it("keeps the sign for negative values, including negative zero", () => {
        expect(formatFloat2(-0.125)).toBe("-0.12");
        expect(formatFloat2(-3)).toBe("-3.00");
        expect(formatFloat2(-0)).toBe("-0.00");
    });
    // Non-finite inputs take a dedicated branch that emits "NaN" / "+Inf" /
    // "-Inf" instead of JS's "Infinity" spelling.
    it("renders NaN and infinities as NaN, +Inf and -Inf", () => {
        expect(formatFloat2(NaN)).toBe("NaN");
        expect(formatFloat2(Infinity)).toBe("+Inf");
        expect(formatFloat2(-Infinity)).toBe("-Inf");
    });
});
@@ -0,0 +1,8 @@
1
+ // Barrel for the dedup report renderer.
2
+ //
3
+ // Port of internal/dedup/report. Exports Render (the plain-text report
4
+ // renderer) and QuoteHeading (the Python-repr-style heading quoter, exported in
5
+ // Go for the renderer's own use and for tests). formatFloat2 and wordWrap stay
6
+ // internal — they are implementation details with no Go-exported counterpart.
7
+ export { Render } from "./text.js";
8
+ export { QuoteHeading } from "./quote.js";
@@ -0,0 +1,77 @@
1
+ // QuoteHeading mimics Python's repr() for the string subset that appears in
2
+ // markdown headings. Ported from internal/dedup/report/quote.go.
3
+ //
4
+ // - Wrap in single quotes.
5
+ // - Backslash-escape '\', "'", control chars < 0x20, and DEL (0x7f).
6
+ // - Leave non-ASCII (UTF-8) bytes untouched — CJK characters render
7
+ // literally, NOT as \uXXXX.
8
+ //
9
+ // This is the locked rule from plan §366. The CJK case is tested explicitly:
10
+ // input "補貨單流程" must produce "'補貨單流程'" (NOT escape sequences).
11
+ //
12
+ // Byte-for-byte note: Go iterates over the string's BYTES and passes any byte
13
+ // >= 0x80 through verbatim. In TS we iterate over UTF-16 code units; every char
14
+ // with codepoint >= 0x80 (including each surrogate half of an astral char) is
15
+ // passed through verbatim, which reassembles to the identical bytes when the
16
+ // result is UTF-8 encoded. All escaped cases are ASCII (< 0x80), so the
17
+ // code-unit walk and the byte walk produce identical output.
18
+ /** QuoteHeading returns a Python-repr-style single-quoted form of s. */
19
+ export function QuoteHeading(s) {
20
+ let b = "'";
21
+ for (const ch of charUnits(s)) {
22
+ const c = ch.codePointAt(0);
23
+ // Non-ASCII: pass through verbatim (covers UTF-8 multi-byte sequences / CJK).
24
+ if (c >= 0x80) {
25
+ b += ch;
26
+ continue;
27
+ }
28
+ switch (ch) {
29
+ case "\\":
30
+ b += "\\\\";
31
+ break;
32
+ case "'":
33
+ b += "\\'";
34
+ break;
35
+ case "\t":
36
+ b += "\\t";
37
+ break;
38
+ case "\n":
39
+ b += "\\n";
40
+ break;
41
+ case "\r":
42
+ b += "\\r";
43
+ break;
44
+ default:
45
+ if (c < 0x20) {
46
+ // Other control chars — use \xNN.
47
+ b += "\\x" + hexDigit(c >> 4) + hexDigit(c & 0xf);
48
+ }
49
+ else if (c === 0x7f) {
50
+ b += "\\x7f";
51
+ }
52
+ else {
53
+ b += ch;
54
+ }
55
+ }
56
+ }
57
+ b += "'";
58
+ return b;
59
+ }
60
+ // charUnits yields one entry per UTF-16 code unit. We deliberately do NOT use a
61
+ // codepoint iterator (for…of over the string) for ASCII handling, but since all
62
+ // ASCII chars are single code units and astral chars (>= 0x80) pass through
63
+ // verbatim either way, iterating by code unit is equivalent and avoids any
64
+ // surrogate-pair edge case. We iterate by code unit so that a lone surrogate (if
65
+ // any) is still passed through as a single >= 0x80 unit, matching Go's
66
+ // byte-level pass-through of every non-ASCII byte.
67
+ function* charUnits(s) {
68
+ for (let i = 0; i < s.length; i++) {
69
+ yield s.charAt(i);
70
+ }
71
+ }
72
+ function hexDigit(n) {
73
+ if (n < 10) {
74
+ return String.fromCharCode(0x30 + n); // '0'..'9'
75
+ }
76
+ return String.fromCharCode(0x61 + n - 10); // 'a'..'f'
77
+ }
@@ -0,0 +1,67 @@
1
+ // Pins QuoteHeading to Go's Python-repr-style heading quoting (quote.go).
2
+ //
3
+ // WHY this matters and is NOT covered elsewhere: the dedup text report prints
4
+ // each group's heading via QuoteHeading and the golden oracle checks those
5
+ // bytes against the Go binary. Two locked rules must hold exactly or the oracle
6
+ // diverges: (1) non-ASCII bytes (CJK) pass through VERBATIM, never as \uXXXX
7
+ // escapes; (2) control chars, quote, and backslash are backslash-escaped in the
8
+ // precise Go spelling (\t \n \r, \xNN for other controls, \x7f for DEL). A
9
+ // regression in either rule silently corrupts every report heading.
10
+ import { describe, it, expect } from "vitest";
11
+ import { QuoteHeading } from "./quote.js";
12
describe("QuoteHeading", () => {
    // WHY: the common case — a plain ASCII heading must be wrapped in single
    // quotes with no escaping, the baseline the renderer relies on.
    it("wraps a plain ASCII heading in single quotes", () => {
        expect(QuoteHeading("Overview")).toBe("'Overview'");
    });
    // WHY: the locked CJK case from plan §366 — non-ASCII must render literally,
    // NOT as \uXXXX. Escaping CJK here would make every Chinese heading mismatch
    // the Go oracle byte-for-byte.
    it("passes CJK characters through verbatim (no \\u escapes)", () => {
        expect(QuoteHeading("補貨單流程")).toBe("'補貨單流程'");
    });
    // WHY: an embedded single quote must be backslash-escaped (\') so the quoted
    // form stays a single Python-repr token; an unescaped quote would terminate
    // the string early and corrupt the report.
    it("escapes an embedded single quote", () => {
        expect(QuoteHeading("it's")).toBe("'it\\'s'");
    });
    // WHY: a literal backslash must be doubled (\\); a single backslash would be
    // read as an escape lead-in downstream.
    it("escapes a backslash", () => {
        expect(QuoteHeading("a\\b")).toBe("'a\\\\b'");
    });
    // WHY: the three named control chars have dedicated short escapes
    // (\t \n \r); using \xNN for these would not match the expected output.
    it("escapes tab, newline, and carriage return with named escapes", () => {
        expect(QuoteHeading("a\tb\nc\rd")).toBe("'a\\tb\\nc\\rd'");
    });
    // WHY: other control chars (< 0x20) use the \xNN form with two lowercase hex
    // digits; this case only produces digits in the 0-9 range (0x01 -> "01").
    it("escapes a low control char as \\xNN with lowercase hex", () => {
        expect(QuoteHeading("\x01")).toBe("'\\x01'");
    });
    // WHY: the \xNN form must also spell hex digits a-f in lowercase. No other
    // case reaches that range: DEL is emitted from a fixed "\x7f" literal, not
    // from the per-nibble hex conversion.
    it("escapes control chars with a-f hex digits in lowercase", () => {
        expect(QuoteHeading("\x0b")).toBe("'\\x0b'");
        expect(QuoteHeading("\x1f")).toBe("'\\x1f'");
    });
    // WHY: DEL (0x7f) is ASCII but non-printable and must come out as the
    // literal "\x7f", even though it is not below 0x20.
    it("escapes DEL (0x7f) as \\x7f", () => {
        expect(QuoteHeading("\x7f")).toBe("'\\x7f'");
    });
    // WHY: a printable ASCII char at the 0x20 boundary (space) must NOT be
    // escaped — it falls through to the verbatim default, pinning the boundary
    // that separates printable from control.
    it("leaves a space and other printable ASCII unescaped", () => {
        expect(QuoteHeading("a b")).toBe("'a b'");
    });
    // WHY: the empty heading must still produce the two surrounding quotes — the
    // renderer wraps every heading and an empty one must not collapse to "".
    it("returns just the quotes for an empty string", () => {
        expect(QuoteHeading("")).toBe("''");
    });
    // WHY: an astral (>= 0x10000) char is two UTF-16 code units, each >= 0x80, so
    // both pass through verbatim and reassemble to the original — covers the
    // code-unit pass-through that the byte-equality note depends on.
    it("passes an astral (surrogate-pair) char through unchanged", () => {
        expect(QuoteHeading("a\u{1F600}b")).toBe("'a\u{1F600}b'");
    });
});
+ });