docsgov 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +242 -0
- package/dist/apispec/apispec.js +401 -0
- package/dist/apispec/apispec.test.js +444 -0
- package/dist/apispec/errors.js +17 -0
- package/dist/apispec/index.js +2 -0
- package/dist/check/doclinks.js +167 -0
- package/dist/check/index.js +8 -0
- package/dist/check/run.js +391 -0
- package/dist/check/run.test.js +513 -0
- package/dist/check/suggest.js +134 -0
- package/dist/check/suggest.test.js +92 -0
- package/dist/check/tokens.js +125 -0
- package/dist/cmd/main.js +330 -0
- package/dist/cmd/main.test.js +422 -0
- package/dist/codeq/cache.js +71 -0
- package/dist/codeq/cache.test.js +67 -0
- package/dist/codeq/errors.js +52 -0
- package/dist/codeq/grammars/tree-sitter-go.wasm +0 -0
- package/dist/codeq/grammars/tree-sitter-java.wasm +0 -0
- package/dist/codeq/grammars/tree-sitter-javascript.wasm +0 -0
- package/dist/codeq/grammars/tree-sitter-tsx.wasm +0 -0
- package/dist/codeq/grammars/tree-sitter-typescript.wasm +0 -0
- package/dist/codeq/index.js +11 -0
- package/dist/codeq/resolve.test.js +109 -0
- package/dist/codeq/resolver.js +128 -0
- package/dist/codeq/resolver.test.js +124 -0
- package/dist/codeq/resolvers/go.js +242 -0
- package/dist/codeq/resolvers/go.test.js +143 -0
- package/dist/codeq/resolvers/java.js +349 -0
- package/dist/codeq/resolvers/java.test.js +138 -0
- package/dist/codeq/resolvers/java_queries.js +63 -0
- package/dist/codeq/resolvers/javascript.js +412 -0
- package/dist/codeq/resolvers/javascript.test.js +125 -0
- package/dist/codeq/resolvers/javascript_queries.js +46 -0
- package/dist/codeq/resolvers/typescript.js +366 -0
- package/dist/codeq/resolvers/typescript.test.js +180 -0
- package/dist/codeq/resolvers/typescript_queries.js +78 -0
- package/dist/codeq/signature.js +50 -0
- package/dist/codeq/signature.test.js +50 -0
- package/dist/codeq/suggest.js +96 -0
- package/dist/codeq/treesitter.js +122 -0
- package/dist/codeq/treesitter.test.js +118 -0
- package/dist/config/config.js +74 -0
- package/dist/config/config.test.js +98 -0
- package/dist/config/fs.js +116 -0
- package/dist/config/glob.js +82 -0
- package/dist/config/glob.test.js +61 -0
- package/dist/config/index.js +4 -0
- package/dist/dedup/analyzer/analyzer.js +533 -0
- package/dist/dedup/analyzer/analyzer.test.js +530 -0
- package/dist/dedup/analyzer/canonical.js +74 -0
- package/dist/dedup/analyzer/canonical.test.js +70 -0
- package/dist/dedup/analyzer/cosine_clusters.js +169 -0
- package/dist/dedup/analyzer/cosine_clusters.test.js +131 -0
- package/dist/dedup/analyzer/distinctive.js +85 -0
- package/dist/dedup/analyzer/distinctive.test.js +49 -0
- package/dist/dedup/analyzer/exact_clusters.js +63 -0
- package/dist/dedup/analyzer/exact_clusters.test.js +81 -0
- package/dist/dedup/analyzer/index.js +14 -0
- package/dist/dedup/analyzer/multiplicity.js +110 -0
- package/dist/dedup/analyzer/multiplicity.test.js +123 -0
- package/dist/dedup/analyzer/order.js +22 -0
- package/dist/dedup/analyzer/partial_overlaps.js +65 -0
- package/dist/dedup/analyzer/partial_overlaps.test.js +161 -0
- package/dist/dedup/analyzer/preview.js +84 -0
- package/dist/dedup/analyzer/preview.test.js +46 -0
- package/dist/dedup/analyzer/safety.js +27 -0
- package/dist/dedup/analyzer/safety.test.js +39 -0
- package/dist/dedup/config.js +18 -0
- package/dist/dedup/configload.js +299 -0
- package/dist/dedup/configload.test.js +410 -0
- package/dist/dedup/dedup.index.test.js +203 -0
- package/dist/dedup/dedup.js +143 -0
- package/dist/dedup/dedup.test.js +212 -0
- package/dist/dedup/dedupcfg/config.js +112 -0
- package/dist/dedup/dedupcfg/config.test.js +70 -0
- package/dist/dedup/dedupcfg/index.js +1 -0
- package/dist/dedup/deduptypes/index.js +1 -0
- package/dist/dedup/deduptypes/types.js +9 -0
- package/dist/dedup/deduptypes/types.test.js +34 -0
- package/dist/dedup/embedder/cache.js +23 -0
- package/dist/dedup/embedder/cache.test.js +50 -0
- package/dist/dedup/embedder/constants.js +10 -0
- package/dist/dedup/embedder/embedder.js +76 -0
- package/dist/dedup/embedder/embedder.mock.test.js +128 -0
- package/dist/dedup/embedder/embedder.test.js +96 -0
- package/dist/dedup/embedder/errors.js +20 -0
- package/dist/dedup/embedder/errors.test.js +35 -0
- package/dist/dedup/embedder/index.js +4 -0
- package/dist/dedup/embedder/session.js +78 -0
- package/dist/dedup/embedder/session.test.js +172 -0
- package/dist/dedup/gitignore.js +97 -0
- package/dist/dedup/gitignore.test.js +98 -0
- package/dist/dedup/index.js +11 -0
- package/dist/dedup/indexdb/errors.js +48 -0
- package/dist/dedup/indexdb/index.js +6 -0
- package/dist/dedup/indexdb/indexdb.js +302 -0
- package/dist/dedup/indexdb/indexdb.test.js +739 -0
- package/dist/dedup/indexdb/load.js +110 -0
- package/dist/dedup/indexdb/migrations.js +58 -0
- package/dist/dedup/indexdb/schema.js +83 -0
- package/dist/dedup/indexer/index.js +9 -0
- package/dist/dedup/indexer/indexer.js +501 -0
- package/dist/dedup/indexer/indexer.test.js +510 -0
- package/dist/dedup/indexer/links.js +89 -0
- package/dist/dedup/mdsection/anchor.js +60 -0
- package/dist/dedup/mdsection/anchor.test.js +39 -0
- package/dist/dedup/mdsection/blocks.js +409 -0
- package/dist/dedup/mdsection/blocks.test.js +359 -0
- package/dist/dedup/mdsection/index.js +4 -0
- package/dist/dedup/mdsection/parse.js +21 -0
- package/dist/dedup/mdsection/section.js +234 -0
- package/dist/dedup/mdsection/section.test.js +221 -0
- package/dist/dedup/report/floatfmt.js +71 -0
- package/dist/dedup/report/floatfmt.test.js +42 -0
- package/dist/dedup/report/index.js +8 -0
- package/dist/dedup/report/quote.js +77 -0
- package/dist/dedup/report/quote.test.js +67 -0
- package/dist/dedup/report/text.js +251 -0
- package/dist/dedup/report/text.test.js +420 -0
- package/dist/dedup/report_types.js +8 -0
- package/dist/dedup/sectionid/index.js +1 -0
- package/dist/dedup/sectionid/sectionid.js +16 -0
- package/dist/dedup/sectionid/sectionid.test.js +49 -0
- package/dist/guard/api/errors.js +12 -0
- package/dist/guard/api/index.js +2 -0
- package/dist/guard/api/parser.js +81 -0
- package/dist/guard/api/parser.test.js +58 -0
- package/dist/guard/api/types.js +1 -0
- package/dist/guard/code/errors.js +16 -0
- package/dist/guard/code/index.js +2 -0
- package/dist/guard/code/parser.js +54 -0
- package/dist/guard/code/parser.test.js +111 -0
- package/dist/guard/code/types.js +6 -0
- package/dist/index.js +1 -0
- package/dist/index.test.js +5 -0
- package/dist/repo/boundary.js +92 -0
- package/dist/repo/boundary.test.js +65 -0
- package/dist/repo/errors.js +56 -0
- package/dist/repo/errors.test.js +85 -0
- package/dist/repo/exists.test.js +72 -0
- package/dist/repo/filename.js +46 -0
- package/dist/repo/filename.test.js +39 -0
- package/dist/repo/fs.js +53 -0
- package/dist/repo/index.js +7 -0
- package/dist/repo/overlay.js +36 -0
- package/dist/repo/overlay.test.js +80 -0
- package/dist/repo/repo.js +353 -0
- package/dist/repo/repo.test.js +255 -0
- package/dist/repo/testutil.js +27 -0
- package/dist/repo/write.test.js +125 -0
- package/dist/report/color.js +73 -0
- package/dist/report/index.js +1 -0
- package/dist/report/report.js +112 -0
- package/dist/report/report.test.js +368 -0
- package/dist/violation/index.js +1 -0
- package/dist/violation/types.js +22 -0
- package/dist/violation/types.test.js +70 -0
- package/package.json +48 -0
|
@@ -0,0 +1,530 @@
|
|
|
1
|
+
// Ported from the TestAnalyze_* cases in internal/dedup/analyzer/analyzer_test.go.
|
|
2
|
+
//
|
|
3
|
+
// These are the end-to-end spec for the layered pipeline: they pin which layer
|
|
4
|
+
// fires (L1 exact / L2 promotion / L3 cosine HIGH-vs-MAYBE / L4 differentiator /
|
|
5
|
+
// L5 blocks), how confidence and recommended action are derived, when groups and
|
|
6
|
+
// pairs are dropped, and that block-level overlaps are wired through. The
|
|
7
|
+
// per-primitive behavior (clusters, canonical rank, preview, distinctive tokens,
|
|
8
|
+
// differentiators, multiplicity, suppression) is covered by the sibling
|
|
9
|
+
// *.test.ts files; this file exercises the orchestrator that composes them.
|
|
10
|
+
import { describe, it, expect } from "vitest";
|
|
11
|
+
import { defaultConfig } from "../dedupcfg/index.js";
|
|
12
|
+
import { analyze } from "./analyzer.js";
|
|
13
|
+
// Default dedup configuration passed to analyze() by the cases in this file;
// a case that needs different settings builds its own copy via defaultConfig().
const cfg = defaultConfig();
|
|
14
|
+
// makeSection builds a minimal Section (TS snake_case shape) mirroring the Go
|
|
15
|
+
// makeSection helper: StartLine=1, EndLine=10, prose_word_count=15.
|
|
16
|
+
/**
 * Test fixture factory: assembles a section record in the snake_case shape
 * the analyzer is fed in this file.
 *
 * @param id          section identifier (also the key used in the embedding map)
 * @param filePath    value stored as file_path
 * @param heading     heading text; reused verbatim as embed_text
 * @param level       value stored as heading_level
 * @param anchor      value stored as anchor
 * @param contentHash value stored as content_hash
 * @param rawContent  value stored as raw_content
 * @param inbound     value stored as inbound_count
 * @returns a fresh plain object; line range, word count and table/code flags are fixed
 */
function makeSection(id, filePath, heading, level, anchor, contentHash, rawContent, inbound) {
    const section = {
        // Identity and location supplied by the caller.
        id: id,
        file_path: filePath,
        heading: heading,
        heading_level: level,
        anchor: anchor,
        // Fixed span: every fixture section covers lines 1..10.
        start_line: 1,
        end_line: 10,
        // Content supplied by the caller.
        content_hash: contentHash,
        raw_content: rawContent,
        // The heading doubles as the text that would be embedded.
        embed_text: heading,
        // Fixed content statistics shared by all fixtures.
        prose_word_count: 15,
        has_table: false,
        has_code: false,
        inbound_count: inbound,
    };
    return section;
}
|
|
34
|
+
// makeUnitVec2D returns the 2D unit vector in the direction (x, y); the analyzer
|
|
35
|
+
// treats embeddings as L2-normalized so the dot product equals cosine.
|
|
36
|
+
/**
 * Returns the 2D unit vector pointing in the direction (x, y).
 *
 * The length is computed with Math.hypot rather than sqrt(x*x + y*y) so that
 * very large or very small components do not overflow to Infinity or
 * underflow to 0 while being squared, which would otherwise collapse a valid
 * direction to [0, 0].
 *
 * @param x horizontal component
 * @param y vertical component
 * @returns [x / |v|, y / |v|], or [0, 0] when the input is the zero vector
 */
function makeUnitVec2D(x, y) {
    const norm = Math.hypot(x, y);
    // The zero vector has no direction; return it unchanged instead of dividing by 0.
    if (norm === 0) {
        return [0, 0];
    }
    return [x / norm, y / norm];
}
|
|
43
|
+
// emb builds the section-id -> vector map the analyzer consumes.
|
|
44
|
+
/**
 * Converts a plain `{ key: vector }` object into a Map with the same keys
 * and values, in the object's own enumeration order.
 *
 * @param entries object whose own enumerable string keys map to vectors
 * @returns a new Map holding one entry per key of `entries`
 */
function emb(entries) {
    const byKey = new Map();
    for (const [key, vector] of Object.entries(entries)) {
        byKey.set(key, vector);
    }
    return byKey;
}
|
|
47
|
+
// L1 must union sections that share a content_hash (cross-file) into a HIGH
|
|
48
|
+
// group and mark the duplicate ExactMatch, with no MAYBE pair — exact identity
|
|
49
|
+
// is the strongest signal and short-circuits cosine.
|
|
50
|
+
describe("analyze L1 exact grouping", () => {
    it("groups equal-hash cross-file sections as HIGH with ExactMatch and no MAYBE pairs", () => {
        // Same hash and body, two different files.
        const sharedHash = "abcdef1234567890";
        const body = "## Overview\nSome content here for testing.";
        const sections = [
            ["id1", "docs/a.md"],
            ["id2", "docs/b.md"],
        ].map(([id, path]) => makeSection(id, path, "Overview", 2, "overview", sharedHash, body, 0));
        const unit = makeUnitVec2D(1, 0);
        const embeddings = emb({ id1: unit, id2: [...unit] });

        const report = analyze(sections, embeddings, null, null, cfg);

        expect(report.HighGroups.length).toBeGreaterThan(0);
        expect(report.MaybePairs).toHaveLength(0);
        const firstGroup = report.HighGroups[0];
        expect(firstGroup.Members.length).toBeGreaterThanOrEqual(1);
        // The duplicate member is expected to carry the ExactMatch flag.
        const anyExact = firstGroup.Members.some((member) => member.ExactMatch);
        expect(anyExact).toBe(true);
    });
    it("groups same-file equal-hash sections too (L1 does not skip same-file)", () => {
        // Both sections live in docs/a.md but share one content hash.
        const sharedHash = "samehash1234567890";
        const overview = makeSection("id1", "docs/a.md", "Overview", 2, "overview", sharedHash, "## Overview\nSome test content.", 0);
        const summary = makeSection("id2", "docs/a.md", "Summary", 3, "summary", sharedHash, "## Summary\nSome test content.", 0);
        const unit = makeUnitVec2D(1, 0);

        const { HighGroups } = analyze([overview, summary], emb({ id1: unit, id2: [...unit] }), null, null, cfg);

        expect(HighGroups.length).toBeGreaterThan(0);
    });
});
|
|
74
|
+
// L3 turns a high-cosine cross-file pair into a HIGH group, but L3 SKIPS
|
|
75
|
+
// same-file pairs entirely (neither HIGH nor MAYBE), and a mid-band cosine
|
|
76
|
+
// without a shared distinctive heading token stays a MAYBE pair (never unioned).
|
|
77
|
+
describe("analyze L3 cosine grouping", () => {
    it("groups a cross-file cosine>=thresh_high pair as HIGH", () => {
        const body = "## Order Lifecycle\nThe order transitions through states.";
        const inA = makeSection("id1", "docs/a.md", "Order Lifecycle", 2, "order-lifecycle", "hash1", body, 0);
        const inB = makeSection("id2", "docs/b.md", "Order Lifecycle", 2, "order-lifecycle", "hash2", body, 0);
        // Both sections get the same direction, so their cosine is 1.0.
        const direction = makeUnitVec2D(0.6, 0.8);

        const { HighGroups } = analyze([inA, inB], emb({ id1: direction, id2: [...direction] }), null, null, cfg);

        expect(HighGroups.length).toBeGreaterThan(0);
    });
    it("skips a same-file pair entirely (no HIGH group, no MAYBE pair)", () => {
        // Distinct hashes, identical embeddings, but both sections are in docs/a.md.
        const lifecycle = makeSection("id1", "docs/a.md", "Order Lifecycle", 2, "order-lifecycle", "hash1", "## Order Lifecycle\nThe order transitions through states.", 0);
        const states = makeSection("id2", "docs/a.md", "Order States", 2, "order-states", "hash2", "## Order States\nThe order state machine details.", 0);
        const direction = makeUnitVec2D(0.6, 0.8);

        const { HighGroups, MaybePairs } = analyze([lifecycle, states], emb({ id1: direction, id2: [...direction] }), null, null, cfg);

        expect(HighGroups).toHaveLength(0);
        expect(MaybePairs).toHaveLength(0);
    });
    it("reports a mid-band cosine without shared distinctive token as a MAYBE pair, not a group", () => {
        // The two headings have no distinctive token in common, so L2 promotion
        // does not apply; the ~0.90 cosine lands in [thresh_maybe, thresh_high).
        const uploads = makeSection("id1", "docs/a.md", "Uploading Documents", 2, "uploading-documents", "hash1", "## Uploading Documents\nContent about uploading various files.", 0);
        const downloads = makeSection("id2", "docs/b.md", "Downloading Reports", 2, "downloading-reports", "hash2", "## Downloading Reports\nContent about downloading various files.", 0);
        const angle = 0.451; // arccos(0.90)
        const embeddings = emb({
            id1: [1.0, 0.0],
            id2: [Math.cos(angle), Math.sin(angle)],
        });

        const { HighGroups, MaybePairs } = analyze([uploads, downloads], embeddings, null, null, cfg);

        expect(MaybePairs.length).toBeGreaterThan(0);
        expect(HighGroups).toHaveLength(0);
    });
});
|
|
107
|
+
// L4 differentiators NEVER demote the confidence tier — they only force the
|
|
108
|
+
// recommended action to manual_review. A cosine=1.0 sync/async pair must stay
|
|
109
|
+
// confidence="high" but action="manual_review".
|
|
110
|
+
describe("analyze L4 differentiator", () => {
    it("keeps confidence high but forces action=manual_review when a differentiator is present", () => {
        // Headings differ only by sync vs. async; embeddings are identical (cosine 1.0).
        const syncSection = makeSection("id1", "docs/a.md", "Sync Processing", 2, "sync-processing", "hash1", "## Sync Processing\nThis section covers sync operations in detail.", 0);
        const asyncSection = makeSection("id2", "docs/b.md", "Async Processing", 2, "async-processing", "hash2", "## Async Processing\nThis section covers async operations in detail.", 0);
        const direction = makeUnitVec2D(0.6, 0.8);

        const { HighGroups } = analyze([syncSection, asyncSection], emb({ id1: direction, id2: [...direction] }), null, null, cfg);

        expect(HighGroups.length).toBeGreaterThan(0);
        const firstGroup = HighGroups[0];
        // The tier stays "high"; only the recommended action changes.
        expect(firstGroup.Confidence).toBe("high");
        expect(firstGroup.Action).toBe("manual_review");
    });
});
|
|
122
|
+
// A group whose every member is disqualified (here both headings are the
|
|
123
|
+
// blacklisted "Related") carries no actionable canonical, so it is dropped from
|
|
124
|
+
// the report entirely.
|
|
125
|
+
describe("analyze disqualified group filtering", () => {
    it("drops a group where every member is disqualified", () => {
        // Identical "Related" sections in two files: same hash, same body.
        const sharedHash = "disqhash12345678";
        const body = "## Related\nSee also other sections.";
        const sections = [
            ["id1", "docs/a.md"],
            ["id2", "docs/b.md"],
        ].map(([id, path]) => makeSection(id, path, "Related", 2, "related", sharedHash, body, 0));
        const unit = makeUnitVec2D(1, 0);

        const { HighGroups } = analyze(sections, emb({ id1: unit, id2: [...unit] }), null, null, cfg);

        expect(HighGroups).toHaveLength(0);
    });
});
|
|
136
|
+
// Confidence/action mapping: an all-exact (or all-HIGH) group with no
|
|
137
|
+
// differentiator yields confidence="high" and action="replace_with_reference".
|
|
138
|
+
describe("analyze confidence mapping", () => {
    it("maps an all-exact group to high + replace_with_reference", () => {
        // Exact duplicates (same hash) living under two different doc folders.
        const sharedHash = "highconfhash1234";
        const body = "## Lifecycle\nLong content here for canonical test.";
        const inConcepts = makeSection("id1", "docs/concepts/a.md", "Lifecycle", 2, "lifecycle", sharedHash, body, 0);
        const inDesign = makeSection("id2", "docs/design/b.md", "Lifecycle", 2, "lifecycle", sharedHash, body, 0);
        const unit = makeUnitVec2D(1, 0);

        const { HighGroups } = analyze([inConcepts, inDesign], emb({ id1: unit, id2: [...unit] }), null, null, cfg);

        expect(HighGroups.length).toBeGreaterThan(0);
        const firstGroup = HighGroups[0];
        expect(firstGroup.Confidence).toBe("high");
        expect(firstGroup.Action).toBe("replace_with_reference");
    });
});
|
|
152
|
+
// Member previews are pre-computed by the orchestrator (so the renderer never
|
|
153
|
+
// re-derives them): the canonical's Preview is non-empty and has the heading
|
|
154
|
+
// line stripped.
|
|
155
|
+
describe("analyze member preview", () => {
    it("pre-computes the canonical member preview with the heading line stripped", () => {
        const sharedHash = "prevhash12345678";
        const body = "## Lifecycle\nThis is the detailed content of this lifecycle section.";
        const sections = [
            ["id1", "docs/a.md"],
            ["id2", "docs/b.md"],
        ].map(([id, path]) => makeSection(id, path, "Lifecycle", 2, "lifecycle", sharedHash, body, 0));
        const unit = makeUnitVec2D(1, 0);

        const { HighGroups } = analyze(sections, emb({ id1: unit, id2: [...unit] }), null, null, cfg);

        expect(HighGroups.length).toBeGreaterThan(0);
        const canonicalPreview = () => HighGroups[0].Canonical.Preview;
        // The preview must exist and must not repeat the markdown heading line.
        expect(canonicalPreview()).not.toBe("");
        expect(canonicalPreview()).not.toContain("## Lifecycle");
    });
});
|
|
169
|
+
// L5 wiring: a verbatim cross-file block surfaces as ONE exact PartialOverlap
|
|
170
|
+
// cluster and is NOT re-reported as a cosine cluster (exact-pass excludeHashes
|
|
171
|
+
// suppresses the duplicate). Passing null blocks yields an empty PartialOverlaps
|
|
172
|
+
// without disturbing the L1-L4 output.
|
|
173
|
+
describe("analyze L5 partial overlaps", () => {
    it("populates PartialOverlaps from a verbatim block and dedups exact-vs-cosine", () => {
        // Two unrelated sections (different hashes, orthogonal embeddings, so
        // no L3 match) that each contain one block with the same content hash.
        const verbatimHash = "verbatim-block-hash-xyz";
        const overview = makeSection("secA", "docs/a.md", "Overview", 2, "overview", "secHash12345678", "## Overview\nSome unique content here.", 0);
        const details = makeSection("secB", "docs/b.md", "Details", 2, "details", "differentHash1", "## Details\nCompletely different content.", 0);
        const sectionEmb = emb({
            secA: makeUnitVec2D(1, 0),
            secB: makeUnitVec2D(0, 1),
        });
        const blocks = [
            mkBlock("secA", "docs/a.md", "Overview", "prose", 3, 6, verbatimHash),
            mkBlock("secB", "docs/b.md", "Details", "prose", 4, 7, verbatimHash),
        ];
        // The shared hash also has a block embedding, so it would be a cosine
        // candidate unless the exact pass excludes it.
        const blockEmb = emb({ [verbatimHash]: makeUnitVec2D(0.5, 0.5) });

        const rep = analyze([overview, details], sectionEmb, blocks, blockEmb, cfg);

        expect(rep.PartialOverlaps).toHaveLength(1);
        const cluster = rep.PartialOverlaps[0];
        expect(cluster.Exact).toBe(true);
        expect(cluster.Kind).toBe("prose");
        expect(cluster.ContentHash).toBe(verbatimHash);
        expect(cluster.Locations).toHaveLength(2);
        // Count clusters for the shared hash, split by exact vs. non-exact:
        // exactly one exact cluster, and no cosine cluster.
        const countWhere = (wantExact) => rep.PartialOverlaps.filter((c) => c.ContentHash === verbatimHash && Boolean(c.Exact) === wantExact).length;
        const exactCount = countWhere(true);
        const cosineCount = countWhere(false);
        expect(exactCount).toBe(1);
        expect(cosineCount).toBe(0);
    });
    it("yields empty PartialOverlaps for nil blocks without disturbing L1-L4 output", () => {
        // Exact-duplicate sections, with null passed for blocks and block embeddings.
        const sharedHash = "noBlockHash12345";
        const body = "## Lifecycle\nContent for lifecycle section testing.";
        const sections = [
            ["id1", "docs/a.md"],
            ["id2", "docs/b.md"],
        ].map(([id, path]) => makeSection(id, path, "Lifecycle", 2, "lifecycle", sharedHash, body, 0));
        const unit = makeUnitVec2D(1, 0);

        const { PartialOverlaps, HighGroups, MaybePairs } = analyze(sections, emb({ id1: unit, id2: [...unit] }), null, null, cfg);

        expect(PartialOverlaps).toHaveLength(0);
        expect(HighGroups.length).toBeGreaterThan(0);
        expect(MaybePairs).toHaveLength(0);
    });
});
|
|
213
|
+
// Fewer than two sections cannot form a pair or group, so analyze() must
|
|
214
|
+
// short-circuit to a fully-empty report before doing any embedding work. Pinned
|
|
215
|
+
// because the rest of the pipeline assumes n>=2 (pair loops, union-find).
|
|
216
|
+
describe("analyze trivial input", () => {
    // WHY: one section has nothing to be a duplicate of, so the report must
    // come back empty and well-formed.
    it("returns an empty report for fewer than two sections", () => {
        const lone = makeSection("only", "docs/a.md", "Solo", 2, "solo", "h", "## Solo\nx", 0);
        const embeddings = emb({ only: makeUnitVec2D(1, 0) });

        const { HighGroups, MaybePairs, PartialOverlaps } = analyze([lone], embeddings, null, null, cfg);

        expect(HighGroups).toHaveLength(0);
        expect(MaybePairs).toHaveLength(0);
        expect(PartialOverlaps).toHaveLength(0);
    });
    // WHY: the empty input is the degenerate boundary of the same n<2 case;
    // it must not throw and must also produce an empty report.
    it("returns an empty report for zero sections", () => {
        const { HighGroups, MaybePairs, PartialOverlaps } = analyze([], new Map(), null, null, cfg);

        expect(HighGroups).toHaveLength(0);
        expect(MaybePairs).toHaveLength(0);
        expect(PartialOverlaps).toHaveLength(0);
    });
});
|
|
235
|
+
// L2 promotion is the layer that rescues a mid-band cosine pair from MAYBE to
|
|
236
|
+
// HIGH when the headings share a distinctive token. Without it, near-duplicate
|
|
237
|
+
// sections with telltale shared headings would be under-reported as mere pairs.
|
|
238
|
+
describe("analyze L2 heading promotion", () => {
    // WHY: a cosine in [thresh_maybe, thresh_high) combined with a shared
    // distinctive heading token has to end up as a HIGH group rather than a
    // MAYBE pair. This pins the promotion branch driven by the heading signal.
    it("promotes a mid-band cosine pair with a shared distinctive heading token to HIGH", () => {
        // The headings share three distinctive tokens (zebra, payment, apple).
        // The reason string lists them in lexicographic order (apple, payment,
        // zebra), so the comparator is exercised in both directions.
        const heading = "Zebra Payment Apple";
        const slug = "zebra-payment-apple";
        const first = makeSection("id1", "docs/a.md", heading, 2, slug, "h1", "## Zebra Payment Apple\nReconcile incoming payments daily here.", 0);
        const second = makeSection("id2", "docs/b.md", heading, 2, slug, "h2", "## Zebra Payment Apple\nReconcile the incoming payments daily here.", 0);
        // cosine ~0.90 falls inside [0.86, 0.93); the shared distinctive tokens
        // (freq 2 <= distinctive threshold 3) are what trigger the promotion.
        const angle = Math.acos(0.9);
        const embeddings = emb({
            id1: [1, 0],
            id2: [Math.cos(angle), Math.sin(angle)],
        });

        const { HighGroups, MaybePairs } = analyze([first, second], embeddings, null, null, cfg);

        expect(HighGroups.length).toBeGreaterThan(0);
        expect(MaybePairs).toHaveLength(0);
        // The members' reasons must mention the shared heading tokens, sorted.
        const reasonLists = HighGroups[0].Members.flatMap((member) => member.Reasons);
        const joinedReasons = reasonLists.join(" ");
        expect(joinedReasons).toContain("shared distinctive heading tokens: apple, payment, zebra");
    });
});
|
|
262
|
+
// A custom heading_token_min_len of 0 forces the analyzer to fall back to the
|
|
263
|
+
// default minimum (3). Pinned because the fallback guards against a misconfigured
|
|
264
|
+
// length that would otherwise admit one- and two-letter heading tokens.
|
|
265
|
+
describe("analyze min-token-len fallback", () => {
    // WHY: a heading_token_min_len <= 0 is expected to be coerced to 3, making
    // the distinctive filter behave as it does with the default. The assertion
    // only checks that grouping still works on the <=0 fallback path.
    it("coerces a non-positive heading_token_min_len to the default", () => {
        // Start from a fresh default config and zero out the minimum token length.
        const custom = defaultConfig();
        custom.Markdown.heading_token_min_len = 0;

        const sharedHash = "minlenhash123456";
        const body = "## Reconciliation\nSome content for the fallback test here.";
        const sections = [
            ["id1", "docs/a.md"],
            ["id2", "docs/b.md"],
        ].map(([id, path]) => makeSection(id, path, "Reconciliation", 2, "reconciliation", sharedHash, body, 0));
        const unit = makeUnitVec2D(1, 0);

        const { HighGroups } = analyze(sections, emb({ id1: unit, id2: [...unit] }), null, null, custom);

        expect(HighGroups.length).toBeGreaterThan(0);
    });
});
|
|
281
|
+
// L1 must union three sections that share a content_hash into ONE group; the
|
|
282
|
+
// third union sees an already-merged root, exercising the no-op union branch.
|
|
283
|
+
describe("analyze L1 three-way exact", () => {
    // WHY: three cross-file sections with one hash must merge into a single
    // HIGH group instead of three pairwise groups. This pins union-find
    // idempotence (the third union, where ra===rb, does nothing) and the
    // all-exact -> high confidence mapping.
    it("collapses three equal-hash sections into one HIGH group", () => {
        const sharedHash = "triplehash123456";
        const body = "## Shared\nThe very same content repeated across three docs.";
        const sections = [
            ["id1", "docs/a.md"],
            ["id2", "docs/b.md"],
            ["id3", "docs/c.md"],
        ].map(([id, path]) => makeSection(id, path, "Shared", 2, "shared", sharedHash, body, 0));
        const unit = makeUnitVec2D(1, 0);
        const embeddings = emb({ id1: unit, id2: [...unit], id3: [...unit] });

        const { HighGroups } = analyze(sections, embeddings, null, null, cfg);

        expect(HighGroups).toHaveLength(1);
        // One canonical plus two duplicate members accounts for all three sections.
        expect(HighGroups[0].Members).toHaveLength(2);
        expect(HighGroups[0].Confidence).toBe("high");
    });
});
|
|
301
|
+
// A transitive HIGH group is built when A~B and B~C cross the high bar but A~C is
|
|
302
|
+
// below thresh_maybe (no recorded pair). The canonical's link to the far member
|
|
303
|
+
// must be reconstructed via the best transitive pair — this is the contract that
|
|
304
|
+
// keeps a distant group member's similarity/reason non-empty.
|
|
305
|
+
describe("analyze transitive group", () => {
    // WHY: when the canonical has no DIRECT recorded pair with a transitively
    // grouped member, buildGroup must fall back to the best pair among earlier
    // members (findBestPairInfo) rather than emitting a 0-similarity bare member.
    // A regression that dropped this fallback would mislabel real duplicates.
    it("reconstructs a transitive member's similarity from the best intermediate pair", () => {
        // A is the best canonical (docs/concepts/ has top path priority). B ranks
        // before C (longer raw_content -> better NegLen) so sorted = [A, B, C].
        // A·B = 0.95 (HIGH, unioned). B·C = 0.94 (HIGH, unioned). A·C = 0.80
        // (< thresh_maybe, NOT recorded) -> C is only transitively in the group, with
        // no direct (A,C) pair. Headings share no token (no L2 promotion noise).
        const sA = makeSection("idA", "docs/concepts/a.md", "Alpha", 2, "alpha", "hA", "## Alpha\nAlpha content here.", 0);
        const sB = makeSection("idB", "docs/guides/b.md", "Bravo", 2, "bravo", "hB", "## Bravo\nBravo content here padded longer to win NegLen tiebreak.", 0);
        const sC = makeSection("idC", "docs/guides/c.md", "Charlie", 2, "charlie", "hC", "## Charlie\nShort.", 0);
        const vA = [1, 0, 0];
        const vB = [0.95, Math.sqrt(1 - 0.95 * 0.95), 0];
        const vC = [0.8, 0.5765, 0.166]; // |vC|~=1; A·C=0.80, B·C~=0.94
        const rep = analyze([sA, sB, sC], emb({ idA: vA, idB: vB, idC: vC }), null, null, cfg);
        expect(rep.HighGroups).toHaveLength(1);
        const grp = rep.HighGroups[0];
        // A wins canonical via path priority.
        expect(grp.Canonical.SectionID).toBe("idA");
        expect(grp.Members).toHaveLength(2);
        // The transitive member (idC) has no direct (A,C) pair; buildGroup falls back
        // to the best pair among earlier members and finds (B,C), so idC carries that
        // pair's similarity (not the 0.0 placeholder).
        const cMember = grp.Members.find((m) => m.SectionID === "idC");
        // Assert presence first: if idC is missing from the group, the test fails
        // with a clear expectation error instead of a TypeError on the next line.
        expect(cMember).toBeDefined();
        expect(cMember.Similarity).toBeGreaterThan(0.9);
    });
    // WHY: when ALL three pairwise links are recorded and one is a plain MAYBE
    // (mid-band, no shared token) yet the trio is still grouped via promotion/high,
    // the group confidence must drop to "medium" (its weakest internal pair is not
    // high/promoted). This pins the medium-tier mapping the renderer keys on.
    it("downgrades a group with a non-high internal pair to medium confidence", () => {
        // A·B = 0.94 (HIGH). B·C = 0.90 + shared "settlement" -> promoted (HIGH).
        // A·C = 0.90, NO shared distinctive token -> plain MAYBE (recorded, internal).
        const sA = makeSection("idA", "docs/a.md", "Payment Reconciliation", 2, "payment-reconciliation", "hA", "## Payment Reconciliation\nReconcile payments across the ledger.", 0);
        const sB = makeSection("idB", "docs/b.md", "Payment Settlement", 2, "payment-settlement", "hB", "## Payment Settlement\nSettle payments across the ledger.", 0);
        const sC = makeSection("idC", "docs/c.md", "Settlement Workflow", 2, "settlement-workflow", "hC", "## Settlement Workflow\nWorkflow that drives settlement.", 0);
        // vA=(1,0,0); A·B=0.94; A·C=0.90; B·C=0.90.
        const vA = [1, 0, 0];
        const vB = [0.94, Math.sqrt(1 - 0.94 * 0.94), 0]; // (0.94, 0.34117..., 0)
        // Solve vC: c1=0.90; B·C=0.94*0.90 + 0.34117*c2 = 0.90 -> c2=0.15828;
        // c3=sqrt(1 - 0.90^2 - 0.15828^2), which makes vC unit length.
        const c1 = 0.9;
        const c2 = (0.9 - 0.94 * 0.9) / Math.sqrt(1 - 0.94 * 0.94);
        const c3 = Math.sqrt(1 - c1 * c1 - c2 * c2);
        const vC = [c1, c2, c3];
        const rep = analyze([sA, sB, sC], emb({ idA: vA, idB: vB, idC: vC }), null, null, cfg);
        expect(rep.HighGroups).toHaveLength(1);
        expect(rep.HighGroups[0].Confidence).toBe("medium");
        expect(rep.HighGroups[0].Action).toBe("manual_review");
        // The plain-MAYBE (A,C) pair is internal to the group, so it is NOT also
        // emitted as a standalone MAYBE pair.
        expect(rep.MaybePairs).toHaveLength(0);
    });
});
|
|
362
|
+
// Possible (MAYBE) pairs are filtered: a pair whose BOTH endpoints are
|
|
363
|
+
// disqualified carries no actionable canonical and must be dropped before it
|
|
364
|
+
// reaches the report. Pinned because surfacing an un-actionable pair wastes
|
|
365
|
+
// reviewer time.
|
|
366
|
+
describe("analyze MAYBE pair filtering", () => {
    // Shared geometry for both cases: two 2D unit vectors 0.451 rad apart
    // (arccos(0.90)), i.e. a mid-band cosine between "id1" and "id2".
    const MID_BAND_ANGLE = 0.451;
    const midBandEmbeddings = () => emb({
        id1: [1, 0],
        id2: [Math.cos(MID_BAND_ANGLE), Math.sin(MID_BAND_ANGLE)],
    });
    // WHY: both sections live under blacklisted (deprecated) paths, so even a
    // genuine mid-band pair (no shared distinctive heading token) has no
    // canonical worth keeping and must be dropped.
    it("drops a MAYBE pair when both endpoints are disqualified", () => {
        const first = makeSection("id1", "docs/deprecated/a.md", "Uploading Documents", 2, "uploading-documents", "h1", "## Uploading Documents\nContent about uploading various files here.", 0);
        const second = makeSection("id2", "docs/deprecated/b.md", "Downloading Reports", 2, "downloading-reports", "h2", "## Downloading Reports\nContent about downloading various files here.", 0);
        const report = analyze([first, second], midBandEmbeddings(), null, null, cfg);
        expect(report.MaybePairs).toHaveLength(0);
        expect(report.HighGroups).toHaveLength(0);
    });
    // WHY: if the second endpoint (index j) of a surviving MAYBE pair ranks
    // BETTER than the first (index i), j must become the canonical. This pins
    // the canonical-by-rank else branch in buildPair; picking the wrong side
    // would steer reviewers toward keeping the worse section.
    it("elects the higher-priority endpoint as the pair canonical even when it is index j", () => {
        // Index 0 is the worse candidate (plain path, deep heading); index 1 is
        // the better one (docs/concepts/).
        const worse = makeSection("id1", "docs/misc/a.md", "Uploading Documents", 4, "uploading-documents", "h1", "## Uploading Documents\nContent about uploading various files here.", 0);
        const better = makeSection("id2", "docs/concepts/b.md", "Downloading Reports", 2, "downloading-reports", "h2", "## Downloading Reports\nContent about downloading various files here.", 0);
        const report = analyze([worse, better], midBandEmbeddings(), null, null, cfg);
        expect(report.MaybePairs).toHaveLength(1);
        // The better-ranked section (index j = id2) wins the canonical slot.
        const pair = report.MaybePairs[0];
        expect(pair.Canonical.SectionID).toBe("id2");
        expect(pair.Candidate.SectionID).toBe("id1");
    });
});
|
|
397
|
+
// Output ordering is part of the report contract: HIGH groups are sorted by
|
|
398
|
+
// (-size, -best_similarity) so the biggest/strongest duplicates lead, and MAYBE
|
|
399
|
+
// pairs are sorted by descending similarity. Pinned because a renderer that
|
|
400
|
+
// trusted input order would present duplicates in a meaningless sequence.
|
|
401
|
+
describe("analyze output ordering", () => {
    // WHY: two HIGH groups of different sizes must come out largest-first
    // (sortGroups via groupScore's -size term). Losing the size ordering would
    // bury the most impactful duplicate group beneath a smaller one.
    it("orders HIGH groups largest-first", () => {
        // Group X: 2 exact sections (hashX). Group Y: 3 exact sections (hashY).
        const rawX = "## Topic X\nContent X repeated verbatim across two files here.";
        const rawY = "## Topic Y\nContent Y repeated verbatim across three files here.";
        const topicX = (id) => makeSection(id, `docs/${id}.md`, "Topic X", 2, "topic-x", "hashX", rawX, 0);
        const topicY = (id) => makeSection(id, `docs/${id}.md`, "Topic Y", 2, "topic-y", "hashY", rawY, 0);
        const sections = [topicX("x1"), topicX("x2"), topicY("y1"), topicY("y2"), topicY("y3")];
        // X embeds along dim 0 and Y along dim 1 (orthogonal), so the two exact
        // groups can never be cross-unioned by the L3 cosine pass.
        const alongX = [1, 0];
        const alongY = [0, 1];
        const vectors = emb({
            x1: alongX,
            x2: [...alongX],
            y1: alongY,
            y2: [...alongY],
            y3: [...alongY],
        });
        const report = analyze(sections, vectors, null, null, cfg);
        expect(report.HighGroups).toHaveLength(2);
        // The 3-section group (Y) sorts ahead of the 2-section group (X).
        const [leading, trailing] = [report.HighGroups[0], report.HighGroups[1]];
        expect(leading.Members).toHaveLength(2); // canonical + 2 dups = 3
        expect(trailing.Members).toHaveLength(1); // canonical + 1 dup = 2
        expect(leading.Canonical.Heading).toBe("Topic Y");
    });
    // WHY: two surviving MAYBE pairs with different similarities must be listed
    // most-similar-first (sortBySimDesc). Reviewers read from the top, so the
    // strongest candidate has to lead.
    it("orders MAYBE pairs by descending similarity", () => {
        // Pair 1: cosine ~0.91. Pair 2: cosine ~0.88. Neither shares a
        // distinctive heading token, so both stay MAYBE (no promotion, no union).
        const sections = [
            makeSection("p1a", "docs/p1a.md", "Uploading Documents", 2, "uploading-documents", "h1a", "## Uploading Documents\nContent about uploading various files here.", 0),
            makeSection("p1b", "docs/p1b.md", "Downloading Reports", 2, "downloading-reports", "h1b", "## Downloading Reports\nContent about downloading various files here.", 0),
            makeSection("p2a", "docs/p2a.md", "Sending Emails", 2, "sending-emails", "h2a", "## Sending Emails\nContent about sending various messages here.", 0),
            makeSection("p2b", "docs/p2b.md", "Receiving Faxes", 2, "receiving-faxes", "h2b", "## Receiving Faxes\nContent about receiving various messages here.", 0),
        ];
        // Each pair occupies its own orthogonal plane of a 4D space (pair 1 in
        // dims 0-1, pair 2 in dims 2-3): the only mid-band cosines are WITHIN a
        // pair, and every cross-pair dot product is 0.
        const tighterAngle = Math.acos(0.91);
        const looserAngle = Math.acos(0.88);
        const vectors = emb({
            p1a: [1, 0, 0, 0],
            p1b: [Math.cos(tighterAngle), Math.sin(tighterAngle), 0, 0],
            p2a: [0, 0, 1, 0],
            p2b: [0, 0, Math.cos(looserAngle), Math.sin(looserAngle)],
        });
        const report = analyze(sections, vectors, null, null, cfg);
        expect(report.MaybePairs).toHaveLength(2);
        // Descending similarity: the 0.91 pair precedes the 0.88 pair.
        const pairs = report.MaybePairs;
        expect(pairs[0].Similarity).toBeGreaterThan(pairs[1].Similarity);
    });
    // WHY: when the FIRST endpoint (index i) is the better canonical, buildPair
    // must keep i as canonical (the then-branch). Together with the j-better
    // case, this covers both arms of the canonical-by-rank choice for pairs.
    it("keeps index i as the pair canonical when it ranks better", () => {
        // Index 0 sits in docs/concepts/ (top priority), so i is the better
        // canonical.
        const better = makeSection("id1", "docs/concepts/a.md", "Uploading Documents", 2, "uploading-documents", "h1", "## Uploading Documents\nContent about uploading various files here.", 0);
        const worse = makeSection("id2", "docs/misc/b.md", "Downloading Reports", 4, "downloading-reports", "h2", "## Downloading Reports\nContent about downloading various files here.", 0);
        const angle = 0.451;
        const vectors = emb({ id1: [1, 0], id2: [Math.cos(angle), Math.sin(angle)] });
        const report = analyze([better, worse], vectors, null, null, cfg);
        expect(report.MaybePairs).toHaveLength(1);
        const pair = report.MaybePairs[0];
        expect(pair.Canonical.SectionID).toBe("id1");
        expect(pair.Candidate.SectionID).toBe("id2");
    });
});
|
|
464
|
+
// dotProduct must treat a missing or short embedding as cosine 0 (never NaN/throw)
|
|
465
|
+
// so a section without a usable vector simply forms no pairs. Pinned because a
|
|
466
|
+
// missing embedding must degrade gracefully, not corrupt the whole report.
|
|
467
|
+
describe("analyze missing/short embeddings", () => {
    // Both cases expect the report to contain no HIGH group and no MAYBE pair.
    const expectNoFindings = (report) => {
        expect(report.HighGroups).toHaveLength(0);
        expect(report.MaybePairs).toHaveLength(0);
    };
    // WHY: a section with NO embedding entry yields dotProduct 0
    // (< thresh_maybe), so no pair is formed and the analyzer must not crash on
    // the undefined vector.
    it("treats a section with no embedding as cosine 0 (no pair)", () => {
        const alpha = makeSection("id1", "docs/a.md", "Alpha", 2, "alpha", "h1", "## Alpha\nContent for the missing-embedding test here.", 0);
        const beta = makeSection("id2", "docs/b.md", "Beta", 2, "beta", "h2", "## Beta\nContent for the missing-embedding test here.", 0);
        // id2 is deliberately left out of the embedding map, so its vector is
        // undefined.
        const vectors = emb({ id1: makeUnitVec2D(1, 0) });
        expectNoFindings(analyze([alpha, beta], vectors, null, null, cfg));
    });
    // WHY: an embedding shorter than its counterpart (length mismatch) must also
    // give cosine 0 through the length guard, so a ragged vector cannot produce
    // a spurious high score from a truncated dot product.
    it("treats a length-mismatched embedding as cosine 0 (no pair)", () => {
        const alpha = makeSection("id1", "docs/a.md", "Alpha", 2, "alpha", "h1", "## Alpha\nContent for the short-embedding test here.", 0);
        const beta = makeSection("id2", "docs/b.md", "Beta", 2, "beta", "h2", "## Beta\nContent for the short-embedding test here.", 0);
        // id2 carries an empty vector; b.length (0) < n (2) -> dotProduct is 0.
        const vectors = emb({ id1: [1, 0], id2: [] });
        expectNoFindings(analyze([alpha, beta], vectors, null, null, cfg));
    });
});
|
|
490
|
+
// L5 secGroup wiring: when a HIGH group exists AND blocks are supplied, the
|
|
491
|
+
// section->group map must be populated so suppressKnownGroups can drop block
|
|
492
|
+
// overlaps already covered by that group. Pinned because an unpopulated secGroup
|
|
493
|
+
// would let L5 double-report blocks already captured by L1-L4.
|
|
494
|
+
describe("analyze L5 secGroup population", () => {
    // WHY: each of the two grouped sections contributes the same verbatim block.
    // Because both blocks' sections belong to the SAME HIGH group, the block
    // cluster counts as already known and is suppressed instead of being
    // re-reported by L5.
    it("suppresses a block cluster whose sections are all in one HIGH group", () => {
        const sectionHash = "l5grouphash12345";
        const body = "## Lifecycle\nThe shared lifecycle content used in both docs.";
        const sections = [
            makeSection("secA", "docs/a.md", "Lifecycle", 2, "lifecycle", sectionHash, body, 0),
            makeSection("secB", "docs/b.md", "Lifecycle", 2, "lifecycle", sectionHash, body, 0),
        ];
        const unit = makeUnitVec2D(1, 0);
        // One verbatim block, present in both grouped sections.
        const sharedBlockHash = "covered-block-hash";
        const blocks = [
            mkBlock("secA", "docs/a.md", "Lifecycle", "prose", 3, 6, sharedBlockHash),
            mkBlock("secB", "docs/b.md", "Lifecycle", "prose", 3, 6, sharedBlockHash),
        ];
        const vectors = emb({ secA: unit, secB: [...unit] });
        const report = analyze(sections, vectors, blocks, null, cfg);
        expect(report.HighGroups.length).toBeGreaterThan(0);
        // Both block locations resolve to the same HIGH group -> suppressed.
        expect(report.PartialOverlaps).toHaveLength(0);
    });
});
|
|
516
|
+
/**
 * mkBlock assembles a minimal BlockRecord fixture for the L5 wiring tests.
 *
 * The seven arguments are copied into the matching record fields; Index and
 * TableRows are fixed at 0 and Text at the empty string.
 *
 * @param {string} sectionID - value stored as SectionID.
 * @param {string} filePath - value stored as FilePath.
 * @param {string} heading - value stored as Heading.
 * @param {string} kind - value stored as Kind (the tests pass "prose").
 * @param {number} startLine - value stored as StartLine.
 * @param {number} endLine - value stored as EndLine.
 * @param {string} contentHash - value stored as ContentHash.
 * @returns {object} a freshly allocated record object.
 */
function mkBlock(sectionID, filePath, heading, kind, startLine, endLine, contentHash) {
    // Built in two pieces and merged so the key order stays
    // SectionID, FilePath, Heading, Index, Kind, StartLine, EndLine,
    // ContentHash, Text, TableRows.
    const location = { SectionID: sectionID, FilePath: filePath, Heading: heading, Index: 0 };
    const span = { Kind: kind, StartLine: startLine, EndLine: endLine, ContentHash: contentHash };
    return { ...location, ...span, Text: "", TableRows: 0 };
}
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
// Ported from internal/dedup/analyzer/canonical.go.
|
|
2
|
+
//
|
|
3
|
+
// Canonical-section selection: each duplicate group elects the section with the
|
|
4
|
+
// lowest rank tuple as its canonical reference. The tuple ordering is the
|
|
5
|
+
// behavior contract — drifting any field's comparison re-elects a different
|
|
6
|
+
// canonical and changes the recommended edits, so the order is pinned exactly.
|
|
7
|
+
import { headingBlacklisted } from "../dedupcfg/index.js";
|
|
8
|
+
/**
 * isDisqualified reports whether a section is excluded by either blacklist.
 *
 * The heading is checked first through headingBlacklisted(cfg, s.heading).
 * Only when that does not match is the file path examined: the lower-cased
 * s.file_path is tested for each lower-cased cfg.path_blacklist entry as a
 * substring, so path matching is case-insensitive.
 *
 * @param {object} s - section; reads `heading` and `file_path`.
 * @param {object} cfg - dedup config; reads `path_blacklist` and is passed to
 *   headingBlacklisted.
 * @returns {boolean} true when the heading or the path is blacklisted.
 */
export function isDisqualified(s, cfg) {
    let disqualified = false;
    if (headingBlacklisted(cfg, s.heading)) {
        disqualified = true;
    }
    else {
        const haystack = s.file_path.toLowerCase();
        for (const entry of cfg.path_blacklist) {
            if (haystack.includes(entry.toLowerCase())) {
                disqualified = true;
                break;
            }
        }
    }
    return disqualified;
}
|
|
24
|
+
/**
 * canonicalRank builds the rank tuple used to elect a canonical section.
 * A lower tuple (as ordered by lessRank) means a better canonical.
 *
 * Fields, in comparison order:
 *   Disqualified - 1 if isDisqualified(s, cfg), else 0
 *   PathPriority - pathPriorityRank(s.file_path, cfg.path_priority)
 *   NegInbound   - negated s.inbound_count
 *   HeadingLevel - s.heading_level
 *   NegLen       - negated UTF-8 BYTE length of s.raw_content; this mirrors
 *                  Go's -len(raw_content) rather than a rune/UTF-16 count, so
 *                  the longest-content tiebreak matches the Go original
 *   SectionID    - s.id
 *
 * @param {object} s - section to rank.
 * @param {object} cfg - dedup config; reads `path_priority` and is passed to
 *   isDisqualified.
 * @returns {object} the rank tuple described above.
 */
export function canonicalRank(s, cfg) {
    let disqualifiedFlag = 0;
    if (isDisqualified(s, cfg)) {
        disqualifiedFlag = 1;
    }
    const priorityIndex = pathPriorityRank(s.file_path, cfg.path_priority);
    const negatedInbound = -s.inbound_count;
    const level = s.heading_level;
    const negatedByteLength = -Buffer.byteLength(s.raw_content, "utf8");
    return {
        Disqualified: disqualifiedFlag,
        PathPriority: priorityIndex,
        NegInbound: negatedInbound,
        HeadingLevel: level,
        NegLen: negatedByteLength,
        SectionID: s.id,
    };
}
|
|
42
|
+
/**
 * lessRank reports whether rank tuple `a` is strictly better (lower) than `b`.
 *
 * The fields are compared lexicographically in the fixed order Disqualified,
 * PathPriority, NegInbound, HeadingLevel, NegLen: the first field that
 * differs decides the result. If all five are equal, SectionID is compared
 * with `<` as the final tiebreak.
 *
 * @param {object} a - rank tuple as produced by canonicalRank.
 * @param {object} b - rank tuple as produced by canonicalRank.
 * @returns {boolean} true when `a` orders strictly before `b`.
 */
export function lessRank(a, b) {
    const orderedFields = ["Disqualified", "PathPriority", "NegInbound", "HeadingLevel", "NegLen"];
    for (const field of orderedFields) {
        const left = a[field];
        const right = b[field];
        if (left !== right) {
            return left < right;
        }
    }
    return a.SectionID < b.SectionID;
}
|
|
61
|
+
/**
 * pathPriorityRank maps a file path to its position in the priority list.
 *
 * The path and every prefix are lower-cased before the comparison, so the
 * match is case-insensitive. The result is the index of the first entry in
 * `pathPriority` that the path starts with; a lower index means a higher
 * priority. When no entry matches, the list length is returned, which ranks
 * the path below every listed prefix.
 *
 * @param {string} filePath - section file path.
 * @param {string[]} pathPriority - ordered path prefixes, best first.
 * @returns {number} index of the first matching prefix, or pathPriority.length.
 */
export function pathPriorityRank(filePath, pathPriority) {
    const foldedPath = filePath.toLowerCase();
    let rank = 0;
    // Advance past every prefix that does not match; the loop stops either on
    // the first match or at the end of the list.
    while (rank < pathPriority.length && !foldedPath.startsWith(pathPriority[rank].toLowerCase())) {
        rank += 1;
    }
    return rank;
}
|