docsgov 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +242 -0
- package/dist/apispec/apispec.js +401 -0
- package/dist/apispec/apispec.test.js +444 -0
- package/dist/apispec/errors.js +17 -0
- package/dist/apispec/index.js +2 -0
- package/dist/check/doclinks.js +167 -0
- package/dist/check/index.js +8 -0
- package/dist/check/run.js +391 -0
- package/dist/check/run.test.js +513 -0
- package/dist/check/suggest.js +134 -0
- package/dist/check/suggest.test.js +92 -0
- package/dist/check/tokens.js +125 -0
- package/dist/cmd/main.js +330 -0
- package/dist/cmd/main.test.js +422 -0
- package/dist/codeq/cache.js +71 -0
- package/dist/codeq/cache.test.js +67 -0
- package/dist/codeq/errors.js +52 -0
- package/dist/codeq/grammars/tree-sitter-go.wasm +0 -0
- package/dist/codeq/grammars/tree-sitter-java.wasm +0 -0
- package/dist/codeq/grammars/tree-sitter-javascript.wasm +0 -0
- package/dist/codeq/grammars/tree-sitter-tsx.wasm +0 -0
- package/dist/codeq/grammars/tree-sitter-typescript.wasm +0 -0
- package/dist/codeq/index.js +11 -0
- package/dist/codeq/resolve.test.js +109 -0
- package/dist/codeq/resolver.js +128 -0
- package/dist/codeq/resolver.test.js +124 -0
- package/dist/codeq/resolvers/go.js +242 -0
- package/dist/codeq/resolvers/go.test.js +143 -0
- package/dist/codeq/resolvers/java.js +349 -0
- package/dist/codeq/resolvers/java.test.js +138 -0
- package/dist/codeq/resolvers/java_queries.js +63 -0
- package/dist/codeq/resolvers/javascript.js +412 -0
- package/dist/codeq/resolvers/javascript.test.js +125 -0
- package/dist/codeq/resolvers/javascript_queries.js +46 -0
- package/dist/codeq/resolvers/typescript.js +366 -0
- package/dist/codeq/resolvers/typescript.test.js +180 -0
- package/dist/codeq/resolvers/typescript_queries.js +78 -0
- package/dist/codeq/signature.js +50 -0
- package/dist/codeq/signature.test.js +50 -0
- package/dist/codeq/suggest.js +96 -0
- package/dist/codeq/treesitter.js +122 -0
- package/dist/codeq/treesitter.test.js +118 -0
- package/dist/config/config.js +74 -0
- package/dist/config/config.test.js +98 -0
- package/dist/config/fs.js +116 -0
- package/dist/config/glob.js +82 -0
- package/dist/config/glob.test.js +61 -0
- package/dist/config/index.js +4 -0
- package/dist/dedup/analyzer/analyzer.js +533 -0
- package/dist/dedup/analyzer/analyzer.test.js +530 -0
- package/dist/dedup/analyzer/canonical.js +74 -0
- package/dist/dedup/analyzer/canonical.test.js +70 -0
- package/dist/dedup/analyzer/cosine_clusters.js +169 -0
- package/dist/dedup/analyzer/cosine_clusters.test.js +131 -0
- package/dist/dedup/analyzer/distinctive.js +85 -0
- package/dist/dedup/analyzer/distinctive.test.js +49 -0
- package/dist/dedup/analyzer/exact_clusters.js +63 -0
- package/dist/dedup/analyzer/exact_clusters.test.js +81 -0
- package/dist/dedup/analyzer/index.js +14 -0
- package/dist/dedup/analyzer/multiplicity.js +110 -0
- package/dist/dedup/analyzer/multiplicity.test.js +123 -0
- package/dist/dedup/analyzer/order.js +22 -0
- package/dist/dedup/analyzer/partial_overlaps.js +65 -0
- package/dist/dedup/analyzer/partial_overlaps.test.js +161 -0
- package/dist/dedup/analyzer/preview.js +84 -0
- package/dist/dedup/analyzer/preview.test.js +46 -0
- package/dist/dedup/analyzer/safety.js +27 -0
- package/dist/dedup/analyzer/safety.test.js +39 -0
- package/dist/dedup/config.js +18 -0
- package/dist/dedup/configload.js +299 -0
- package/dist/dedup/configload.test.js +410 -0
- package/dist/dedup/dedup.index.test.js +203 -0
- package/dist/dedup/dedup.js +143 -0
- package/dist/dedup/dedup.test.js +212 -0
- package/dist/dedup/dedupcfg/config.js +112 -0
- package/dist/dedup/dedupcfg/config.test.js +70 -0
- package/dist/dedup/dedupcfg/index.js +1 -0
- package/dist/dedup/deduptypes/index.js +1 -0
- package/dist/dedup/deduptypes/types.js +9 -0
- package/dist/dedup/deduptypes/types.test.js +34 -0
- package/dist/dedup/embedder/cache.js +23 -0
- package/dist/dedup/embedder/cache.test.js +50 -0
- package/dist/dedup/embedder/constants.js +10 -0
- package/dist/dedup/embedder/embedder.js +76 -0
- package/dist/dedup/embedder/embedder.mock.test.js +128 -0
- package/dist/dedup/embedder/embedder.test.js +96 -0
- package/dist/dedup/embedder/errors.js +20 -0
- package/dist/dedup/embedder/errors.test.js +35 -0
- package/dist/dedup/embedder/index.js +4 -0
- package/dist/dedup/embedder/session.js +78 -0
- package/dist/dedup/embedder/session.test.js +172 -0
- package/dist/dedup/gitignore.js +97 -0
- package/dist/dedup/gitignore.test.js +98 -0
- package/dist/dedup/index.js +11 -0
- package/dist/dedup/indexdb/errors.js +48 -0
- package/dist/dedup/indexdb/index.js +6 -0
- package/dist/dedup/indexdb/indexdb.js +302 -0
- package/dist/dedup/indexdb/indexdb.test.js +739 -0
- package/dist/dedup/indexdb/load.js +110 -0
- package/dist/dedup/indexdb/migrations.js +58 -0
- package/dist/dedup/indexdb/schema.js +83 -0
- package/dist/dedup/indexer/index.js +9 -0
- package/dist/dedup/indexer/indexer.js +501 -0
- package/dist/dedup/indexer/indexer.test.js +510 -0
- package/dist/dedup/indexer/links.js +89 -0
- package/dist/dedup/mdsection/anchor.js +60 -0
- package/dist/dedup/mdsection/anchor.test.js +39 -0
- package/dist/dedup/mdsection/blocks.js +409 -0
- package/dist/dedup/mdsection/blocks.test.js +359 -0
- package/dist/dedup/mdsection/index.js +4 -0
- package/dist/dedup/mdsection/parse.js +21 -0
- package/dist/dedup/mdsection/section.js +234 -0
- package/dist/dedup/mdsection/section.test.js +221 -0
- package/dist/dedup/report/floatfmt.js +71 -0
- package/dist/dedup/report/floatfmt.test.js +42 -0
- package/dist/dedup/report/index.js +8 -0
- package/dist/dedup/report/quote.js +77 -0
- package/dist/dedup/report/quote.test.js +67 -0
- package/dist/dedup/report/text.js +251 -0
- package/dist/dedup/report/text.test.js +420 -0
- package/dist/dedup/report_types.js +8 -0
- package/dist/dedup/sectionid/index.js +1 -0
- package/dist/dedup/sectionid/sectionid.js +16 -0
- package/dist/dedup/sectionid/sectionid.test.js +49 -0
- package/dist/guard/api/errors.js +12 -0
- package/dist/guard/api/index.js +2 -0
- package/dist/guard/api/parser.js +81 -0
- package/dist/guard/api/parser.test.js +58 -0
- package/dist/guard/api/types.js +1 -0
- package/dist/guard/code/errors.js +16 -0
- package/dist/guard/code/index.js +2 -0
- package/dist/guard/code/parser.js +54 -0
- package/dist/guard/code/parser.test.js +111 -0
- package/dist/guard/code/types.js +6 -0
- package/dist/index.js +1 -0
- package/dist/index.test.js +5 -0
- package/dist/repo/boundary.js +92 -0
- package/dist/repo/boundary.test.js +65 -0
- package/dist/repo/errors.js +56 -0
- package/dist/repo/errors.test.js +85 -0
- package/dist/repo/exists.test.js +72 -0
- package/dist/repo/filename.js +46 -0
- package/dist/repo/filename.test.js +39 -0
- package/dist/repo/fs.js +53 -0
- package/dist/repo/index.js +7 -0
- package/dist/repo/overlay.js +36 -0
- package/dist/repo/overlay.test.js +80 -0
- package/dist/repo/repo.js +353 -0
- package/dist/repo/repo.test.js +255 -0
- package/dist/repo/testutil.js +27 -0
- package/dist/repo/write.test.js +125 -0
- package/dist/report/color.js +73 -0
- package/dist/report/index.js +1 -0
- package/dist/report/report.js +112 -0
- package/dist/report/report.test.js +368 -0
- package/dist/violation/index.js +1 -0
- package/dist/violation/types.js +22 -0
- package/dist/violation/types.test.js +70 -0
- package/package.json +48 -0
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
// Ported from internal/dedup/analyzer/multiplicity.go.
|
|
2
|
+
//
|
|
3
|
+
// Two post-processing passes over L5 clusters:
|
|
4
|
+
// - applyMultiplicity: downgrade boilerplate (a block in too many files) to
|
|
5
|
+
// Informational so it is reported but not flagged as accidental duplication.
|
|
6
|
+
// - suppressKnownGroups: drop clusters already fully captured by one L1-L4 HIGH
|
|
7
|
+
// group so L5 only surfaces NOVEL overlaps.
|
|
8
|
+
// The cap comparison and the "all-in-one-real-group" suppression predicate are
|
|
9
|
+
// behavior-load-bearing and reproduced exactly.
|
|
10
|
+
/**
 * applyMultiplicity marks boilerplate clusters as informational.
 *
 * For each cluster the DISTINCT FilePath values across its Locations are
 * counted. When that count is strictly greater than
 * cfg.Block.multiplicity_cap the copy gets Informational = true; at or below
 * the cap the copied Informational value is left exactly as it was.
 *
 * @param clusters iterable of cluster records, each carrying a Locations iterable
 * @param cfg      config object; only cfg.Block.multiplicity_cap is read
 * @returns a new array, in input order, of shallow copies of the clusters
 *          (the Locations value itself is shared with the input, not cloned)
 */
export function applyMultiplicity(clusters, cfg) {
    const { multiplicity_cap: fileCap } = cfg.Block;
    const result = [];
    for (const cluster of clusters) {
        // A Set collapses repeated paths, so several hits in one file count once.
        const distinctFiles = new Set();
        for (const { FilePath } of cluster.Locations) {
            distinctFiles.add(FilePath);
        }
        const overCap = distinctFiles.size > fileCap;
        // Always push a copy so the caller's objects are never written to.
        result.push(overCap ? { ...cluster, Informational: true } : { ...cluster });
    }
    return result;
}
|
|
34
|
+
/**
 * suppressKnownGroups drops clusters that are already fully captured by a single
 * L1-L4 HIGH group, so L5 only surfaces novel overlaps.
 *
 * secGroup maps section_id -> HIGH-group id; sections absent from the map are
 * treated as "ungrouped".
 *
 * Cluster locations do not carry a section id, so a join index
 * (FilePath, StartLine) -> SectionID is built from blocks first. If two blocks
 * share the same (FilePath, StartLine), the later one wins.
 *
 * SUPPRESSION RULE: a cluster is DROPPED iff it has at least one location and
 * every location resolves, via the join index and secGroup, to the SAME group
 * id. A location that is missing from the join index is looked up under the
 * section id "" (mirroring the Go zero value). A cluster with any ungrouped
 * location, with locations in two different groups, or with no locations at
 * all is KEPT, unchanged and by reference.
 *
 * The check does not depend on the numeric range of group ids.
 *
 * @param clusters iterable of clusters, each with a Locations iterable
 * @param blocks   iterable of blocks providing FilePath, StartLine, SectionID
 * @param secGroup Map from section id to group id
 * @returns a new array holding the kept clusters in input order
 */
export function suppressKnownGroups(clusters, blocks, secGroup) {
    // Join index: (FilePath, StartLine) -> SectionID.
    const sectionByLoc = new Map();
    for (const b of blocks) {
        sectionByLoc.set(locKey(b.FilePath, b.StartLine), b.SectionID);
    }
    // True only when the cluster has locations and all of them sit in one group.
    const isFullyCovered = (cl) => {
        let group;
        let resolved = 0;
        for (const loc of cl.Locations) {
            const sectionID = sectionByLoc.get(locKey(loc.FilePath, loc.StartLine)) ?? "";
            const gid = secGroup.get(sectionID);
            if (gid === undefined) {
                // Ungrouped location: the cluster contains something novel.
                return false;
            }
            if (resolved > 0 && gid !== group) {
                // Spans two different groups: not captured by a single one.
                return false;
            }
            group = gid;
            resolved++;
        }
        // An empty cluster is covered by nothing, so it must not be suppressed.
        return resolved > 0;
    };
    const out = [];
    for (const cl of clusters) {
        if (!isFullyCovered(cl)) {
            out.push(cl);
        }
    }
    return out;
}
/**
 * Composite key for the (FilePath, StartLine) join index. The two parts are
 * joined with a NUL character so the path and the line number stay separable.
 */
function locKey(filePath, startLine) {
    return `${filePath}\x00${startLine}`;
}
|
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
import { describe, it, expect } from "vitest";
|
|
2
|
+
import { defaultConfig } from "../dedupcfg/index.js";
|
|
3
|
+
import { applyMultiplicity, suppressKnownGroups } from "./multiplicity.js";
|
|
4
|
+
/**
 * Test fixture: builds a prose block record at (filePath, startLine) that
 * belongs to sectionID. EndLine is always startLine + 1; every other field is
 * an empty / zero placeholder.
 */
function block(filePath, startLine, sectionID) {
    const fixture = {
        SectionID: sectionID,
        FilePath: filePath,
        Heading: "",
        Index: 0,
        Kind: "prose",
        StartLine: startLine,
        EndLine: startLine + 1,
        ContentHash: "",
        Text: "",
        TableRows: 0,
    };
    return fixture;
}
|
|
18
|
+
// Contract under test: a cluster whose block shows up in MORE distinct files
// than the cap is treated as intentional boilerplate and flagged Informational.
// A cluster sitting exactly on the cap stays actionable (strict ">"), and the
// caller's input objects must come back untouched.
describe("applyMultiplicity", () => {
    // Small factories so each fixture below reads as one line per location.
    const at = (FilePath, Heading, StartLine, EndLine) => ({ FilePath, Heading, StartLine, EndLine });
    const exactProse = (ContentHash, Locations) => ({
        Kind: "prose",
        ContentHash,
        Exact: true,
        Informational: false,
        Similarity: 1.0,
        Locations,
    });
    it("flags over-cap clusters Informational, leaves at/under-cap actionable, and never mutates input", () => {
        const cfg = defaultConfig();
        cfg.Block.multiplicity_cap = 3; // strictly more than 3 distinct files -> informational
        // Four distinct files: over the cap.
        const overCap = exactProse("over-cap-hash", [
            at("docs/a.md", "H", 1, 5),
            at("docs/b.md", "H", 1, 5),
            at("docs/c.md", "H", 1, 5),
            at("docs/d.md", "H", 1, 5),
        ]);
        // Three distinct files: exactly on the cap.
        const atCap = exactProse("at-cap-hash", [
            at("docs/a.md", "H", 10, 15),
            at("docs/b.md", "H", 10, 15),
            at("docs/c.md", "H", 10, 15),
        ]);
        // Two locations but a single distinct file: well under the cap.
        const underCap = exactProse("under-cap-hash", [
            at("docs/a.md", "S1", 20, 25),
            at("docs/a.md", "S2", 30, 35),
        ]);
        const input = [overCap, atCap, underCap];
        const got = applyMultiplicity(input, cfg);
        expect(got).toHaveLength(3);
        const [first, second, third] = got;
        expect(first.Informational).toBe(true); // > cap
        expect(second.Informational).toBe(false); // exactly cap
        expect(third.Informational).toBe(false); // 1 distinct file
        // The over-cap input object itself must still read false.
        expect(input[0].Informational).toBe(false);
    });
});
|
|
73
|
+
// Contract under test: an L5 cluster is removed only when EVERY one of its
// locations resolves to the SAME single real HIGH group. Any ungrouped member,
// or members spread over two groups, means the cluster carries something new
// and must be kept.
describe("suppressKnownGroups", () => {
    it("drops fully-covered clusters and keeps ungrouped / cross-group / all-ungrouped ones", () => {
        // Group 1 = {secA, secB}; group 2 = {secC, secD}; secE and secF have no group.
        const secGroup = new Map([
            ["secA", 1],
            ["secB", 1],
            ["secC", 2],
            ["secD", 2],
        ]);
        // One block per file, all starting on line 1; the last two are ungrouped.
        const blocks = [
            block("docs/a.md", 1, "secA"),
            block("docs/b.md", 1, "secB"),
            block("docs/c.md", 1, "secC"),
            block("docs/d.md", 1, "secD"),
            block("docs/e.md", 1, "secE"),
            block("docs/f.md", 1, "secF"),
        ];
        const toLocation = ([fp, ln]) => ({ FilePath: fp, Heading: "", StartLine: ln, EndLine: ln + 1 });
        const mkCluster = (hash, locs) => ({
            Kind: "prose",
            ContentHash: hash,
            Exact: true,
            Informational: false,
            Similarity: 1.0,
            Locations: locs.map(toLocation),
        });
        const input = [
            // (a) both members in group 1 -> dropped.
            mkCluster("hash-a", [["docs/a.md", 1], ["docs/b.md", 1]]),
            // (b) group 1, group 1, ungrouped -> kept (secE is novel).
            mkCluster("hash-b", [["docs/a.md", 1], ["docs/b.md", 1], ["docs/e.md", 1]]),
            // (c) group 1 + group 2 -> kept (spans two groups).
            mkCluster("hash-c", [["docs/a.md", 1], ["docs/c.md", 1]]),
            // (d) nothing grouped -> kept.
            mkCluster("hash-d", [["docs/e.md", 1], ["docs/f.md", 1]]),
        ];
        const got = suppressKnownGroups(input, blocks, secGroup);
        const has = (hash) => got.some((cl) => cl.ContentHash === hash);
        // (a) is gone, leaving b, c and d.
        expect(has("hash-a")).toBe(false);
        expect(got).toHaveLength(3);
        // (b) survives with all three of its locations intact.
        const bCluster = got.find((cl) => cl.ContentHash === "hash-b");
        expect(bCluster).toBeDefined();
        expect(bCluster.Locations).toHaveLength(3);
        // (c) and (d) survive.
        expect(has("hash-c")).toBe(true);
        expect(has("hash-d")).toBe(true);
    });
});
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
// Comparison helpers mirroring Go's cmp.Compare, used by the cluster sorters.
|
|
2
|
+
//
|
|
3
|
+
// Go's cmp.Compare returns -1 / 0 / +1 and, for strings, orders byte-by-byte
|
|
4
|
+
// (lexicographic). For the ASCII content hashes and docs/ file paths these
|
|
5
|
+
// sorters see, UTF-16 code-unit order equals Go's byte order, so a plain
|
|
6
|
+
// comparison reproduces the deterministic cluster ordering exactly.
|
|
7
|
+
/**
 * cmpNum reproduces Go's cmp.Compare for numbers.
 *
 * Returns -1 if a < b, +1 if a > b, and 0 otherwise. NaN is handled the way
 * cmp.Compare handles it: NaN sorts below every non-NaN value and equals
 * another NaN. This keeps the result a consistent ordering for
 * Array.prototype.sort even when a NaN is present. -0 and 0 compare equal.
 *
 * @param a left operand
 * @param b right operand
 * @returns -1, 0 or 1
 */
export function cmpNum(a, b) {
    const aNaN = Number.isNaN(a);
    const bNaN = Number.isNaN(b);
    if (aNaN) {
        return bNaN ? 0 : -1;
    }
    if (bNaN) {
        return 1;
    }
    if (a < b) {
        return -1;
    }
    if (a > b) {
        return 1;
    }
    return 0;
}
|
|
15
|
+
/**
 * cmpStr reproduces Go's cmp.Compare for strings: lexicographic, -1 / 0 / +1.
 *
 * Go compares strings byte-by-byte over their UTF-8 encoding, which is the
 * same as Unicode code-point order. A plain JS `<` compares UTF-16 code units
 * instead, and that differs when a supplementary-plane character (stored as a
 * surrogate pair, units 0xD800-0xDFFF) meets a BMP character in
 * U+E000-U+FFFF. The first differing code unit is therefore ranked so that
 * surrogates sort above every BMP unit. For ASCII input the result is
 * identical to a plain comparison.
 *
 * Non-string operands fall back to the plain relational comparison.
 *
 * @param a left operand
 * @param b right operand
 * @returns -1, 0 or 1
 */
export function cmpStr(a, b) {
    if (typeof a !== "string" || typeof b !== "string") {
        if (a < b) {
            return -1;
        }
        if (a > b) {
            return 1;
        }
        return 0;
    }
    const shared = Math.min(a.length, b.length);
    let i = 0;
    while (i < shared && a.charCodeAt(i) === b.charCodeAt(i)) {
        i++;
    }
    if (i === shared) {
        // One string is a prefix of the other (or they are equal): shorter first.
        if (a.length === b.length) {
            return 0;
        }
        return a.length < b.length ? -1 : 1;
    }
    // Map code units to a rank that follows code-point order: units below
    // 0xD800 keep their value, U+E000-U+FFFF shift down, surrogates shift up.
    const rank = (unit) => {
        if (unit < 0xd800) {
            return unit;
        }
        return unit >= 0xe000 ? unit - 0x800 : unit + 0x2000;
    };
    return rank(a.charCodeAt(i)) < rank(b.charCodeAt(i)) ? -1 : 1;
}
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
// Ported from internal/dedup/analyzer/partial_overlaps.go.
|
|
2
|
+
//
|
|
3
|
+
// Combines the L5-exact and L5-cosine passes, applies the multiplicity cap,
|
|
4
|
+
// suppresses location-pairs already covered by an L1-L4 HIGH group, and returns
|
|
5
|
+
// the result in a deterministic total order. The exact pass feeds excludeHashes
|
|
6
|
+
// to the cosine pass so a verbatim block is never reported twice.
|
|
7
|
+
import { cosineClusters } from "./cosine_clusters.js";
|
|
8
|
+
import { exactClusters } from "./exact_clusters.js";
|
|
9
|
+
import { applyMultiplicity, suppressKnownGroups } from "./multiplicity.js";
|
|
10
|
+
import { cmpStr, cmpNum } from "./order.js";
|
|
11
|
+
/**
 * partialOverlaps is the L5 entry point: it chains the exact pass, the cosine
 * pass, the multiplicity cap and the known-group suppression, then sorts.
 *
 * secGroup maps section_id -> HIGH group index; a section that is not a key of
 * the map is treated as ungrouped by the suppression step.
 *
 * @param blocks   blocks handed to both clustering passes and to suppression
 * @param blockEmb embeddings handed to the cosine pass
 * @param secGroup Map from section id to HIGH group id
 * @param cfg      config forwarded to the cosine pass and the multiplicity cap
 * @returns clusters ordered by (Locations[0].FilePath, Locations[0].StartLine,
 *          ContentHash), so exact and cosine clusters interleave the same way
 *          on every run
 */
export function partialOverlaps(blocks, blockEmb, secGroup, cfg) {
    // Verbatim duplicates come first.
    const exact = exactClusters(blocks);
    // Every non-empty hash that was reported verbatim is handed to the cosine
    // pass as an exclusion, so the same block is not reported twice. The empty
    // hash is deliberately never added.
    const excludeHashes = new Set();
    for (const { ContentHash } of exact) {
        if (ContentHash !== "") {
            excludeHashes.add(ContentHash);
        }
    }
    const cosine = cosineClusters(blocks, blockEmb, excludeHashes, cfg);
    // Merge into a fresh array, cap boilerplate, then drop what one HIGH group
    // already covers.
    const capped = applyMultiplicity([...exact, ...cosine], cfg);
    const novel = suppressKnownGroups(capped, blocks, secGroup);
    // A cluster without locations sorts as ("", 0).
    const head = (cl) => (cl.Locations.length > 0 ? cl.Locations[0] : { FilePath: "", StartLine: 0 });
    novel.sort((a, b) => {
        const x = head(a);
        const y = head(b);
        // Each comparator yields -1 / 0 / +1, so `||` falls through on ties.
        return (cmpStr(x.FilePath, y.FilePath) ||
            cmpNum(x.StartLine, y.StartLine) ||
            cmpStr(a.ContentHash, b.ContentHash));
    });
    return novel;
}
|
|
@@ -0,0 +1,161 @@
|
|
|
1
|
+
import { describe, it, expect } from "vitest";
|
|
2
|
+
import { defaultConfig } from "../dedupcfg/index.js";
|
|
3
|
+
import { partialOverlaps } from "./partial_overlaps.js";
|
|
4
|
+
/**
 * Test fixture: builds a prose block record with the given location, heading
 * and content hash. Index, Text and TableRows are zero / empty placeholders.
 */
function block(filePath, sectionID, heading, startLine, endLine, contentHash) {
    const fixture = {
        SectionID: sectionID,
        FilePath: filePath,
        Heading: heading,
        Index: 0,
        Kind: "prose",
        StartLine: startLine,
        EndLine: endLine,
        ContentHash: contentHash,
        Text: "",
        TableRows: 0,
    };
    return fixture;
}
|
|
18
|
+
// partialOverlaps is the glue for the L5 pipeline. The behaviors pinned here:
// the exact->cosine handoff (a verbatim cross-file block is reported ONCE, as an
// exact cluster, because its hash is excluded from the cosine pass), the
// three-key output order, the empty-hash guard, and known-group suppression.
// Mirrors Go's TestAnalyze_PopulatesPartialOverlaps, exercised directly on the
// primitive the orchestrator calls.
describe("partialOverlaps", () => {
    // Runs the pipeline with the default config; embeddings and HIGH groups
    // default to empty because most cases need neither.
    const run = (blocks, { blockEmb = new Map(), secGroup = new Map() } = {}) =>
        partialOverlaps(blocks, blockEmb, secGroup, defaultConfig());
    it("reports a verbatim cross-file block once (exact), not also as a cosine cluster", () => {
        const verbatimHash = "verbatim-block-hash-xyz";
        const blocks = [
            block("docs/a.md", "secA", "Overview", 3, 6, verbatimHash),
            block("docs/b.md", "secB", "Details", 4, 7, verbatimHash),
        ];
        // The hash HAS an embedding, so it would also be a cosine candidate if
        // the exact pass did not exclude it. No HIGH groups: nothing suppressed.
        const got = run(blocks, { blockEmb: new Map([[verbatimHash, [0.5, 0.5]]]) });
        expect(got).toHaveLength(1);
        const [only] = got;
        expect(only.Exact).toBe(true);
        expect(only.Kind).toBe("prose");
        expect(only.ContentHash).toBe(verbatimHash);
        expect(only.Locations).toHaveLength(2);
        // For this hash: exactly one exact cluster and no cosine cluster.
        const sameHash = got.filter((c) => c.ContentHash === verbatimHash);
        expect(sameHash.filter((c) => c.Exact)).toHaveLength(1);
        expect(sameHash.filter((c) => !c.Exact)).toHaveLength(0);
    });
    // WHY: the report renderer relies on the final order to interleave exact and
    // cosine clusters reproducibly. When first locations differ by FilePath,
    // FilePath must decide; ignoring it would scramble the report between runs.
    it("orders clusters by first-location FilePath", () => {
        // hashZ lives in z*.md files, hashA in a*.md files.
        const got = run([
            block("docs/za.md", "s1", "H", 3, 6, "hashZ"),
            block("docs/zb.md", "s2", "H", 3, 6, "hashZ"),
            block("docs/aa.md", "s3", "H", 3, 6, "hashA"),
            block("docs/ab.md", "s4", "H", 3, 6, "hashA"),
        ]);
        expect(got).toHaveLength(2);
        // docs/aa.md leads, docs/za.md follows.
        const firstFiles = got.map((c) => c.Locations[0].FilePath);
        expect(firstFiles[0]).toBe("docs/aa.md");
        expect(firstFiles[1]).toBe("docs/za.md");
    });
    // WHY: with equal first-location FilePath the order must fall through to
    // StartLine, so same-file clusters at different lines stay stably ordered.
    it("breaks a FilePath tie by first-location StartLine", () => {
        // Both clusters lead with docs/a.md; their second files differ so each
        // is still cross-file, and only the leading StartLine differs.
        const got = run([
            block("docs/a.md", "s1", "H", 20, 22, "hashLate"),
            block("docs/b.md", "s2", "H", 5, 7, "hashLate"),
            block("docs/a.md", "s3", "H", 4, 6, "hashEarly"),
            block("docs/c.md", "s4", "H", 9, 11, "hashEarly"),
        ]);
        expect(got).toHaveLength(2);
        const [early, late] = got;
        // Line 4 precedes line 20.
        expect(early.ContentHash).toBe("hashEarly");
        expect(early.Locations[0].StartLine).toBe(4);
        expect(late.ContentHash).toBe("hashLate");
        expect(late.Locations[0].StartLine).toBe(20);
    });
    // WHY: when FilePath AND StartLine of the first location are equal, the
    // last tiebreak is ContentHash, so two distinct verbatim blocks that start
    // at the same file+line are still ordered deterministically.
    it("breaks a FilePath+StartLine tie by ContentHash", () => {
        // hashB: a.md:3 + b.md:3. hashA: a.md:3 + c.md:3. Both lead with a.md:3.
        const got = run([
            block("docs/a.md", "s1", "H", 3, 6, "hashB"),
            block("docs/b.md", "s2", "H", 3, 6, "hashB"),
            block("docs/a.md", "s3", "H", 3, 6, "hashA"),
            block("docs/c.md", "s4", "H", 3, 6, "hashA"),
        ]);
        expect(got).toHaveLength(2);
        const [first, second] = got;
        expect(first.Locations[0].FilePath).toBe("docs/a.md");
        expect(second.Locations[0].FilePath).toBe("docs/a.md");
        expect(first.Locations[0].StartLine).toBe(3);
        expect(second.Locations[0].StartLine).toBe(3);
        // "hashA" < "hashB" settles it.
        expect(first.ContentHash).toBe("hashA");
        expect(second.ContentHash).toBe("hashB");
    });
    // WHY: an exact cluster can carry an empty ContentHash (blocks sharing the
    // empty-string hash across files). "" must NOT enter the exclude set, or
    // cosine candidates could be wrongly suppressed. Pins the
    // `ContentHash !== ""` guard.
    it("does not add an empty ContentHash to the cosine exclude set", () => {
        const blocks = [
            // Two cross-file blocks with an empty hash -> one exact cluster, hash "".
            block("docs/a.md", "s1", "H", 1, 2, ""),
            block("docs/b.md", "s2", "H", 1, 2, ""),
            // A cross-file pair with DISTINCT hashes but identical embeddings:
            // it must still surface as a cosine cluster.
            block("docs/c.md", "s3", "H", 10, 12, "cosHashC"),
            block("docs/d.md", "s4", "H", 10, 12, "cosHashD"),
        ];
        const blockEmb = new Map([
            ["cosHashC", [1, 0]],
            ["cosHashD", [1, 0]],
        ]);
        const got = run(blocks, { blockEmb });
        const emptyExact = got.filter((c) => c.Exact && c.ContentHash === "");
        expect(emptyExact).toHaveLength(1);
        // The real-hash cosine cluster is still there.
        const cosine = got.filter((c) => !c.Exact);
        expect(cosine).toHaveLength(1);
        expect(cosine[0].Kind).toBe("prose");
    });
    // WHY: a cluster whose every location sits in the same L1-L4 HIGH group was
    // already reported at the section level; surfacing it again at the block
    // level would double-report.
    it("suppresses a cluster fully covered by one HIGH group", () => {
        const h = "covered-hash";
        const blocks = [
            block("docs/a.md", "secA", "H", 3, 6, h),
            block("docs/b.md", "secB", "H", 3, 6, h),
        ];
        // Both sections belong to HIGH group 0 -> fully covered.
        const secGroup = new Map([
            ["secA", 0],
            ["secB", 0],
        ]);
        const got = run(blocks, { secGroup });
        expect(got).toHaveLength(0);
    });
});
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
// Ported from internal/dedup/analyzer/preview.go.
|
|
2
|
+
//
|
|
3
|
+
// Prose-only preview extraction for the report renderer. The regex set is
|
|
4
|
+
// parity-load-bearing (plan §791): they are package-level constants, not Config,
|
|
5
|
+
// and the cleanup order matters. Go's RE2 `\s` is the ASCII set [\t\n\f\r ]
|
|
6
|
+
// (NOT \v, NOT Unicode spaces), so every `\s` is spelled out explicitly here —
|
|
7
|
+
// JS `\s` would match more and drift the output.
|
|
8
|
+
// Parity notes for the patterns below (Go RE2 -> JS):
//   * Go `.` (without (?s)) excludes only "\n"; JS `.` also excludes "\r",
//     U+2028 and U+2029, so the Go dot is written as `[^\n]`.
//   * Go `(?m)^` anchors at start of text or right after "\n"; JS `^` under /m
//     also anchors after "\r", U+2028 and U+2029, so the Go line start is
//     written as `(?<![^\n])` ("not preceded by a non-newline character").
//   * Go `\s` is ASCII-only, so it is written as `[\t\n\f\r ]`.

/** (?s)```.*?``` — fenced code blocks (dotAll). */
const codeFenceRE = /```[\s\S]*?```/g;
/**
 * (?m)(?:^[ \t]{4,}.*\n?)+ — indented code blocks.
 * Uses the explicit Go line start and Go dot so that a "\r" or U+2028/U+2029
 * inside an indented line neither ends the match early nor starts a new line.
 */
const indentedCodeRE = /(?:(?<![^\n])[ \t]{4,}[^\n]*\n?)+/g;
/** ![alt](url) — keep alt text. */
const imageRE = /!\[([^\]]*)\]\([^)]+\)/g;
/** [label](url) — keep label text. */
const linkRE = /\[([^\]]+)\]\([^)]+\)/g;
/** `code` — keep inner text. */
const inlineCodeRE = /`([^`]+)`/g;
/**
 * (?m)^\s*(?:[-+*]|\d+\.|>+)\s+ — leading list / blockquote markers.
 * Uses the explicit Go line start so a marker that follows a lone "\r" or a
 * Unicode line separator is not treated as being at the start of a line.
 */
const leadMarkerRE = /(?<![^\n])[\t\n\f\r ]*(?:[-+*]|\d+\.|>+)[\t\n\f\r ]+/g;
/** \*{1,3}([^*\n]+)\*{1,3} — bold/italic with asterisks. */
const boldItalAstRE = /\*{1,3}([^*\n]+)\*{1,3}/g;
/** _{1,3}([^_\n]+)_{1,3} — bold/italic with underscores. */
const boldItalUndRE = /_{1,3}([^_\n]+)_{1,3}/g;
/** \s+ — whitespace collapse (Go ASCII \s). */
const whitespaceRE = /[\t\n\f\r ]+/g;
|
|
26
|
+
/**
 * computePreview turns a section's raw markdown into a short prose-only snippet.
 *
 * Steps, in order:
 *  1. If the text has a newline and its first line, left-trimmed of spaces and
 *     tabs, starts with "#", that whole first line is dropped.
 *  2. Fenced and indented code blocks are replaced by a single space; images,
 *     links and inline code are reduced to their alt / label / inner text;
 *     leading list and blockquote markers are removed; asterisk and underscore
 *     emphasis is unwrapped; whitespace runs collapse to one space and the
 *     result is trimmed.
 *  3. An empty result yields the placeholder "(no prose content)".
 *  4. If the result has more than maxChars code points it is cut to maxChars
 *     code points; when the last space in that cut lies beyond
 *     trunc(maxChars * wordRatio) the cut is shortened to that space. "…" is
 *     appended to any truncated result.
 *
 * @param rawContent raw markdown of the section
 * @param maxChars maximum preview length, counted in code points
 * @param wordRatio fraction of maxChars the word-boundary cut must exceed
 * @returns the cleaned (and possibly truncated) preview string
 */
export function computePreview(rawContent, maxChars, wordRatio) {
    let prose = rawContent;
    // Heading check mirrors Go: strings.TrimLeft(firstLine, " \t") then a "#" prefix test.
    const firstBreak = prose.indexOf("\n");
    if (firstBreak >= 0 && trimLeftSpaceTab(prose.slice(0, firstBreak)).startsWith("#")) {
        prose = prose.slice(firstBreak + 1);
    }
    // The order is significant: code blocks go first, whitespace collapse last.
    const cleanupSteps = [
        [codeFenceRE, " "],
        [indentedCodeRE, " "],
        [imageRE, "$1"],
        [linkRE, "$1"],
        [inlineCodeRE, "$1"],
        [leadMarkerRE, ""],
        [boldItalAstRE, "$1"],
        [boldItalUndRE, "$1"],
        [whitespaceRE, " "],
    ];
    for (const [pattern, replacement] of cleanupSteps) {
        prose = prose.replace(pattern, replacement);
    }
    prose = prose.trim();
    if (prose.length === 0) {
        return "(no prose content)";
    }
    // Count code points rather than UTF-16 units (Go: utf8.RuneCountInString).
    const codePoints = Array.from(prose);
    if (codePoints.length <= maxChars) {
        return prose;
    }
    const head = codePoints.slice(0, maxChars).join("");
    const boundary = head.lastIndexOf(" ");
    // Go: int(float64(maxChars)*wordRatio) — truncation toward zero.
    const minKeep = Math.trunc(maxChars * wordRatio);
    const kept = boundary > minKeep ? head.slice(0, boundary) : head;
    return `${kept}…`;
}
|
|
77
|
+
/**
 * Removes leading ASCII spaces and tabs only (Go strings.TrimLeft(s, " \t")).
 * Other whitespace, such as newlines or non-breaking spaces, is left in place.
 */
function trimLeftSpaceTab(s) {
    // No /m flag: ^ anchors at the very start of the string only.
    return s.replace(/^[ \t]+/, "");
}
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
import { describe, it, expect } from "vitest";
|
|
2
|
+
import { computePreview } from "./preview.js";
|
|
3
|
+
// The preview is the human-facing snippet a reviewer reads to decide whether two
|
|
4
|
+
// sections are truly duplicates. It must show PROSE only — heading lines and code
|
|
5
|
+
// fences would make unrelated sections look alike (or alike sections look
|
|
6
|
+
// different), so each strip rule is pinned by WHY it matters, plus the
|
|
7
|
+
// word-boundary truncation contract.
|
|
8
|
+
describe("computePreview", () => {
    // Limits shared by every case below.
    const MAX_CHARS = 280;
    const WORD_RATIO = 0.6;
    const preview = (markdown) => computePreview(markdown, MAX_CHARS, WORD_RATIO);

    it("strips the leading heading line", () => {
        const snippet = preview("## My Heading\n\nThis is the prose content of the section.");
        expect(snippet).not.toContain("## My Heading");
        expect(snippet).toContain("This is the prose content");
    });

    it("strips fenced code blocks (code is noise for prose comparison)", () => {
        const snippet = preview("## Heading\n\nSome prose.\n\n```go\nfunc main() {}\n```\n\nMore prose.");
        expect(snippet).not.toContain("func main");
        expect(snippet).toContain("Some prose");
    });

    it("truncates at a word boundary and appends an ellipsis", () => {
        // WHY: a mid-word cut would corrupt the snippet, and the ellipsis tells
        // the reader that text was dropped. 29 repetitions of a 14-character
        // word-plus-space give 406 characters, well past the 280 limit.
        const longBody = "documentation ".repeat(29);
        const snippet = preview(`## Heading\n\n${longBody}`);
        // A little slack for the ellipsis character.
        expect([...snippet].length).toBeLessThanOrEqual(285);
        expect(snippet.endsWith("…")).toBe(true);
    });

    it("does not truncate short content", () => {
        const snippet = preview("## Heading\n\nShort content.");
        expect(snippet.endsWith("…")).toBe(false);
    });

    it("returns a placeholder when no prose remains after stripping", () => {
        // WHY: a section that is only code has nothing to compare as prose; the
        // placeholder keeps the report readable instead of printing nothing.
        const snippet = preview("## Heading\n\n```go\nfunc main() {}\n```");
        expect(snippet).toBe("(no prose content)");
    });
});
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
// Ported from internal/dedup/analyzer/safety.go.
|
|
2
|
+
//
|
|
3
|
+
// Differentiator detection for the L4 safety net: differentiators NEVER block
|
|
4
|
+
// group membership; they force recommended_action = manual_review and annotate
|
|
5
|
+
// the group with a reason tag.
|
|
6
|
+
/**
 * findDifferentiators checks for differentiator word-pair presence in two
 * section texts. If section A contains one side of a pair and section B contains
 * the other (case-insensitive substring match), a reason tag is added to the
 * returned array.
 *
 * Both the section texts and the pair members are lowercased before matching,
 * so a pair configured with uppercase letters (e.g. ["GET", "POST"]) still
 * fires. The reason tag quotes the pair exactly as it was configured.
 *
 * @param textA raw text of the first section
 * @param textB raw text of the second section
 * @param differentiators list of [left, right] word pairs (Go [2]string)
 * @returns one reason tag per matching pair, in the order the pairs were given
 */
export function findDifferentiators(textA, textB, differentiators) {
    const a = textA.toLowerCase();
    const b = textB.toLowerCase();
    const hits = [];
    for (const [left, right] of differentiators) {
        // Normalize the needles too: the haystacks are already lowercase, so an
        // un-normalized uppercase needle could never match.
        const l = left.toLowerCase();
        const r = right.toLowerCase();
        if ((a.includes(l) && b.includes(r)) ||
            (a.includes(r) && b.includes(l))) {
            hits.push(`differentiator detected: '${left}' vs '${right}'`);
        }
    }
    return hits;
}
|