docsgov 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +242 -0
- package/dist/apispec/apispec.js +401 -0
- package/dist/apispec/apispec.test.js +444 -0
- package/dist/apispec/errors.js +17 -0
- package/dist/apispec/index.js +2 -0
- package/dist/check/doclinks.js +167 -0
- package/dist/check/index.js +8 -0
- package/dist/check/run.js +391 -0
- package/dist/check/run.test.js +513 -0
- package/dist/check/suggest.js +134 -0
- package/dist/check/suggest.test.js +92 -0
- package/dist/check/tokens.js +125 -0
- package/dist/cmd/main.js +330 -0
- package/dist/cmd/main.test.js +422 -0
- package/dist/codeq/cache.js +71 -0
- package/dist/codeq/cache.test.js +67 -0
- package/dist/codeq/errors.js +52 -0
- package/dist/codeq/grammars/tree-sitter-go.wasm +0 -0
- package/dist/codeq/grammars/tree-sitter-java.wasm +0 -0
- package/dist/codeq/grammars/tree-sitter-javascript.wasm +0 -0
- package/dist/codeq/grammars/tree-sitter-tsx.wasm +0 -0
- package/dist/codeq/grammars/tree-sitter-typescript.wasm +0 -0
- package/dist/codeq/index.js +11 -0
- package/dist/codeq/resolve.test.js +109 -0
- package/dist/codeq/resolver.js +128 -0
- package/dist/codeq/resolver.test.js +124 -0
- package/dist/codeq/resolvers/go.js +242 -0
- package/dist/codeq/resolvers/go.test.js +143 -0
- package/dist/codeq/resolvers/java.js +349 -0
- package/dist/codeq/resolvers/java.test.js +138 -0
- package/dist/codeq/resolvers/java_queries.js +63 -0
- package/dist/codeq/resolvers/javascript.js +412 -0
- package/dist/codeq/resolvers/javascript.test.js +125 -0
- package/dist/codeq/resolvers/javascript_queries.js +46 -0
- package/dist/codeq/resolvers/typescript.js +366 -0
- package/dist/codeq/resolvers/typescript.test.js +180 -0
- package/dist/codeq/resolvers/typescript_queries.js +78 -0
- package/dist/codeq/signature.js +50 -0
- package/dist/codeq/signature.test.js +50 -0
- package/dist/codeq/suggest.js +96 -0
- package/dist/codeq/treesitter.js +122 -0
- package/dist/codeq/treesitter.test.js +118 -0
- package/dist/config/config.js +74 -0
- package/dist/config/config.test.js +98 -0
- package/dist/config/fs.js +116 -0
- package/dist/config/glob.js +82 -0
- package/dist/config/glob.test.js +61 -0
- package/dist/config/index.js +4 -0
- package/dist/dedup/analyzer/analyzer.js +533 -0
- package/dist/dedup/analyzer/analyzer.test.js +530 -0
- package/dist/dedup/analyzer/canonical.js +74 -0
- package/dist/dedup/analyzer/canonical.test.js +70 -0
- package/dist/dedup/analyzer/cosine_clusters.js +169 -0
- package/dist/dedup/analyzer/cosine_clusters.test.js +131 -0
- package/dist/dedup/analyzer/distinctive.js +85 -0
- package/dist/dedup/analyzer/distinctive.test.js +49 -0
- package/dist/dedup/analyzer/exact_clusters.js +63 -0
- package/dist/dedup/analyzer/exact_clusters.test.js +81 -0
- package/dist/dedup/analyzer/index.js +14 -0
- package/dist/dedup/analyzer/multiplicity.js +110 -0
- package/dist/dedup/analyzer/multiplicity.test.js +123 -0
- package/dist/dedup/analyzer/order.js +22 -0
- package/dist/dedup/analyzer/partial_overlaps.js +65 -0
- package/dist/dedup/analyzer/partial_overlaps.test.js +161 -0
- package/dist/dedup/analyzer/preview.js +84 -0
- package/dist/dedup/analyzer/preview.test.js +46 -0
- package/dist/dedup/analyzer/safety.js +27 -0
- package/dist/dedup/analyzer/safety.test.js +39 -0
- package/dist/dedup/config.js +18 -0
- package/dist/dedup/configload.js +299 -0
- package/dist/dedup/configload.test.js +410 -0
- package/dist/dedup/dedup.index.test.js +203 -0
- package/dist/dedup/dedup.js +143 -0
- package/dist/dedup/dedup.test.js +212 -0
- package/dist/dedup/dedupcfg/config.js +112 -0
- package/dist/dedup/dedupcfg/config.test.js +70 -0
- package/dist/dedup/dedupcfg/index.js +1 -0
- package/dist/dedup/deduptypes/index.js +1 -0
- package/dist/dedup/deduptypes/types.js +9 -0
- package/dist/dedup/deduptypes/types.test.js +34 -0
- package/dist/dedup/embedder/cache.js +23 -0
- package/dist/dedup/embedder/cache.test.js +50 -0
- package/dist/dedup/embedder/constants.js +10 -0
- package/dist/dedup/embedder/embedder.js +76 -0
- package/dist/dedup/embedder/embedder.mock.test.js +128 -0
- package/dist/dedup/embedder/embedder.test.js +96 -0
- package/dist/dedup/embedder/errors.js +20 -0
- package/dist/dedup/embedder/errors.test.js +35 -0
- package/dist/dedup/embedder/index.js +4 -0
- package/dist/dedup/embedder/session.js +78 -0
- package/dist/dedup/embedder/session.test.js +172 -0
- package/dist/dedup/gitignore.js +97 -0
- package/dist/dedup/gitignore.test.js +98 -0
- package/dist/dedup/index.js +11 -0
- package/dist/dedup/indexdb/errors.js +48 -0
- package/dist/dedup/indexdb/index.js +6 -0
- package/dist/dedup/indexdb/indexdb.js +302 -0
- package/dist/dedup/indexdb/indexdb.test.js +739 -0
- package/dist/dedup/indexdb/load.js +110 -0
- package/dist/dedup/indexdb/migrations.js +58 -0
- package/dist/dedup/indexdb/schema.js +83 -0
- package/dist/dedup/indexer/index.js +9 -0
- package/dist/dedup/indexer/indexer.js +501 -0
- package/dist/dedup/indexer/indexer.test.js +510 -0
- package/dist/dedup/indexer/links.js +89 -0
- package/dist/dedup/mdsection/anchor.js +60 -0
- package/dist/dedup/mdsection/anchor.test.js +39 -0
- package/dist/dedup/mdsection/blocks.js +409 -0
- package/dist/dedup/mdsection/blocks.test.js +359 -0
- package/dist/dedup/mdsection/index.js +4 -0
- package/dist/dedup/mdsection/parse.js +21 -0
- package/dist/dedup/mdsection/section.js +234 -0
- package/dist/dedup/mdsection/section.test.js +221 -0
- package/dist/dedup/report/floatfmt.js +71 -0
- package/dist/dedup/report/floatfmt.test.js +42 -0
- package/dist/dedup/report/index.js +8 -0
- package/dist/dedup/report/quote.js +77 -0
- package/dist/dedup/report/quote.test.js +67 -0
- package/dist/dedup/report/text.js +251 -0
- package/dist/dedup/report/text.test.js +420 -0
- package/dist/dedup/report_types.js +8 -0
- package/dist/dedup/sectionid/index.js +1 -0
- package/dist/dedup/sectionid/sectionid.js +16 -0
- package/dist/dedup/sectionid/sectionid.test.js +49 -0
- package/dist/guard/api/errors.js +12 -0
- package/dist/guard/api/index.js +2 -0
- package/dist/guard/api/parser.js +81 -0
- package/dist/guard/api/parser.test.js +58 -0
- package/dist/guard/api/types.js +1 -0
- package/dist/guard/code/errors.js +16 -0
- package/dist/guard/code/index.js +2 -0
- package/dist/guard/code/parser.js +54 -0
- package/dist/guard/code/parser.test.js +111 -0
- package/dist/guard/code/types.js +6 -0
- package/dist/index.js +1 -0
- package/dist/index.test.js +5 -0
- package/dist/repo/boundary.js +92 -0
- package/dist/repo/boundary.test.js +65 -0
- package/dist/repo/errors.js +56 -0
- package/dist/repo/errors.test.js +85 -0
- package/dist/repo/exists.test.js +72 -0
- package/dist/repo/filename.js +46 -0
- package/dist/repo/filename.test.js +39 -0
- package/dist/repo/fs.js +53 -0
- package/dist/repo/index.js +7 -0
- package/dist/repo/overlay.js +36 -0
- package/dist/repo/overlay.test.js +80 -0
- package/dist/repo/repo.js +353 -0
- package/dist/repo/repo.test.js +255 -0
- package/dist/repo/testutil.js +27 -0
- package/dist/repo/write.test.js +125 -0
- package/dist/report/color.js +73 -0
- package/dist/report/index.js +1 -0
- package/dist/report/report.js +112 -0
- package/dist/report/report.test.js +368 -0
- package/dist/violation/index.js +1 -0
- package/dist/violation/types.js +22 -0
- package/dist/violation/types.test.js +70 -0
- package/package.json +48 -0
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
// Port of internal/config/glob.go.
|
|
2
|
+
//
|
|
3
|
+
// Go uses github.com/bmatcuk/doublestar; we use picomatch with { dot: true },
|
|
4
|
+
// which matches doublestar's semantics for the patterns docgov uses:
|
|
5
|
+
// - "**" matches zero or more path segments (so "docs/**" matches "docs"),
|
|
6
|
+
// - dotfiles are matched (doublestar matches them by default),
|
|
7
|
+
// - a malformed pattern contributes no match (doublestar returns an error
|
|
8
|
+
// that InScope skips; picomatch simply does not match it).
|
|
9
|
+
import picomatch from "picomatch";
|
|
10
|
+
import { isNotExist } from "./fs.js";
|
|
11
|
+
/**
 * normalizePattern expands the directory shorthand: a pattern ending in "/"
 * (for example "docs/") becomes the recursive glob "docs/**". Any pattern
 * that does not end in "/" is handed back untouched.
 *
 * @param {string} p - glob pattern as written in the config.
 * @returns {string} the pattern to compile.
 */
function normalizePattern(p) {
    return p.endsWith("/") ? `${p}**` : p;
}
|
|
22
|
+
/**
 * Compiled matchers keyed by normalized glob. A value of null records a
 * pattern that failed to compile, so it is not retried on every call. The
 * matcher options are constant ({ dot: true }), so the glob text alone is a
 * sufficient key. Patterns come from configuration, so the map stays small.
 */
const compiledMatchers = new Map();

/**
 * inScope returns true if slashPath matches ANY of the given glob patterns.
 * Each pattern is normalized by normalizePattern before matching. A pattern
 * that fails to normalize or compile contributes no match (it never throws).
 *
 * Matchers are compiled once per distinct pattern and reused: walkMarkdown
 * calls inScope for every candidate file, and recompiling each glob per file
 * is wasted work.
 *
 * @param {Iterable<string>} patterns - glob patterns; a trailing "/" means "/**".
 * @param {string} slashPath - forward-slash path to test.
 * @returns {boolean} whether at least one pattern matches slashPath.
 */
export function inScope(patterns, slashPath) {
    for (const p of patterns) {
        let matcher = null;
        try {
            const glob = normalizePattern(p);
            if (compiledMatchers.has(glob)) {
                matcher = compiledMatchers.get(glob);
            }
            else {
                try {
                    matcher = picomatch(glob, { dot: true });
                }
                catch {
                    // A pattern that does not compile contributes no match
                    // (mirrors the Go code skipping a doublestar.Match error).
                    matcher = null;
                }
                compiledMatchers.set(glob, matcher);
            }
        }
        catch {
            // normalizePattern rejected the value (e.g. not a string): skip it,
            // exactly as the uncached implementation did.
            continue;
        }
        if (matcher !== null && matcher(slashPath)) {
            return true;
        }
    }
    return false;
}
|
|
44
|
+
/**
 * walkMarkdown recursively lists fsys starting at the root and returns, in
 * sorted order, the slash path of every file that ends in ".md" and satisfies
 * inScope(patterns, path).
 *
 * Directories are read through fsys.readDir (the root is requested as ".").
 * A directory for which readDir fails with a not-exist error is treated as
 * empty; any other readDir error is rethrown to the caller.
 *
 * @param fsys - object exposing async readDir(path) whose entries provide
 *   name() and isDir().
 * @param patterns - glob patterns forwarded to inScope.
 * @returns {Promise<string[]>} sorted in-scope markdown paths.
 */
export async function walkMarkdown(fsys, patterns) {
    const found = [];
    const visit = async (dir) => {
        const atRoot = dir === "";
        let listing;
        try {
            listing = await fsys.readDir(atRoot ? "." : dir);
        }
        catch (err) {
            if (!isNotExist(err)) {
                throw err;
            }
            // Missing directory: nothing to collect below it.
            return;
        }
        for (const item of listing) {
            const name = item.name();
            const path = atRoot ? name : `${dir}/${name}`;
            if (item.isDir()) {
                await visit(path);
            }
            else if (path.endsWith(".md") && inScope(patterns, path)) {
                found.push(path);
            }
        }
    };
    await visit("");
    return found.sort();
}
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
import { describe, expect, it } from "vitest";
|
|
2
|
+
import { MapFS } from "./fs.js";
|
|
3
|
+
import { inScope, walkMarkdown } from "./glob.js";
|
|
4
|
+
describe("inScope", () => {
    // "docs/" is the config grammar's shorthand for "docs/**": everything
    // under the directory is in scope, whatever the depth.
    it("treats a trailing-slash pattern as matching all depths", () => {
        const dirPattern = ["docs/"];
        for (const inside of ["docs/x.md", "docs/a/b.md"]) {
            expect(inScope(dirPattern, inside)).toBe(true);
        }
        expect(inScope(dirPattern, "other/x.md")).toBe(false);
    });
    // One "*" covers a single path segment, so it scopes a guard to exactly
    // one directory level and must reject anything nested deeper.
    it("matches direct children only for a single-star pattern", () => {
        const oneLevel = ["docs/api/*"];
        const directChild = inScope(oneLevel, "docs/api/teams.md");
        const grandChild = inScope(oneLevel, "docs/api/v1/teams.md");
        expect(directChild).toBe(true);
        expect(grandChild).toBe(false);
    });
    // "**" spans any number of segments.
    it("matches any depth for a double-star pattern", () => {
        const nested = "src/a/b.go";
        expect(inScope(["src/**"], nested)).toBe(true);
    });
    // With several patterns, a path that satisfies none of them is out of scope.
    it("returns false when no pattern matches", () => {
        const scopes = ["docs/**", "internal/**"];
        expect(inScope(scopes, "cmd/docgov/main.go")).toBe(false);
    });
    // An empty pattern list puts nothing in scope.
    it("returns false for an empty pattern list", () => {
        const none = [];
        expect(inScope(none, "docs/x.md")).toBe(false);
    });
    // A malformed glob must neither match nor throw, so one bad pattern
    // cannot abort an entire check run.
    it("contributes no match for a malformed pattern", () => {
        const malformed = ["["];
        expect(inScope(malformed, "docs/x.md")).toBe(false);
    });
});
|
|
38
|
+
describe("walkMarkdown", () => {
    // The walk yields the file set every guard pass iterates: in-scope ".md"
    // files only, in sorted order.
    it("returns matching .md files sorted, excluding non-md and out-of-scope", async () => {
        const files = {
            "docs/index.md": "# index",
            "docs/api/teams.md": "# teams",
            "docs/api/readme.txt": "not md",
            "internal/foo.go": "package foo",
            "other/page.md": "# other",
        };
        const got = await walkMarkdown(new MapFS(files), ["docs/**"]);
        const want = ["docs/api/teams.md", "docs/index.md"];
        expect(got).toEqual(want);
    });
    // Being a ".md" file is not enough: it must also match a pattern.
    it("excludes .md files outside the boundary pattern", async () => {
        const files = {
            "docs/index.md": "# index",
            "other/page.md": "# other",
        };
        const got = await walkMarkdown(new MapFS(files), ["docs/**"]);
        expect(got).not.toContain("other/page.md");
    });
});
|
|
@@ -0,0 +1,533 @@
|
|
|
1
|
+
// Ported from internal/dedup/analyzer/analyzer.go.
|
|
2
|
+
//
|
|
3
|
+
// The layered duplicate-detection orchestrator. It takes a flat list of sections
|
|
4
|
+
// plus their L2-normalized embeddings and returns a Report containing
|
|
5
|
+
// high-confidence duplicate groups, a flat list of possible pairs, and (when
|
|
6
|
+
// blocks are supplied) block-level partial overlaps.
|
|
7
|
+
//
|
|
8
|
+
// Algorithm layers (applied in order):
|
|
9
|
+
//
|
|
10
|
+
// L1 — Exact: union sections with equal content_hash (including same-file).
|
|
11
|
+
// L2 — Heading: cosine pair in [thresh_maybe, thresh_high) with >=1 shared
|
|
12
|
+
// distinctive heading token -> promote to HIGH (union).
|
|
13
|
+
// L3 — Cosine: cosine of L2-normalized embeddings; skip same-file pairs.
|
|
14
|
+
// >=thresh_high -> union (HIGH). [thresh_maybe, thresh_high) ->
|
|
15
|
+
// "possible pair" unless promoted by L2. <thresh_maybe -> ignored.
|
|
16
|
+
// L4 — Safety: differentiator word-pair detection. Never blocks grouping;
|
|
17
|
+
// forces recommended_action = manual_review and adds reason.
|
|
18
|
+
// L5 — Blocks: partial-overlap detection over BlockRecords (optional).
|
|
19
|
+
//
|
|
20
|
+
// Determinism: Go's `pairs` and `groupsIdx` maps are iterated then sorted with a
|
|
21
|
+
// NON-stable slices.SortFunc, so within equal sort keys the order is
|
|
22
|
+
// unspecified. To make the TS output reproducible AND match Go on the cases the
|
|
23
|
+
// tests pin, every map iteration here is done in a fixed key order (numeric
|
|
24
|
+
// (i,j) for pairs, group-membership order for groups) before the same
|
|
25
|
+
// comparators run.
|
|
26
|
+
import { canonicalRank, isDisqualified, lessRank, } from "./canonical.js";
|
|
27
|
+
import { buildDistinctiveFilter, distinctiveTokensOf, } from "./distinctive.js";
|
|
28
|
+
import { partialOverlaps } from "./partial_overlaps.js";
|
|
29
|
+
import { computePreview } from "./preview.js";
|
|
30
|
+
import { findDifferentiators } from "./safety.js";
|
|
31
|
+
/**
 * pairKey returns the order-independent map key for a pair of section
 * indices: "lo,hi", where lo is the smaller index and hi the larger.
 */
function pairKey(a, b) {
    return [Math.min(a, b), Math.max(a, b)].join(",");
}
|
|
37
|
+
/**
 * analyze runs the layered duplicate-detection algorithm over sections and
 * their L2-normalized embeddings and returns a structured Report
 * ({ HighGroups, MaybePairs, PartialOverlaps }).
 *
 * Layers, in the order they are applied here:
 *   L1 exact content_hash match (same-file pairs included),
 *   L3 cosine similarity with the L2 distinctive-heading promoter
 *      (same-file pairs skipped),
 *   L4 differentiator check (recorded on each pair, never blocks grouping),
 *   L5 block-level partial overlaps (only when blocks are supplied).
 *
 * Fewer than two sections yields an empty Report immediately.
 *
 * @param sections - flat list of sections; this function reads id, heading,
 *   content_hash, file_path and raw_content.
 * @param embeddings - Map of section id -> L2-normalized vector. A missing
 *   vector makes every cosine involving that section 0 (see dotProduct).
 * @param blocks - flat list of BlockRecords for L5; empty/undefined skips L5.
 * @param blockEmb - Map of content_hash -> block embedding; undefined is
 *   replaced by an empty Map when L5 runs.
 * @param cfg - configuration with Analyzer, Report and Markdown sections.
 * @returns the Report.
 */
export function analyze(sections, embeddings, blocks, blockEmb, cfg) {
    const n = sections.length;
    if (n < 2) {
        return { HighGroups: [], MaybePairs: [], PartialOverlaps: [] };
    }
    const acfg = cfg.Analyzer;
    const rcfg = cfg.Report;
    // Build per-section embedding slice for fast dot-product access.
    const vecs = new Array(n);
    for (let i = 0; i < n; i++) {
        vecs[i] = embeddings.get(sections[i].id);
    }
    // Compute corpus-level distinctive token filter.
    const headings = new Array(n);
    for (let i = 0; i < n; i++) {
        headings[i] = sections[i].heading;
    }
    let minTokLen = cfg.Markdown.heading_token_min_len;
    // Go's unset int is 0, but an unset JS field is undefined and
    // `undefined <= 0` is false. Test for "not a positive number" so that
    // undefined/NaN fall back to the default just like 0 and negatives do.
    if (!(minTokLen > 0)) {
        minTokLen = 3;
    }
    const distinctiveFilter = buildDistinctiveFilter(headings, acfg, minTokLen);
    // Per-section distinctive token sets.
    const secDistTokens = new Array(n);
    for (let i = 0; i < n; i++) {
        secDistTokens[i] = distinctiveTokensOf(sections[i].heading, distinctiveFilter, acfg, minTokLen);
    }
    // Union-find for HIGH grouping.
    const parent = new Array(n);
    for (let i = 0; i < n; i++) {
        parent[i] = i;
    }
    const find = (x) => {
        while (parent[x] !== x) {
            parent[x] = parent[parent[x]]; // path compression
            x = parent[x];
        }
        return x;
    };
    const union = (a, b) => {
        const ra = find(a);
        const rb = find(b);
        if (ra !== rb) {
            parent[ra] = rb;
        }
    };
    // pairs: "i,j" key -> { similarity, reasons, exact, hasDiff, promoted }.
    const pairs = new Map();
    // ----- L1: Exact content_hash match (including same-file) -----
    // hashBuckets keyed by content_hash; insertion-order map iteration mirrors
    // Go's per-bucket inner loops (only union + pair-set, no ordering effect).
    const hashBuckets = new Map();
    for (let i = 0; i < n; i++) {
        const h = sections[i].content_hash;
        const bucket = hashBuckets.get(h);
        if (bucket) {
            bucket.push(i);
        }
        else {
            hashBuckets.set(h, [i]);
        }
    }
    for (const idxs of hashBuckets.values()) {
        if (idxs.length < 2) {
            continue;
        }
        for (let a = 0; a < idxs.length; a++) {
            for (let b = a + 1; b < idxs.length; b++) {
                const ia = idxs[a];
                const ib = idxs[b];
                union(ia, ib);
                pairs.set(pairKey(ia, ib), {
                    similarity: 1.0,
                    reasons: ["exact normalized-content match"],
                    exact: true,
                    hasDiff: false,
                    promoted: false,
                });
            }
        }
    }
    // ----- L3 + L2: Cosine similarity with heading promoter -----
    for (let i = 0; i < n; i++) {
        for (let j = i + 1; j < n; j++) {
            // L3: skip same-file pairs.
            if (sections[i].file_path === sections[j].file_path) {
                continue;
            }
            const score = dotProduct(vecs[i], vecs[j]);
            if (score < acfg.thresh_maybe) {
                continue;
            }
            const k = pairKey(i, j);
            if (pairs.has(k)) {
                // Already set by L1 — don't overwrite.
                continue;
            }
            // Common distinctive heading tokens.
            const commonDist = intersectSets(secDistTokens[i], secDistTokens[j]);
            const headingSignal = commonDist.size > 0;
            let promoted = false;
            const reasons = [];
            if (score >= acfg.thresh_high) {
                reasons.push(`high semantic similarity (cosine=${fmt3(score)})`);
            }
            else {
                reasons.push(`possible semantic similarity (cosine=${fmt3(score)})`);
            }
            if (headingSignal) {
                const tokens = sortedKeys(commonDist);
                reasons.push("shared distinctive heading tokens: " + tokens.join(", "));
            }
            // L4: differentiator check.
            const diffs = findDifferentiators(sections[i].raw_content, sections[j].raw_content, acfg.differentiators);
            for (const d of diffs) {
                reasons.push(d);
            }
            // L2 promotion: MAYBE + distinctive heading match -> HIGH.
            if (score < acfg.thresh_high && headingSignal) {
                promoted = true;
            }
            const tierHigh = score >= acfg.thresh_high || promoted;
            pairs.set(k, {
                similarity: score,
                reasons,
                exact: false,
                hasDiff: diffs.length > 0,
                promoted,
            });
            if (tierHigh) {
                union(i, j);
            }
        }
    }
    // ----- Assemble groups from union-find -----
    // Iterate node indices 0..n-1 so each root's member list is built in
    // ascending node order, matching the deterministic shape needed downstream.
    const groupsIdx = new Map();
    for (let i = 0; i < n; i++) {
        const root = find(i);
        const list = groupsIdx.get(root);
        if (list) {
            list.push(i);
        }
        else {
            groupsIdx.set(root, [i]);
        }
    }
    // groupsIdx is keyed by root; iterate roots in ascending order for a
    // deterministic baseline before sortGroups runs.
    const roots = [...groupsIdx.keys()].sort((a, b) => a - b);
    let groups = [];
    for (const root of roots) {
        const idxs = groupsIdx.get(root);
        if (idxs.length > 1) {
            groups.push(idxs);
        }
    }
    // Drop groups where every member is disqualified.
    groups = filterDisqualifiedGroups(groups, sections, acfg);
    // Build node -> group index map for MAYBE pair filtering. Only "same
    // group or not" is read from it, so building it before sortGroups
    // reorders the groups is fine.
    const nodeToGroup = new Map();
    for (let gi = 0; gi < groups.length; gi++) {
        for (const k of groups[gi]) {
            nodeToGroup.set(k, gi);
        }
    }
    // ----- Possible pairs: MAYBE not unioned, not all-disqualified -----
    // Iterate pairs in sorted (i,j) key order so the pre-sort baseline is
    // deterministic (Go iterates a map, then sorts non-stably by similarity).
    const possiblePairs = [];
    for (const k of sortedPairKeys(pairs)) {
        const info = pairs.get(k);
        if (info.exact) {
            continue;
        }
        if (info.similarity >= acfg.thresh_high || info.promoted) {
            continue;
        }
        const [ki, kj] = parsePairKey(k);
        // Skip if both nodes are in the same group.
        const gi = nodeToGroup.get(ki);
        const gj = nodeToGroup.get(kj);
        if (gi !== undefined && gj !== undefined && gi === gj) {
            continue;
        }
        // Both sides disqualified -> drop.
        if (isDisqualified(sections[ki], acfg) &&
            isDisqualified(sections[kj], acfg)) {
            continue;
        }
        possiblePairs.push({ i: ki, j: kj, info });
    }
    // Sort by descending similarity.
    sortBySimDesc(possiblePairs);
    // ----- Sort groups by (-size, -best_similarity) -----
    sortGroups(groups, pairs);
    // ----- Build Report -----
    const rep = { HighGroups: [], MaybePairs: [], PartialOverlaps: [] };
    for (const idxs of groups) {
        rep.HighGroups.push(buildGroup(idxs, sections, pairs, acfg, rcfg));
    }
    for (const pp of possiblePairs) {
        rep.MaybePairs.push(buildPair(pp.i, pp.j, pp.info, sections, acfg, rcfg));
    }
    // ----- L5: partial-overlap block-level detection -----
    if (blocks && blocks.length > 0) {
        // Build secGroup: section_id -> HIGH group index, from the final
        // (sorted) groups slice. Only multi-member HIGH groups populate it.
        const secGroup = new Map();
        for (let gi = 0; gi < groups.length; gi++) {
            for (const k of groups[gi]) {
                secGroup.set(sections[k].id, gi);
            }
        }
        rep.PartialOverlaps = partialOverlaps(blocks, blockEmb ?? new Map(), secGroup, cfg);
    }
    return rep;
}
|
|
266
|
+
/**
 * buildGroup constructs a Group from a set of section indices.
 *
 * The indices are copied and ordered with sortByRank; the first becomes the
 * canonical member and the rest become duplicate members. Confidence and
 * action are derived from every recorded pair between group members:
 *   - all pairs exact / >= thresh_high / promoted -> "high", replace_with_reference
 *   - otherwise lowest similarity >= thresh_maybe  -> "medium", manual_review
 *   - otherwise                                     -> "low", manual_review
 * A differentiator on any internal pair downgrades replace_with_reference to
 * manual_review without changing the confidence tier.
 *
 * @param idxs - section indices in the group (not mutated).
 * @param sections - all sections, indexed by the values in idxs.
 * @param pairs - Map of "i,j" -> pair info ({ similarity, reasons, exact,
 *   hasDiff, promoted }).
 * @param acfg - analyzer config; thresh_high and thresh_maybe are read here.
 * @param rcfg - report config, forwarded to sectionToMember.
 * @returns {{ Canonical, Members, Confidence, Action }}
 */
function buildGroup(idxs, sections, pairs, acfg, rcfg) {
    // Sort by rank to identify canonical.
    const sorted = idxs.slice();
    sortByRank(sorted, sections, acfg);
    const canonicalIdx = sorted[0];
    // Gather all internal pairs.
    const internalKeys = [];
    for (let a = 0; a < sorted.length; a++) {
        for (let b = a + 1; b < sorted.length; b++) {
            const k = pairKey(sorted[a], sorted[b]);
            if (pairs.has(k)) {
                internalKeys.push(k);
            }
        }
    }
    // Compute confidence and action.
    let minSim = 1.0;
    let hasDiff = false;
    let allHighOrPromoted = true;
    for (const k of internalKeys) {
        const info = pairs.get(k);
        if (info.similarity < minSim) {
            minSim = info.similarity;
        }
        if (info.hasDiff) {
            hasDiff = true;
        }
        if (!info.exact && info.similarity < acfg.thresh_high && !info.promoted) {
            allHighOrPromoted = false;
        }
    }
    // First decide tier from internal pair quality (ignore differentiator).
    let confidence;
    let action;
    if (allHighOrPromoted) {
        confidence = "high";
        action = "replace_with_reference";
    }
    else if (minSim >= acfg.thresh_maybe) {
        confidence = "medium";
        action = "manual_review";
    }
    else {
        confidence = "low";
        action = "manual_review";
    }
    // L4 downgrade: differentiator forces manual_review but never changes tier.
    if (hasDiff && action === "replace_with_reference") {
        action = "manual_review";
    }
    // Build canonical member.
    const canonical = sectionToMember(sections[canonicalIdx], 1.0, null, false, rcfg);
    // Build duplicate members. Walk by position so the "members ranked ahead
    // of this one" prefix is a plain slice, with no per-member index scan.
    const members = [];
    for (let pos = 1; pos < sorted.length; pos++) {
        const di = sorted[pos];
        let info = pairs.get(pairKey(canonicalIdx, di));
        if (info === undefined) {
            // Transitive group member — find best pair involving this section
            // among the members ranked ahead of it.
            info = findBestPairInfo(di, sorted.slice(0, pos), pairs);
        }
        members.push(sectionToMember(sections[di], info.similarity, info.reasons, info.exact, rcfg));
    }
    return { Canonical: canonical, Members: members, Confidence: confidence, Action: action };
}
|
|
333
|
+
/**
 * buildPair constructs the Pair reported for a "possible" (MAYBE) match.
 *
 * The canonical side is chosen by rank: section i is canonical only when its
 * canonicalRank is strictly less than j's according to lessRank; otherwise j
 * is canonical. The candidate member carries the pair's similarity and
 * reasons, which are also repeated on the Pair itself.
 *
 * @returns {{ Canonical, Candidate, Similarity, Reasons }}
 */
function buildPair(i, j, info, sections, acfg, rcfg) {
    const rankI = canonicalRank(sections[i], acfg);
    const rankJ = canonicalRank(sections[j], acfg);
    const [canonIdx, candIdx] = lessRank(rankI, rankJ) ? [i, j] : [j, i];
    const canonical = sectionToMember(sections[canonIdx], 1.0, null, false, rcfg);
    const candidate = sectionToMember(sections[candIdx], info.similarity, info.reasons, false, rcfg);
    return {
        Canonical: canonical,
        Candidate: candidate,
        Similarity: info.similarity,
        Reasons: info.reasons,
    };
}
|
|
357
|
+
/**
 * sectionToMember projects a Section onto the Member shape used in reports,
 * attaching the similarity, reasons and exact-match flag supplied by the
 * caller and a preview computed from the section's raw content.
 *
 * A null/undefined reasons value becomes an empty array.
 */
function sectionToMember(s, sim, reasons, exact, rcfg) {
    const { preview_chars: previewChars, preview_word_ratio: previewWordRatio } = rcfg;
    const preview = computePreview(s.raw_content, previewChars, previewWordRatio);
    // Go passes nil reasons for the canonical member; [] has length 0 and
    // iterates the same way, so consumers that only range over Reasons see
    // no difference.
    const reasonList = reasons ?? [];
    return {
        SectionID: s.id,
        FilePath: s.file_path,
        Anchor: s.anchor,
        Heading: s.heading,
        StartLine: s.start_line,
        EndLine: s.end_line,
        InboundCount: s.inbound_count,
        Similarity: sim,
        Reasons: reasonList,
        ExactMatch: exact,
        Preview: preview,
    };
}
|
|
376
|
+
// ---------- helpers ----------
|
|
377
|
+
/**
 * dotProduct sums a[k] * b[k] over the length of a. For L2-normalized
 * vectors that sum is their cosine similarity.
 *
 * Returns 0 when either vector is undefined, when a is empty, or when b is
 * shorter than a (mirroring the Go dotProduct). Extra trailing elements of b
 * are ignored.
 */
function dotProduct(a, b) {
    const comparable = a !== undefined && b !== undefined && a.length !== 0 && b.length >= a.length;
    if (!comparable) {
        return 0;
    }
    let total = 0;
    let k = 0;
    while (k < a.length) {
        total += a[k] * b[k];
        k += 1;
    }
    return total;
}
|
|
396
|
+
/**
 * intersectSets returns a new Set holding the members of a that are also in
 * b, in a's iteration order. Neither input is modified.
 */
function intersectSets(a, b) {
    return new Set([...a].filter((member) => b.has(member)));
}
|
|
406
|
+
/**
 * sortedKeys copies the members of a string set into a new array ordered by
 * cmpStr. The set itself is left unchanged.
 */
function sortedKeys(m) {
    return Array.from(m).sort(cmpStr);
}
|
|
412
|
+
/**
 * sortedPairKeys returns a new array of the pairs map's keys, ordered
 * numerically by the first index and then by the second index of each key
 * (as decoded by parsePairKey).
 */
function sortedPairKeys(pairs) {
    const byIndices = (left, right) => {
        const [leftI, leftJ] = parsePairKey(left);
        const [rightI, rightJ] = parsePairKey(right);
        // The second index only matters once the first indices agree.
        return leftI === rightI ? leftJ - rightJ : leftI - rightI;
    };
    return Array.from(pairs.keys()).sort(byIndices);
}
|
|
422
|
+
/**
 * parsePairKey decodes an "i,j" key into the numeric pair [i, j], splitting
 * at the first comma.
 */
function parsePairKey(k) {
    const comma = k.indexOf(",");
    const first = k.slice(0, comma);
    const second = k.slice(comma + 1);
    return [Number(first), Number(second)];
}
|
|
427
|
+
/**
 * cmpStr reproduces Go's cmp.Compare for strings: -1 / 0 / +1.
 *
 * Go orders strings byte-wise over UTF-8, which is the same as code point
 * order. JS `<` orders by UTF-16 code unit instead, which places surrogate
 * pairs (U+10000 and above) before U+E000..U+FFFF. To follow code point order
 * for well-formed strings, surrogate code units are ranked above every other
 * code unit at the first position where the two strings differ.
 *
 * @param {string} a
 * @param {string} b
 * @returns {-1 | 0 | 1} Negative when a sorts first, positive when b does.
 */
function cmpStr(a, b) {
    const shared = Math.min(a.length, b.length);
    for (let i = 0; i < shared; i++) {
        const ua = a.charCodeAt(i);
        const ub = b.charCodeAt(i);
        if (ua === ub) {
            continue;
        }
        // Lift surrogates (0xD800..0xDFFF) above 0xFFFF; relative order among
        // surrogates themselves is preserved by the constant offset.
        const ra = ua >= 0xd800 && ua <= 0xdfff ? ua + 0x10000 : ua;
        const rb = ub >= 0xd800 && ub <= 0xdfff ? ub + 0x10000 : ub;
        return ra < rb ? -1 : 1;
    }
    // One string is a prefix of the other: the shorter one sorts first.
    if (a.length < b.length) {
        return -1;
    }
    if (a.length > b.length) {
        return 1;
    }
    return 0;
}
|
|
435
|
+
/**
 * fmt3 formats a number with exactly 3 decimal places, matching Go's "%.3f".
 *
 * Both Go and JS toFixed round using the exact binary value of x, so they can
 * only disagree on exact ties. Go resolves a tie to the even digit; toFixed
 * picks the larger magnitude. A double is an exact tie at the third decimal
 * only when |x| is an odd multiple of 1/16 (e.g. 0.0625, 0.8125), so that case
 * is detected and corrected here.
 *
 * @param {number} x
 * @returns {string} x rendered with three fractional digits.
 */
function fmt3(x) {
    const fixed = x.toFixed(3);
    // Multiplying by 16 is exact in binary floating point, so this test has no
    // false positives. NaN/Infinity fail isInteger and fall through unchanged.
    const sixteenths = Math.abs(x) * 16;
    if (!Number.isInteger(sixteenths) || sixteenths % 2 === 0) {
        return fixed;
    }
    // Exact tie: if toFixed landed on an odd last digit, step back to the even
    // neighbour. An odd digit is at least 1, so the decrement never borrows.
    const last = fixed.charCodeAt(fixed.length - 1) - 48;
    if (last % 2 === 1) {
        return fixed.slice(0, -1) + String(last - 1);
    }
    return fixed;
}
|
|
444
|
+
/**
 * sortBySimDesc reorders `pp` in place so that entries with a larger
 * `info.similarity` come first. Nothing is returned.
 */
function sortBySimDesc(pp) {
    // Operands are swapped when calling cmpNum to get descending order.
    const bySimilarityDescending = (left, right) => {
        return cmpNum(right.info.similarity, left.info.similarity);
    };
    pp.sort(bySimilarityDescending);
}
|
|
448
|
+
/**
 * sortGroups sorts groups by (-size, -best_similarity) (in place).
 *
 * Each group's score is computed once up front: groupScore walks every index
 * pair of a group, and a sort comparator is invoked many times per element,
 * so scoring inside the comparator would repeat that walk on every comparison.
 *
 * @param {number[][]} groups - Groups of section indices; reordered in place.
 * @param {Map<string, object>} pairs - Pair info map passed through to groupScore.
 */
function sortGroups(groups, pairs) {
    const scores = new Map();
    for (const group of groups) {
        scores.set(group, groupScore(group, pairs));
    }
    groups.sort((a, b) => cmpNum(scores.get(a), scores.get(b)));
}
|
|
452
|
+
/**
 * groupScore folds (-size, -best_sim) into one number usable as a sort key.
 * The best similarity is the largest `similarity` among recorded pairs whose
 * two indices both belong to the group; it stays 0 when none is recorded.
 * A smaller score orders earlier: bigger groups first, then the higher best
 * similarity.
 */
function groupScore(idxs, pairs) {
    const size = idxs.length;
    let topSimilarity = 0.0;
    for (let first = 0; first < size; first++) {
        for (let second = first + 1; second < size; second++) {
            const entry = pairs.get(pairKey(idxs[first], idxs[second]));
            if (entry === undefined) {
                continue;
            }
            if (entry.similarity > topSimilarity) {
                topSimilarity = entry.similarity;
            }
        }
    }
    // The size term is scaled by 1e9 so it dominates the similarity term.
    const sizeTerm = -size * 1e9;
    return sizeTerm + -topSimilarity;
}
|
|
468
|
+
/**
 * sortByRank reorders the section indices in `idxs` in place. Each comparison
 * looks up the canonicalRank of both sections and orders them with lessRank;
 * indices whose ranks are not less than each other compare as equal.
 */
function sortByRank(idxs, sections, acfg) {
    const byCanonicalRank = (left, right) => {
        const leftRank = canonicalRank(sections[left], acfg);
        const rightRank = canonicalRank(sections[right], acfg);
        if (lessRank(leftRank, rightRank)) {
            return -1;
        }
        return lessRank(rightRank, leftRank) ? 1 : 0;
    };
    idxs.sort(byCanonicalRank);
}
|
|
480
|
+
/**
 * filterDisqualifiedGroups returns a new array holding only the groups that
 * have at least one member for which isDisqualified is false. Groups whose
 * members are all disqualified, and groups with no members, are left out.
 */
function filterDisqualifiedGroups(groups, sections, acfg) {
    const hasQualifiedMember = (idxs) => {
        return idxs.some((k) => !isDisqualified(sections[k], acfg));
    };
    return groups.filter(hasQualifiedMember);
}
|
|
497
|
+
/**
 * findBestPairInfo looks up the recorded pair between each entry of prevIdxs
 * and target, and returns the one with the greatest similarity. When no
 * recorded pair has a similarity above 0, a placeholder describing a
 * transitive group member is returned instead.
 */
function findBestPairInfo(target, prevIdxs, pairs) {
    const placeholder = {
        similarity: 0.0,
        reasons: ["transitive group member"],
        exact: false,
        hasDiff: false,
        promoted: false,
    };
    let winner = placeholder;
    for (const prev of prevIdxs) {
        const candidate = pairs.get(pairKey(prev, target));
        if (candidate === undefined) {
            continue;
        }
        // Strictly greater: on equal similarity the earlier find is kept.
        if (candidate.similarity > winner.similarity) {
            winner = candidate;
        }
    }
    return winner;
}
|
|
517
|
+
/**
 * indexOf reports the first position at which `target` occurs in `idxs`
 * (strict equality). When `target` does not occur, the array's length is
 * returned rather than -1.
 */
function indexOf(idxs, target) {
    const found = idxs.findIndex((value) => value === target);
    return found === -1 ? idxs.length : found;
}
|
|
526
|
+
/**
 * cmpNum reproduces Go's cmp.Compare for numbers: -1 / 0 / +1.
 *
 * Go's cmp.Compare treats NaN as less than any non-NaN value and equal to
 * another NaN. Plain `<` / `>` are both false whenever NaN is involved, which
 * would report NaN as equal to every number and give sort an inconsistent
 * comparator, so NaN is handled explicitly first.
 *
 * @param {number} a
 * @param {number} b
 * @returns {-1 | 0 | 1} Negative when a orders first, positive when b does.
 */
function cmpNum(a, b) {
    const aIsNaN = Number.isNaN(a);
    const bIsNaN = Number.isNaN(b);
    if (aIsNaN || bIsNaN) {
        if (aIsNaN && bIsNaN) {
            return 0;
        }
        return aIsNaN ? -1 : 1;
    }
    if (a < b) {
        return -1;
    }
    if (a > b) {
        return 1;
    }
    return 0;
}
|