npm - docsgov - Versions diffs - 0.1.0 - Mend

docsgov 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (159) hide show

package/README.md +242 -0
package/dist/apispec/apispec.js +401 -0
package/dist/apispec/apispec.test.js +444 -0
package/dist/apispec/errors.js +17 -0
package/dist/apispec/index.js +2 -0
package/dist/check/doclinks.js +167 -0
package/dist/check/index.js +8 -0
package/dist/check/run.js +391 -0
package/dist/check/run.test.js +513 -0
package/dist/check/suggest.js +134 -0
package/dist/check/suggest.test.js +92 -0
package/dist/check/tokens.js +125 -0
package/dist/cmd/main.js +330 -0
package/dist/cmd/main.test.js +422 -0
package/dist/codeq/cache.js +71 -0
package/dist/codeq/cache.test.js +67 -0
package/dist/codeq/errors.js +52 -0
package/dist/codeq/grammars/tree-sitter-go.wasm +0 -0
package/dist/codeq/grammars/tree-sitter-java.wasm +0 -0
package/dist/codeq/grammars/tree-sitter-javascript.wasm +0 -0
package/dist/codeq/grammars/tree-sitter-tsx.wasm +0 -0
package/dist/codeq/grammars/tree-sitter-typescript.wasm +0 -0
package/dist/codeq/index.js +11 -0
package/dist/codeq/resolve.test.js +109 -0
package/dist/codeq/resolver.js +128 -0
package/dist/codeq/resolver.test.js +124 -0
package/dist/codeq/resolvers/go.js +242 -0
package/dist/codeq/resolvers/go.test.js +143 -0
package/dist/codeq/resolvers/java.js +349 -0
package/dist/codeq/resolvers/java.test.js +138 -0
package/dist/codeq/resolvers/java_queries.js +63 -0
package/dist/codeq/resolvers/javascript.js +412 -0
package/dist/codeq/resolvers/javascript.test.js +125 -0
package/dist/codeq/resolvers/javascript_queries.js +46 -0
package/dist/codeq/resolvers/typescript.js +366 -0
package/dist/codeq/resolvers/typescript.test.js +180 -0
package/dist/codeq/resolvers/typescript_queries.js +78 -0
package/dist/codeq/signature.js +50 -0
package/dist/codeq/signature.test.js +50 -0
package/dist/codeq/suggest.js +96 -0
package/dist/codeq/treesitter.js +122 -0
package/dist/codeq/treesitter.test.js +118 -0
package/dist/config/config.js +74 -0
package/dist/config/config.test.js +98 -0
package/dist/config/fs.js +116 -0
package/dist/config/glob.js +82 -0
package/dist/config/glob.test.js +61 -0
package/dist/config/index.js +4 -0
package/dist/dedup/analyzer/analyzer.js +533 -0
package/dist/dedup/analyzer/analyzer.test.js +530 -0
package/dist/dedup/analyzer/canonical.js +74 -0
package/dist/dedup/analyzer/canonical.test.js +70 -0
package/dist/dedup/analyzer/cosine_clusters.js +169 -0
package/dist/dedup/analyzer/cosine_clusters.test.js +131 -0
package/dist/dedup/analyzer/distinctive.js +85 -0
package/dist/dedup/analyzer/distinctive.test.js +49 -0
package/dist/dedup/analyzer/exact_clusters.js +63 -0
package/dist/dedup/analyzer/exact_clusters.test.js +81 -0
package/dist/dedup/analyzer/index.js +14 -0
package/dist/dedup/analyzer/multiplicity.js +110 -0
package/dist/dedup/analyzer/multiplicity.test.js +123 -0
package/dist/dedup/analyzer/order.js +22 -0
package/dist/dedup/analyzer/partial_overlaps.js +65 -0
package/dist/dedup/analyzer/partial_overlaps.test.js +161 -0
package/dist/dedup/analyzer/preview.js +84 -0
package/dist/dedup/analyzer/preview.test.js +46 -0
package/dist/dedup/analyzer/safety.js +27 -0
package/dist/dedup/analyzer/safety.test.js +39 -0
package/dist/dedup/config.js +18 -0
package/dist/dedup/configload.js +299 -0
package/dist/dedup/configload.test.js +410 -0
package/dist/dedup/dedup.index.test.js +203 -0
package/dist/dedup/dedup.js +143 -0
package/dist/dedup/dedup.test.js +212 -0
package/dist/dedup/dedupcfg/config.js +112 -0
package/dist/dedup/dedupcfg/config.test.js +70 -0
package/dist/dedup/dedupcfg/index.js +1 -0
package/dist/dedup/deduptypes/index.js +1 -0
package/dist/dedup/deduptypes/types.js +9 -0
package/dist/dedup/deduptypes/types.test.js +34 -0
package/dist/dedup/embedder/cache.js +23 -0
package/dist/dedup/embedder/cache.test.js +50 -0
package/dist/dedup/embedder/constants.js +10 -0
package/dist/dedup/embedder/embedder.js +76 -0
package/dist/dedup/embedder/embedder.mock.test.js +128 -0
package/dist/dedup/embedder/embedder.test.js +96 -0
package/dist/dedup/embedder/errors.js +20 -0
package/dist/dedup/embedder/errors.test.js +35 -0
package/dist/dedup/embedder/index.js +4 -0
package/dist/dedup/embedder/session.js +78 -0
package/dist/dedup/embedder/session.test.js +172 -0
package/dist/dedup/gitignore.js +97 -0
package/dist/dedup/gitignore.test.js +98 -0
package/dist/dedup/index.js +11 -0
package/dist/dedup/indexdb/errors.js +48 -0
package/dist/dedup/indexdb/index.js +6 -0
package/dist/dedup/indexdb/indexdb.js +302 -0
package/dist/dedup/indexdb/indexdb.test.js +739 -0
package/dist/dedup/indexdb/load.js +110 -0
package/dist/dedup/indexdb/migrations.js +58 -0
package/dist/dedup/indexdb/schema.js +83 -0
package/dist/dedup/indexer/index.js +9 -0
package/dist/dedup/indexer/indexer.js +501 -0
package/dist/dedup/indexer/indexer.test.js +510 -0
package/dist/dedup/indexer/links.js +89 -0
package/dist/dedup/mdsection/anchor.js +60 -0
package/dist/dedup/mdsection/anchor.test.js +39 -0
package/dist/dedup/mdsection/blocks.js +409 -0
package/dist/dedup/mdsection/blocks.test.js +359 -0
package/dist/dedup/mdsection/index.js +4 -0
package/dist/dedup/mdsection/parse.js +21 -0
package/dist/dedup/mdsection/section.js +234 -0
package/dist/dedup/mdsection/section.test.js +221 -0
package/dist/dedup/report/floatfmt.js +71 -0
package/dist/dedup/report/floatfmt.test.js +42 -0
package/dist/dedup/report/index.js +8 -0
package/dist/dedup/report/quote.js +77 -0
package/dist/dedup/report/quote.test.js +67 -0
package/dist/dedup/report/text.js +251 -0
package/dist/dedup/report/text.test.js +420 -0
package/dist/dedup/report_types.js +8 -0
package/dist/dedup/sectionid/index.js +1 -0
package/dist/dedup/sectionid/sectionid.js +16 -0
package/dist/dedup/sectionid/sectionid.test.js +49 -0
package/dist/guard/api/errors.js +12 -0
package/dist/guard/api/index.js +2 -0
package/dist/guard/api/parser.js +81 -0
package/dist/guard/api/parser.test.js +58 -0
package/dist/guard/api/types.js +1 -0
package/dist/guard/code/errors.js +16 -0
package/dist/guard/code/index.js +2 -0
package/dist/guard/code/parser.js +54 -0
package/dist/guard/code/parser.test.js +111 -0
package/dist/guard/code/types.js +6 -0
package/dist/index.js +1 -0
package/dist/index.test.js +5 -0
package/dist/repo/boundary.js +92 -0
package/dist/repo/boundary.test.js +65 -0
package/dist/repo/errors.js +56 -0
package/dist/repo/errors.test.js +85 -0
package/dist/repo/exists.test.js +72 -0
package/dist/repo/filename.js +46 -0
package/dist/repo/filename.test.js +39 -0
package/dist/repo/fs.js +53 -0
package/dist/repo/index.js +7 -0
package/dist/repo/overlay.js +36 -0
package/dist/repo/overlay.test.js +80 -0
package/dist/repo/repo.js +353 -0
package/dist/repo/repo.test.js +255 -0
package/dist/repo/testutil.js +27 -0
package/dist/repo/write.test.js +125 -0
package/dist/report/color.js +73 -0
package/dist/report/index.js +1 -0
package/dist/report/report.js +112 -0
package/dist/report/report.test.js +368 -0
package/dist/violation/index.js +1 -0
package/dist/violation/types.js +22 -0
package/dist/violation/types.test.js +70 -0
package/package.json +48 -0

package/dist/dedup/configload.test.js ADDED Viewed

@@ -0,0 +1,410 @@
+/**
+ * Behavior-encoding tests for the dedup config facade, ported from
+ * internal/dedup/config_test.go (locked defaults) and
+ * internal/dedup/configload_test.go (overlay + validation).
+ *
+ * WHY: the locked defaults are a parity contract against the Go binary and the
+ * Python POC — any drift silently changes every analysis. The overlay rules
+ * (scalars override, lists replace, *_extra appends) and the validation ranges
+ * are the user-facing config behavior; a regression here either ignores a user's
+ * config.yml or admits an out-of-range value that misbehaves deep in the
+ * analyzer. Type-mismatch must throw a matchable error (Go's *yaml.TypeError →
+ * YAMLTypeError) so the CLI can distinguish "your YAML is wrong" from a bug.
+ */
+import { mkdtempSync, mkdirSync, rmSync, writeFileSync } from "node:fs";
+import { tmpdir } from "node:os";
+import { join } from "node:path";
+import { afterEach, describe, expect, it } from "vitest";
+import { Default } from "./config.js";
+import { Load, YAMLTypeError } from "./configload.js";
+const tmpDirs = []; // temp repo roots handed out by newRepo(); drained after every test
+afterEach(() => {
+    for (const d of tmpDirs.splice(0)) { // splice(0) removes and returns every entry, leaving tmpDirs empty
+        rmSync(d, { recursive: true, force: true }); // delete the whole tree; force tolerates an already-missing path
+    }
+});
+/** newRepo returns a fresh temp repo root with no .docgov/dedup; the directory is recorded in tmpDirs so the afterEach hook removes it. @returns {string} path of the new directory */
+function newRepo() {
+    const dir = mkdtempSync(join(tmpdir(), "dedup-cfg-")); // unique dir under the OS temp dir, name prefixed "dedup-cfg-"
+    tmpDirs.push(dir); // register for cleanup
+    return dir;
+}
+/** writeConfigYML writes config.yml under repoRoot/.docgov/dedup. @param {string} repoRoot repo root directory (e.g. from newRepo()) @param {string} yaml text written verbatim as the file contents */
+function writeConfigYML(repoRoot, yaml) {
+    const dedupDir = join(repoRoot, ".docgov", "dedup");
+    mkdirSync(dedupDir, { recursive: true }); // creates missing parents; no error if the directory already exists
+    writeFileSync(join(dedupDir, "config.yml"), yaml); // yaml is written as-is, no validation at this point
+}
+describe("Default", () => { // pins the values returned by Default() from ./config.js; each test takes a fresh call
+    // WHY: these scalars are the locked v1 tuning; drift changes every run. toBe compares with Object.is, so the floats below must match exactly.
+    it("has the locked scalar values", () => {
+        const cfg = Default();
+        expect(cfg.Markdown.min_prose_words).toBe(10);
+        expect(cfg.Markdown.heading_token_min_len).toBe(3);
+        expect(cfg.Markdown.hidden_dir_prefix).toBe(".");
+        expect(cfg.Indexer.embed_progress_threshold).toBe(100);
+        expect(cfg.Analyzer.thresh_high).toBe(0.93);
+        expect(cfg.Analyzer.thresh_maybe).toBe(0.86);
+        expect(cfg.Analyzer.distinctive_abs_min).toBe(3);
+        expect(cfg.Analyzer.distinctive_pct_of_headings).toBe(0.03);
+        expect(cfg.Report.preview_chars).toBe(280);
+        expect(cfg.Report.preview_word_ratio).toBe(0.6);
+        expect(cfg.Report.wrap_cols).toBe(72);
+        expect(cfg.Report.separator).toBe("---");
+        expect(cfg.Embedder.batch_size).toBe(32);
+    });
+    // WHY: the ignored-dirs / stopword / differentiator / path lists are locked
+    // and load-bearing — the indexer skips dirs and the analyzer's safety net
+    // depends on these exact entries and order. toEqual compares element by element, so a reordering or an extra entry fails.
+    it("has the locked list values", () => {
+        const cfg = Default();
+        expect(cfg.Markdown.ignored_dirs).toEqual([
+            ".git", "node_modules", "vendor", "dist", "build",
+            ".next", ".cache", ".docgov", "dedup-poc", ".venv",
+        ]);
+        expect(cfg.Analyzer.universal_stopwords).toEqual([
+            "the", "a", "an", "of", "and", "or", "to", "with",
+            "for", "in", "on", "is", "are", "be", "by", "from", "as", "at",
+        ]);
+        expect(cfg.Analyzer.differentiators).toEqual([
+            ["calendar days", "business days"],
+            ["sync", "async"],
+            ["create", "cancel"],
+            ["source", "target"],
+            ["inbound", "outbound"],
+            ["success", "failure"],
+            ["old flow", "new flow"],
+            ["deprecated", "current"],
+        ]);
+        expect(cfg.Analyzer.path_priority).toEqual([
+            "docs/concepts/", "docs/architecture/", "docs/design/",
+        ]);
+        expect(cfg.Analyzer.heading_blacklist).toEqual([
+            "related", "template rendering", "template sample",
+        ]);
+        expect(cfg.Analyzer.path_blacklist).toEqual([
+            "changelog", "migration", "deprecated", "old", "legacy", "temporary",
+        ]);
+        expect(cfg.Indexer.external_url_prefixes).toEqual([
+            "http://", "https://", "mailto:",
+        ]);
+    });
+});
+describe("Load", () => { // each test builds its own temp repo via newRepo(); config.yml, when present, is written by writeConfigYML
+    // WHY: no config.yml must yield the locked defaults, not an error or partial (spot-checked on two fields, not a full deep comparison).
+    it("returns Default() when no YAML exists", async () => {
+        const cfg = await Load(newRepo());
+        const want = Default();
+        expect(cfg.Analyzer.thresh_high).toBe(want.Analyzer.thresh_high);
+        expect(cfg.Markdown.min_prose_words).toBe(want.Markdown.min_prose_words);
+    });
+    // WHY: a single overridden scalar must change ONLY that field; everything else
+    // must stay at default (overlay, not replace-whole-config).
+    it("overlays a single scalar and leaves others at default", async () => {
+        const repo = newRepo();
+        writeConfigYML(repo, "analyzer:\n  thresh_high: 0.95\n");
+        const cfg = await Load(repo);
+        expect(cfg.Analyzer.thresh_high).toBe(0.95);
+        const want = Default();
+        expect(cfg.Analyzer.thresh_maybe).toBe(want.Analyzer.thresh_maybe);
+        expect(cfg.Markdown.min_prose_words).toBe(want.Markdown.min_prose_words);
+    });
+    // WHY: a list set in config.yml REPLACES the default list entirely (it does
+    // not merge) — the user is opting out of the defaults for that field.
+    it("replaces a list field entirely when set", async () => {
+        const repo = newRepo();
+        writeConfigYML(repo, 'analyzer:\n  heading_blacklist: ["custom"]\n');
+        const cfg = await Load(repo);
+        expect(cfg.Analyzer.heading_blacklist).toEqual(["custom"]);
+    });
+    // WHY: *_extra fields APPEND to the (possibly replaced) base list — the user
+    // adds to the defaults without restating them.
+    it("appends *_extra entries to the base list", async () => {
+        const repo = newRepo();
+        writeConfigYML(repo, 'markdown:\n  ignored_dirs_extra: ["mydir"]\n');
+        const cfg = await Load(repo);
+        const defaults = Default();
+        expect(cfg.Markdown.ignored_dirs).toHaveLength(defaults.Markdown.ignored_dirs.length + 1);
+        expect(cfg.Markdown.ignored_dirs.at(-1)).toBe("mydir");
+    });
+    // WHY: malformed YAML must be a hard error — never a silent fallback to
+    // defaults that would mask the user's broken config.
+    it("throws on malformed YAML (tab indentation)", async () => {
+        const repo = newRepo();
+        writeConfigYML(repo, "analyzer:\n\tthresh_high: 0.95\n");
+        await expect(Load(repo)).rejects.toThrow();
+    });
+    // WHY: a type mismatch must throw a MATCHABLE error (Go's *yaml.TypeError →
+    // instanceof YAMLTypeError) so the CLI can report "your config.yml has the
+    // wrong type" distinctly from an internal bug. A regression to a plain throw
+    // (or silent coercion) breaks that distinction.
+    it("throws YAMLTypeError on a type mismatch", async () => {
+        const repo = newRepo();
+        writeConfigYML(repo, 'analyzer:\n  thresh_high: "not a number"\n');
+        await expect(Load(repo)).rejects.toBeInstanceOf(YAMLTypeError);
+    });
+    // WHY: validation must reject thresh_high >= 1 (the cosine cap can never be 1).
+    it("rejects thresh_high >= 1", async () => {
+        const repo = newRepo();
+        writeConfigYML(repo, "analyzer:\n  thresh_high: 1.0\n");
+        await expect(Load(repo)).rejects.toThrow();
+    });
+    // WHY: thresh_maybe must stay strictly below thresh_high or the MAYBE band
+    // collapses.
+    it("rejects thresh_maybe >= thresh_high", async () => {
+        const repo = newRepo();
+        writeConfigYML(repo, "analyzer:\n  thresh_maybe: 0.97\n  thresh_high: 0.95\n");
+        await expect(Load(repo)).rejects.toThrow();
+    });
+    // WHY: validation runs AFTER the overlay is applied (not on defaults) so a
+    // valid-syntax overlay that produces an invalid value is still caught.
+    it("validates after overlay: rejects min_prose_words = 0", async () => {
+        const repo = newRepo();
+        writeConfigYML(repo, "markdown:\n  min_prose_words: 0\n");
+        await expect(Load(repo)).rejects.toThrow();
+    });
+    // WHY: same — an overlaid batch_size of 0 must fail at load, not later.
+    it("validates after overlay: rejects batch_size = 0", async () => {
+        const repo = newRepo();
+        writeConfigYML(repo, "embedder:\n  batch_size: 0\n");
+        await expect(Load(repo)).rejects.toThrow();
+    });
+    // WHY: each block.* knob became user-overridable; an out-of-range value (a
+    // cosine_threshold that can never match, or a zero min_words/cap) must fail
+    // loudly at load rather than silently misbehaving inside indexer/analyzer.
+    it.each([
+        ["min_words = 0", "block:\n  min_words: 0\n"],
+        ["table_min_rows = 0", "block:\n  table_min_rows: 0\n"],
+        ["cosine_threshold >= 1", "block:\n  cosine_threshold: 1.0\n"],
+        ["cosine_threshold <= 0", "block:\n  cosine_threshold: 0\n"],
+        ["multiplicity_cap = 0", "block:\n  multiplicity_cap: 0\n"],
+    ])("rejects bad block config: %s", async (_name, yaml) => { // %s is filled with each row's first element; the second is the YAML under test
+        const repo = newRepo();
+        writeConfigYML(repo, yaml);
+        await expect(Load(repo)).rejects.toThrow();
+    });
+    // WHY: an empty / null document is a valid file that means "no overrides"; it
+    // must yield defaults, not error — a user may leave a placeholder config.yml. Exercised here with a zero-byte file only.
+    it("treats an empty/null document as a no-op overlay (defaults)", async () => {
+        const repo = newRepo();
+        writeConfigYML(repo, "");
+        const cfg = await Load(repo);
+        expect(cfg.Analyzer.thresh_high).toBe(Default().Analyzer.thresh_high);
+    });
+    // WHY: the top-level document must be a mapping; a scalar or sequence at the
+    // root is a structural mistake the CLI must report as a YAML type error, not
+    // silently ignore.
+    it("throws YAMLTypeError when the root document is not a mapping", async () => {
+        const repo = newRepo();
+        writeConfigYML(repo, "- a\n- b\n"); // a sequence, not a mapping
+        await expect(Load(repo)).rejects.toBeInstanceOf(YAMLTypeError);
+    });
+    // WHY: a section key (e.g. markdown:) whose value is a scalar instead of a
+    // sub-mapping is malformed; it must throw a matchable type error rather than
+    // be coerced or skipped.
+    it("throws YAMLTypeError when a section is not a mapping", async () => {
+        const repo = newRepo();
+        writeConfigYML(repo, "markdown: 5\n");
+        await expect(Load(repo)).rejects.toBeInstanceOf(YAMLTypeError);
+    });
+    // WHY: a present-but-empty section (markdown: with no children) is a no-op
+    // overlay; it must leave every field at default rather than wipe the section.
+    it("treats a present-but-empty section as no overlay", async () => {
+        const repo = newRepo();
+        writeConfigYML(repo, "markdown:\n");
+        const cfg = await Load(repo);
+        expect(cfg.Markdown.min_prose_words).toBe(Default().Markdown.min_prose_words);
+        // ignored_dirs must be unchanged (no spurious *_extra append of nothing).
+        expect(cfg.Markdown.ignored_dirs).toEqual(Default().Markdown.ignored_dirs);
+    });
+    // WHY: unknown keys are tolerated (yaml.v3 is lenient unless the decoder opts
+    // into KnownFields(true) — presumably the Go loader does not; TODO confirm); a
+    // typo'd or forward-compatible key must not fail the load, so older binaries read newer configs.
+    it("ignores unknown keys", async () => {
+        const repo = newRepo();
+        writeConfigYML(repo, "markdown:\n  bogus_key: 1\nfuture_section:\n  x: 1\n");
+        const cfg = await Load(repo);
+        expect(cfg.Markdown.min_prose_words).toBe(Default().Markdown.min_prose_words);
+    });
+    // WHY: every section's scalar overlays must reach the right field; if the
+    // overlay mis-wired a section, a user's tuning would silently apply elsewhere
+    // or nowhere. This exercises the indexer/report/embedder/block scalar paths.
+    it("overlays scalars across indexer, report, embedder, and block sections", async () => {
+        const repo = newRepo();
+        writeConfigYML(repo, [
+            "indexer:",
+            "  embed_progress_threshold: 5",
+            "  max_workers: 2",
+            "report:",
+            "  preview_chars: 100",
+            "  preview_word_ratio: 0.5",
+            "  wrap_cols: 80",
+            "  separator: '***'",
+            "embedder:",
+            "  batch_size: 16",
+            "block:",
+            "  min_words: 4",
+            "  table_min_rows: 2",
+            "  cosine_threshold: 0.8",
+            "  multiplicity_cap: 5",
+            "",
+        ].join("\n"));
+        const cfg = await Load(repo);
+        expect(cfg.Indexer.embed_progress_threshold).toBe(5);
+        expect(cfg.Indexer.max_workers).toBe(2);
+        expect(cfg.Report.preview_chars).toBe(100);
+        expect(cfg.Report.preview_word_ratio).toBe(0.5);
+        expect(cfg.Report.wrap_cols).toBe(80);
+        expect(cfg.Report.separator).toBe("***");
+        expect(cfg.Embedder.batch_size).toBe(16);
+        expect(cfg.Block.min_words).toBe(4);
+        expect(cfg.Block.table_min_rows).toBe(2);
+        expect(cfg.Block.cosine_threshold).toBe(0.8);
+        expect(cfg.Block.multiplicity_cap).toBe(5);
+    });
+    // WHY: the analyzer's remaining scalar/list knobs (distinctive bounds, the
+    // stopword / path lists) must overlay correctly; these gate the analyzer's
+    // distinctiveness and priority logic.
+    it("overlays the remaining analyzer scalars and list fields", async () => {
+        const repo = newRepo();
+        writeConfigYML(repo, [
+            "analyzer:",
+            "  distinctive_abs_min: 5",
+            "  distinctive_pct_of_headings: 0.1",
+            "  universal_stopwords: ['x', 'y']",
+            "  path_priority: ['docs/x/']",
+            "  path_blacklist: ['legacy']",
+            "",
+        ].join("\n"));
+        const cfg = await Load(repo);
+        expect(cfg.Analyzer.distinctive_abs_min).toBe(5);
+        expect(cfg.Analyzer.distinctive_pct_of_headings).toBe(0.1);
+        expect(cfg.Analyzer.universal_stopwords).toEqual(["x", "y"]);
+        expect(cfg.Analyzer.path_priority).toEqual(["docs/x/"]);
+        expect(cfg.Analyzer.path_blacklist).toEqual(["legacy"]);
+    });
+    // WHY: markdown's own scalar/list knobs (hidden_dir_prefix,
+    // heading_token_min_len, ignored_dirs replace) must overlay; the indexer's
+    // dir-skip and heading tokenizer depend on these.
+    it("overlays markdown scalar and list fields", async () => {
+        const repo = newRepo();
+        writeConfigYML(repo, [
+            "markdown:",
+            "  heading_token_min_len: 4",
+            "  hidden_dir_prefix: '_'",
+            "  ignored_dirs: ['only']",
+            "",
+        ].join("\n"));
+        const cfg = await Load(repo);
+        expect(cfg.Markdown.heading_token_min_len).toBe(4);
+        expect(cfg.Markdown.hidden_dir_prefix).toBe("_");
+        expect(cfg.Markdown.ignored_dirs).toEqual(["only"]);
+    });
+    // WHY: indexer.external_url_prefixes is a list that replaces; the link
+    // resolver uses it to skip external URLs, so a user override must take effect.
+    it("replaces indexer.external_url_prefixes", async () => {
+        const repo = newRepo();
+        writeConfigYML(repo, "indexer:\n  external_url_prefixes: ['ftp://']\n");
+        const cfg = await Load(repo);
+        expect(cfg.Indexer.external_url_prefixes).toEqual(["ftp://"]);
+    });
+    // WHY: differentiators are pairs of strings the analyzer uses to keep opposite
+    // concepts apart; a well-formed pair list must parse into [a,b] tuples.
+    it("parses analyzer.differentiators as string pairs", async () => {
+        const repo = newRepo();
+        writeConfigYML(repo, "analyzer:\n  differentiators:\n    - ['a', 'b']\n    - ['c', 'd']\n");
+        const cfg = await Load(repo);
+        expect(cfg.Analyzer.differentiators).toEqual([
+            ["a", "b"],
+            ["c", "d"],
+        ]);
+    });
+    // WHY: a differentiator entry that is not a 2-element sequence is malformed;
+    // it must throw a matchable type error so the user fixes the pair, not a
+    // confusing failure later in the analyzer.
+    it("throws YAMLTypeError when a differentiator is not a 2-element sequence", async () => {
+        const repo = newRepo();
+        writeConfigYML(repo, "analyzer:\n  differentiators:\n    - ['only-one']\n");
+        await expect(Load(repo)).rejects.toBeInstanceOf(YAMLTypeError);
+    });
+    // WHY: a differentiator pair whose elements are not both strings is malformed;
+    // must be a matchable type error.
+    it("throws YAMLTypeError when a differentiator element is not a string", async () => {
+        const repo = newRepo();
+        writeConfigYML(repo, "analyzer:\n  differentiators:\n    - ['a', 5]\n");
+        await expect(Load(repo)).rejects.toBeInstanceOf(YAMLTypeError);
+    });
+    // WHY: a list field given a non-sequence value is a type error; a scalar where
+    // a list is expected must throw matchably, not coerce.
+    it("throws YAMLTypeError when a list field is given a scalar", async () => {
+        const repo = newRepo();
+        writeConfigYML(repo, "markdown:\n  ignored_dirs: oops\n");
+        await expect(Load(repo)).rejects.toBeInstanceOf(YAMLTypeError);
+    });
+    // WHY: a list element of the wrong type (number inside a string list) must be
+    // caught per-element with the index in the message, so the user finds the bad
+    // entry. NOTE(review): only the error class is asserted here; the message (and its index) is not checked.
+    it("throws YAMLTypeError when a list element is the wrong type", async () => {
+        const repo = newRepo();
+        writeConfigYML(repo, "markdown:\n  ignored_dirs: ['ok', 5]\n");
+        await expect(Load(repo)).rejects.toBeInstanceOf(YAMLTypeError);
+    });
+    // WHY: a string field given a non-string (number) must throw matchably; the
+    // separator is rendered verbatim into reports, so a number would corrupt them.
+    it("throws YAMLTypeError when a string field is given a number", async () => {
+        const repo = newRepo();
+        writeConfigYML(repo, "report:\n  separator: 5\n");
+        await expect(Load(repo)).rejects.toBeInstanceOf(YAMLTypeError);
+    });
+    // WHY: heading_blacklist_extra appends to the (possibly replaced) base list,
+    // mirroring ignored_dirs_extra; the analyzer's heading filter must see both
+    // the base and the user's additions.
+    it("appends analyzer.heading_blacklist_extra to the base list", async () => {
+        const repo = newRepo();
+        writeConfigYML(repo, "analyzer:\n  heading_blacklist_extra: ['custom-h']\n");
+        const cfg = await Load(repo);
+        const base = Default().Analyzer.heading_blacklist;
+        expect(cfg.Analyzer.heading_blacklist).toHaveLength(base.length + 1);
+        expect(cfg.Analyzer.heading_blacklist.at(-1)).toBe("custom-h");
+    });
+    // WHY: *_extra appends to the REPLACED list, not the default — a user who both
+    // replaces and extends must get exactly their replacement plus their extras.
+    it("appends *_extra onto a replaced base list (not the default)", async () => {
+        const repo = newRepo();
+        writeConfigYML(repo, "markdown:\n  ignored_dirs: ['base']\n  ignored_dirs_extra: ['more']\n");
+        const cfg = await Load(repo);
+        expect(cfg.Markdown.ignored_dirs).toEqual(["base", "more"]);
+    });
+    // WHY: validation guards the remaining ranges too; an out-of-range value for
+    // any of these must fail at load rather than misbehave deep in the pipeline.
+    it.each([
+        ["heading_token_min_len = 0", "markdown:\n  heading_token_min_len: 0\n"],
+        ["distinctive_abs_min = 0", "analyzer:\n  distinctive_abs_min: 0\n"],
+        [
+            "distinctive_pct_of_headings = 0",
+            "analyzer:\n  distinctive_pct_of_headings: 0\n",
+        ],
+        [
+            "distinctive_pct_of_headings > 1",
+            "analyzer:\n  distinctive_pct_of_headings: 1.5\n",
+        ],
+        ["preview_chars = 0", "report:\n  preview_chars: 0\n"],
+        ["wrap_cols = 0", "report:\n  wrap_cols: 0\n"],
+        ["thresh_maybe = 0", "analyzer:\n  thresh_maybe: 0\n"],
+    ])("rejects out-of-range value: %s", async (_name, yaml) => { // %s is filled with each row's first element; the second is the YAML under test
+        const repo = newRepo();
+        writeConfigYML(repo, yaml);
+        await expect(Load(repo)).rejects.toThrow();
+    });
+    // WHY: an I/O fault other than "file missing" (here: config.yml is a
+    // directory, which yields EISDIR on read) must surface as a wrapped error with
+    // the cause attached — never silently fall back to defaults (Rule 12). NOTE(review): only the message prefix is asserted below; the cause is not inspected.
+    it("throws (not falls back) when config.yml cannot be read for a non-ENOENT reason", async () => {
+        const repo = newRepo();
+        const dedupDir = join(repo, ".docgov", "dedup");
+        mkdirSync(dedupDir, { recursive: true });
+        // Make config.yml a directory so reading it fails with EISDIR, not ENOENT.
+        mkdirSync(join(dedupDir, "config.yml"));
+        await expect(Load(repo)).rejects.toThrow(/configload\.Load: read/);
+    });
+});

package/dist/dedup/dedup.index.test.js ADDED Viewed

@@ -0,0 +1,203 @@
+// Drives the REAL dedup Index() facade end-to-end with the transformers model
+// mocked, so the whole Index path (config load, docs walk, section/block
+// extraction, embed via the real Embedder/Session, persist to index.db) runs
+// against fake vectors with no ~1GB download.
+//
+// vi.mock is hoisted above the imports and intercepts the dynamic
+// `await import("@huggingface/transformers")` inside session.ts, which
+// Embedder.newEmbedder (called by Index) reaches through. The real-embedder
+// Index path is otherwise only covered by the env-gated smoke test in
+// dedup.test.ts; this is the deterministic coverage of the facade.
+import { existsSync, mkdtempSync, mkdirSync, rmSync, writeFileSync } from "node:fs";
+import { tmpdir } from "node:os";
+import { dirname, join } from "node:path";
+import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
+import { Dimension } from "./embedder/constants.js";
+import { ErrIndexMissing } from "./indexdb/index.js";
+import { Analyze, Index } from "./dedup.js";
+// pipelineThrows steers the mocked model factory to fail, exercising Index's
+// "new embedder" failure path.
+const mockState = vi.hoisted(() => ({ // vi.hoisted: the value must already exist when the hoisted vi.mock factory runs
+    env: { cacheDir: "" }, // handed out as the mocked module's `env` export; cacheDir is reset in beforeEach
+    pipelineThrows: null, // null = the mocked pipeline() succeeds; otherwise this is the value it throws
+}));
+// l2row: a deterministic unit vector with its 1.0 at (i % dim). Orthogonal
+// across batch positions so distinct sections never collapse into a HIGH group;
+// L2-normalized so reshape's dimension guard passes and the analyzer's
+// dot-product-as-cosine identity holds.
+function l2row(i, dim) {
+    const v = new Array(dim).fill(0); // dim zeros
+    v[i % dim] = 1.0; // index wraps: rows i and i + dim are identical, so rows are orthogonal only while a batch has at most dim rows
+    return v;
+}
+vi.mock("@huggingface/transformers", () => { // factory for the fake module: exposes only `pipeline` and `env`
+    const env = mockState.env; // same object as mockState.env, so tests can reset it between runs
+    const pipeline = async (_task, _model) => { // stand-in model factory; the task and model arguments are ignored
+        if (mockState.pipelineThrows)
+            throw mockState.pipelineThrows; // failure injection: simulates a model that cannot be loaded
+        return async (texts, _opts) => { // the fake model: one l2row vector per input text, options ignored
+            const rows = texts.map((_, i) => l2row(i, Dimension)); // row i depends only on its position, not on the text
+            return {
+                data: new Float32Array(rows.flat()), // all rows concatenated in order (row-major)
+                dims: [texts.length, Dimension], // [number of texts, vector width]
+                tolist: () => rows, // nested-array form of the same rows
+            };
+        };
+    };
+    return { pipeline, env };
+});
+// --- temp dir plumbing -----------------------------------------------------
+const tmpDirs = []; // every temp dir created during a test; emptied and deleted in afterEach
+let savedModelCache; // DOCGOV_MODEL_CACHE as it was before the test (undefined if unset); restored in afterEach
+function newRepoRoot() { // creates a fresh, empty temp directory to act as a repo root and returns its path
+    const dir = mkdtempSync(join(tmpdir(), "dedup-index-")); // unique dir under the OS temp dir
+    tmpDirs.push(dir); // registered so afterEach removes it
+    return dir;
+}
+function writeFiles(repoRoot, files) { // files: object mapping a path relative to repoRoot -> file content
+    for (const [rel, content] of Object.entries(files)) {
+        const full = join(repoRoot, rel);
+        mkdirSync(dirname(full), { recursive: true }); // create missing parent directories first
+        writeFileSync(full, content); // overwrites any file already at that path
+    }
+}
+function dbPathOf(repoRoot) { // path at which these tests expect Index() to have persisted its database
+    return join(repoRoot, ".docgov", "dedup", "index.db");
+}
+const noProgress = () => { }; // no-op passed as Index()'s second argument (presumably its progress callback — confirm against dedup.js)
+beforeEach(() => {
+    mockState.env.cacheDir = ""; // start every test with a blank mock env
+    mockState.pipelineThrows = null; // default: the mocked model loads successfully
+    // Index() calls Embedder.newEmbedder() with no explicit cacheDir, which would
+    // resolve to the host ~/.cache. Point it at a throwaway temp dir so the test
+    // never writes to the real home cache.
+    savedModelCache = process.env.DOCGOV_MODEL_CACHE; // remembered so afterEach can put it back
+    const modelCache = mkdtempSync(join(tmpdir(), "dedup-index-model-"));
+    tmpDirs.push(modelCache); // registered so afterEach removes it
+    process.env.DOCGOV_MODEL_CACHE = modelCache;
+});
+afterEach(() => {
+    if (savedModelCache === undefined)
+        delete process.env.DOCGOV_MODEL_CACHE; // was unset before the test; assigning undefined would store the string "undefined"
+    else
+        process.env.DOCGOV_MODEL_CACHE = savedModelCache; // restore the caller's original value
+    for (const d of tmpDirs.splice(0)) // splice(0) empties the list and yields the removed entries
+        rmSync(d, { recursive: true, force: true }); // force: do not fail if the directory is already gone
+});
+describe("dedup.Index (mocked model)", () => { // success-path coverage of Index(), plus the Analyze read-back
+    // WHY: this is the linchpin — it runs the entire real Index() facade (walk +
+    // extract + embed via the real Embedder/Session + persist) with only the
+    // model mocked. A regression anywhere on that path (config, walk, embed
+    // wiring, DB write) would change the returned stats or leave no DB; both are
+    // asserted. Then Analyze must read that persisted state without throwing
+    // ErrIndexMissing, proving the index round-trips.
+    it("indexes a docs corpus, persists the DB, and Analyze reads it back", async () => {
+        const repoRoot = newRepoRoot();
+        const aIntro = "Alpha covers the ingestion side of the system, including the upload queue, the validation gateway, and the retry scheduler for failed batches.";
+        const bIntro = "Beta documents the reporting surface, covering scheduled exports, the metrics rollup job, and the long-term archival of historical aggregates.";
+        writeFiles(repoRoot, { // two files, one heading plus one paragraph each
+            "docs/a.md": `## Alpha Overview\n\n${aIntro}\n`,
+            "docs/b.md": `## Beta Overview\n\n${bIntro}\n`,
+        });
+        const stats = await Index(repoRoot, noProgress);
+        // Two eligible sections, both newly embedded this run, nothing pruned from
+        // a fresh DB.
+        expect(stats.sections).toBe(2);
+        expect(stats.embedded).toBe(2);
+        expect(stats.pruned).toBe(0);
+        // The facade persisted the index to its canonical location.
+        expect(existsSync(dbPathOf(repoRoot))).toBe(true);
+        // Analyze over the just-built index resolves (not ErrIndexMissing). The two
+        // sections are orthogonal, so there are no HIGH duplicate groups.
+        const report = await Analyze(repoRoot);
+        expect(report.HighGroups).toHaveLength(0);
+    });
+    // WHY: a second Index run over unchanged docs must re-embed nothing (the
+    // content hashes match the persisted rows) while still reporting the full
+    // live section count. This pins the incremental-embed wiring that keeps
+    // re-indexing cheap; a regression that re-embedded everything would silently
+    // make every run pay the full cost.
+    it("re-indexing unchanged docs embeds nothing the second time", async () => {
+        const repoRoot = newRepoRoot();
+        writeFiles(repoRoot, {
+            "docs/guide.md": "## Introduction\n\nThis is the introduction section of the guide and it covers the basic concepts and provides a thorough overview of the whole system.\n",
+        });
+        const first = await Index(repoRoot, noProgress);
+        expect(first.sections).toBe(1);
+        expect(first.embedded).toBe(1);
+        const second = await Index(repoRoot, noProgress); // same repo; nothing on disk was changed by the test in between
+        expect(second.sections).toBe(1);
+        expect(second.embedded).toBe(0);
+        expect(second.pruned).toBe(0);
+    });
+});
+describe("dedup.Index error wiring", () => { // each test pins the message prefix Index() puts on one failure source
+    // WHY: Index must fail BEFORE touching the model when config is invalid, and
+    // it must wrap the cause so the CLI reports "load config" rather than a bare
+    // validation message. A regression that loaded defaults on bad config would
+    // silently mask the user's broken config.yml.
+    it("wraps a config-load failure and never reaches the embedder", async () => {
+        const repoRoot = newRepoRoot();
+        writeFiles(repoRoot, {
+            // thresh_high out of (0,1) range → validate() throws inside Load.
+            ".docgov/dedup/config.yml": "analyzer:\n  thresh_high: 5\n",
+            "docs/a.md": "## A\n\nThis is a sufficiently long paragraph of prose used purely to make the section eligible.\n",
+        });
+        await expect(Index(repoRoot, noProgress)).rejects.toThrow(/dedup\.Index: load config/); // NOTE(review): only the error prefix is pinned; "never reaches the embedder" is not observed directly
+    });
+    // WHY: a model download/init failure must propagate as a wrapped "new
+    // embedder" error (not a raw transformers error and not a silent empty
+    // index), so the CLI can tell the user the model could not be loaded.
+    it("wraps an embedder-init failure", async () => {
+        const repoRoot = newRepoRoot();
+        writeFiles(repoRoot, {
+            "docs/a.md": "## A\n\nThis is a sufficiently long paragraph of prose used purely to make the section eligible.\n",
+        });
+        mockState.pipelineThrows = new Error("model fetch failed"); // thrown by the mocked pipeline() factory when it is called
+        await expect(Index(repoRoot, noProgress)).rejects.toThrow(/dedup\.Index: new embedder/);
+    });
+    // WHY: a corrupt DB file must surface as a wrapped "open db" error, not a
+    // confusing low-level SQLite message — this is the open-failure branch the
+    // facade exists to translate.
+    it("wraps an open-db failure on a corrupt index file", async () => {
+        const repoRoot = newRepoRoot();
+        writeFiles(repoRoot, {
+            ".docgov/dedup/index.db": "this is not a sqlite database", // plain text planted where the DB file is expected
+            "docs/a.md": "## A\n\nThis is a sufficiently long paragraph of prose used purely to make the section eligible.\n",
+        });
+        await expect(Index(repoRoot, noProgress)).rejects.toThrow(/dedup\.Index: open db/);
+    });
+});
+describe("dedup.Analyze error wiring", () => { // the read-path counterpart of the Index error tests
+    // WHY: a missing index must throw the matchable ErrIndexMissing sentinel so
+    // the CLI prompts "run index first" rather than reporting a generic failure.
+    it("throws ErrIndexMissing when no index DB exists", async () => {
+        const repoRoot = newRepoRoot(); // freshly created temp dir; the test writes nothing into it
+        await expect(Analyze(repoRoot)).rejects.toBeInstanceOf(ErrIndexMissing);
+    });
+    // WHY: with a DB present, Analyze must still validate config first and wrap a
+    // config-load failure — the same broken-config signal as Index, on the read
+    // path.
+    it("wraps a config-load failure when the index exists", async () => {
+        const repoRoot = newRepoRoot();
+        // Build a real index first so the DB exists, then corrupt the config.
+        writeFiles(repoRoot, {
+            "docs/a.md": "## A\n\nThis is a sufficiently long paragraph of prose used purely to make the section eligible.\n",
+        });
+        await Index(repoRoot, noProgress); // runs with no config.yml present yet
+        writeFiles(repoRoot, {
+            ".docgov/dedup/config.yml": "analyzer:\n  thresh_high: 5\n", // same out-of-range value the Index config test uses
+        });
+        await expect(Analyze(repoRoot)).rejects.toThrow(/dedup\.Analyze: load config/);
+    });
+    // WHY: a corrupt DB on the read path must surface as a wrapped "open db"
+    // error, not a raw SQLite message — the facade's open-failure translation on
+    // Analyze.
+    it("wraps an open-db failure on a corrupt index file", async () => {
+        const repoRoot = newRepoRoot();
+        writeFiles(repoRoot, {
+            ".docgov/dedup/index.db": "this is not a sqlite database", // plain text planted where the DB file is expected
+        });
+        await expect(Analyze(repoRoot)).rejects.toThrow(/dedup\.Analyze: open db/);
+    });
+});