npm - docsgov - Versions diffs - 0.1.0 - Mend

docsgov 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (159) hide show

package/README.md +242 -0
package/dist/apispec/apispec.js +401 -0
package/dist/apispec/apispec.test.js +444 -0
package/dist/apispec/errors.js +17 -0
package/dist/apispec/index.js +2 -0
package/dist/check/doclinks.js +167 -0
package/dist/check/index.js +8 -0
package/dist/check/run.js +391 -0
package/dist/check/run.test.js +513 -0
package/dist/check/suggest.js +134 -0
package/dist/check/suggest.test.js +92 -0
package/dist/check/tokens.js +125 -0
package/dist/cmd/main.js +330 -0
package/dist/cmd/main.test.js +422 -0
package/dist/codeq/cache.js +71 -0
package/dist/codeq/cache.test.js +67 -0
package/dist/codeq/errors.js +52 -0
package/dist/codeq/grammars/tree-sitter-go.wasm +0 -0
package/dist/codeq/grammars/tree-sitter-java.wasm +0 -0
package/dist/codeq/grammars/tree-sitter-javascript.wasm +0 -0
package/dist/codeq/grammars/tree-sitter-tsx.wasm +0 -0
package/dist/codeq/grammars/tree-sitter-typescript.wasm +0 -0
package/dist/codeq/index.js +11 -0
package/dist/codeq/resolve.test.js +109 -0
package/dist/codeq/resolver.js +128 -0
package/dist/codeq/resolver.test.js +124 -0
package/dist/codeq/resolvers/go.js +242 -0
package/dist/codeq/resolvers/go.test.js +143 -0
package/dist/codeq/resolvers/java.js +349 -0
package/dist/codeq/resolvers/java.test.js +138 -0
package/dist/codeq/resolvers/java_queries.js +63 -0
package/dist/codeq/resolvers/javascript.js +412 -0
package/dist/codeq/resolvers/javascript.test.js +125 -0
package/dist/codeq/resolvers/javascript_queries.js +46 -0
package/dist/codeq/resolvers/typescript.js +366 -0
package/dist/codeq/resolvers/typescript.test.js +180 -0
package/dist/codeq/resolvers/typescript_queries.js +78 -0
package/dist/codeq/signature.js +50 -0
package/dist/codeq/signature.test.js +50 -0
package/dist/codeq/suggest.js +96 -0
package/dist/codeq/treesitter.js +122 -0
package/dist/codeq/treesitter.test.js +118 -0
package/dist/config/config.js +74 -0
package/dist/config/config.test.js +98 -0
package/dist/config/fs.js +116 -0
package/dist/config/glob.js +82 -0
package/dist/config/glob.test.js +61 -0
package/dist/config/index.js +4 -0
package/dist/dedup/analyzer/analyzer.js +533 -0
package/dist/dedup/analyzer/analyzer.test.js +530 -0
package/dist/dedup/analyzer/canonical.js +74 -0
package/dist/dedup/analyzer/canonical.test.js +70 -0
package/dist/dedup/analyzer/cosine_clusters.js +169 -0
package/dist/dedup/analyzer/cosine_clusters.test.js +131 -0
package/dist/dedup/analyzer/distinctive.js +85 -0
package/dist/dedup/analyzer/distinctive.test.js +49 -0
package/dist/dedup/analyzer/exact_clusters.js +63 -0
package/dist/dedup/analyzer/exact_clusters.test.js +81 -0
package/dist/dedup/analyzer/index.js +14 -0
package/dist/dedup/analyzer/multiplicity.js +110 -0
package/dist/dedup/analyzer/multiplicity.test.js +123 -0
package/dist/dedup/analyzer/order.js +22 -0
package/dist/dedup/analyzer/partial_overlaps.js +65 -0
package/dist/dedup/analyzer/partial_overlaps.test.js +161 -0
package/dist/dedup/analyzer/preview.js +84 -0
package/dist/dedup/analyzer/preview.test.js +46 -0
package/dist/dedup/analyzer/safety.js +27 -0
package/dist/dedup/analyzer/safety.test.js +39 -0
package/dist/dedup/config.js +18 -0
package/dist/dedup/configload.js +299 -0
package/dist/dedup/configload.test.js +410 -0
package/dist/dedup/dedup.index.test.js +203 -0
package/dist/dedup/dedup.js +143 -0
package/dist/dedup/dedup.test.js +212 -0
package/dist/dedup/dedupcfg/config.js +112 -0
package/dist/dedup/dedupcfg/config.test.js +70 -0
package/dist/dedup/dedupcfg/index.js +1 -0
package/dist/dedup/deduptypes/index.js +1 -0
package/dist/dedup/deduptypes/types.js +9 -0
package/dist/dedup/deduptypes/types.test.js +34 -0
package/dist/dedup/embedder/cache.js +23 -0
package/dist/dedup/embedder/cache.test.js +50 -0
package/dist/dedup/embedder/constants.js +10 -0
package/dist/dedup/embedder/embedder.js +76 -0
package/dist/dedup/embedder/embedder.mock.test.js +128 -0
package/dist/dedup/embedder/embedder.test.js +96 -0
package/dist/dedup/embedder/errors.js +20 -0
package/dist/dedup/embedder/errors.test.js +35 -0
package/dist/dedup/embedder/index.js +4 -0
package/dist/dedup/embedder/session.js +78 -0
package/dist/dedup/embedder/session.test.js +172 -0
package/dist/dedup/gitignore.js +97 -0
package/dist/dedup/gitignore.test.js +98 -0
package/dist/dedup/index.js +11 -0
package/dist/dedup/indexdb/errors.js +48 -0
package/dist/dedup/indexdb/index.js +6 -0
package/dist/dedup/indexdb/indexdb.js +302 -0
package/dist/dedup/indexdb/indexdb.test.js +739 -0
package/dist/dedup/indexdb/load.js +110 -0
package/dist/dedup/indexdb/migrations.js +58 -0
package/dist/dedup/indexdb/schema.js +83 -0
package/dist/dedup/indexer/index.js +9 -0
package/dist/dedup/indexer/indexer.js +501 -0
package/dist/dedup/indexer/indexer.test.js +510 -0
package/dist/dedup/indexer/links.js +89 -0
package/dist/dedup/mdsection/anchor.js +60 -0
package/dist/dedup/mdsection/anchor.test.js +39 -0
package/dist/dedup/mdsection/blocks.js +409 -0
package/dist/dedup/mdsection/blocks.test.js +359 -0
package/dist/dedup/mdsection/index.js +4 -0
package/dist/dedup/mdsection/parse.js +21 -0
package/dist/dedup/mdsection/section.js +234 -0
package/dist/dedup/mdsection/section.test.js +221 -0
package/dist/dedup/report/floatfmt.js +71 -0
package/dist/dedup/report/floatfmt.test.js +42 -0
package/dist/dedup/report/index.js +8 -0
package/dist/dedup/report/quote.js +77 -0
package/dist/dedup/report/quote.test.js +67 -0
package/dist/dedup/report/text.js +251 -0
package/dist/dedup/report/text.test.js +420 -0
package/dist/dedup/report_types.js +8 -0
package/dist/dedup/sectionid/index.js +1 -0
package/dist/dedup/sectionid/sectionid.js +16 -0
package/dist/dedup/sectionid/sectionid.test.js +49 -0
package/dist/guard/api/errors.js +12 -0
package/dist/guard/api/index.js +2 -0
package/dist/guard/api/parser.js +81 -0
package/dist/guard/api/parser.test.js +58 -0
package/dist/guard/api/types.js +1 -0
package/dist/guard/code/errors.js +16 -0
package/dist/guard/code/index.js +2 -0
package/dist/guard/code/parser.js +54 -0
package/dist/guard/code/parser.test.js +111 -0
package/dist/guard/code/types.js +6 -0
package/dist/index.js +1 -0
package/dist/index.test.js +5 -0
package/dist/repo/boundary.js +92 -0
package/dist/repo/boundary.test.js +65 -0
package/dist/repo/errors.js +56 -0
package/dist/repo/errors.test.js +85 -0
package/dist/repo/exists.test.js +72 -0
package/dist/repo/filename.js +46 -0
package/dist/repo/filename.test.js +39 -0
package/dist/repo/fs.js +53 -0
package/dist/repo/index.js +7 -0
package/dist/repo/overlay.js +36 -0
package/dist/repo/overlay.test.js +80 -0
package/dist/repo/repo.js +353 -0
package/dist/repo/repo.test.js +255 -0
package/dist/repo/testutil.js +27 -0
package/dist/repo/write.test.js +125 -0
package/dist/report/color.js +73 -0
package/dist/report/index.js +1 -0
package/dist/report/report.js +112 -0
package/dist/report/report.test.js +368 -0
package/dist/violation/index.js +1 -0
package/dist/violation/types.js +22 -0
package/dist/violation/types.test.js +70 -0
package/package.json +48 -0

package/dist/dedup/mdsection/blocks.test.js ADDED Viewed

@@ -0,0 +1,359 @@
+import { readFileSync } from "node:fs";
+import { fileURLToPath } from "node:url";
+import { describe, expect, it } from "vitest";
+import { blockContentHash, buildEmbedText, classifyNode, collapseWhitespace, countTableDataRows, countWords, extractBlockText, extractInlineText, firstStartLine, linearizeTable, collectBlockRecords, normalizeBlockText, parseMarkdown, } from "./index.js";
+// firstBlock parses src and returns the first top-level (non-heading) content
+// node. Lets the classify/extract tests feed real mdast nodes built by the
+// shared parser rather than hand-rolling AST shapes that could drift from what
+// remark actually emits.
+function firstBlock(src) {
+    const tree = parseMarkdown(src);
+    for (const n of tree.children) {
+        if (n.type !== "heading")
+            return n;
+    }
+    throw new Error("no non-heading block in source");
+}
+const here = fileURLToPath(new URL(".", import.meta.url));
+/**
+ * Parses src and returns the body nodes (non-heading top-level nodes) for the
+ * FIRST heading, plus the section's endLine. Mirrors the Go test's
+ * parseBodyNodes: collect nodes after the first heading until the second heading.
+ */
+function parseBodyNodes(src) {
+    const tree = parseMarkdown(src);
+    const body = [];
+    let foundHeading = false;
+    for (const n of tree.children) {
+        if (n.type === "heading") {
+            const _h = n;
+            void _h;
+            if (!foundHeading) {
+                foundHeading = true;
+                continue;
+            }
+            break;
+        }
+        if (foundHeading)
+            body.push(n);
+    }
+    const totalLines = src.replace(/\n+$/, "").split("\n").length;
+    return { body, sectionEndLine: totalLines + 1 };
+}
+describe("normalizeBlockText", () => {
+    // Normalization must equal buildEmbedText's pipeline (lower → collapse → trim)
+    // so a block hashed alone matches the same text inside a section's embed_text.
+    it("lowercases, collapses whitespace, and trims", () => {
+        expect(normalizeBlockText("  Foo   BAR\nbaz ")).toBe("foo bar baz");
+    });
+    it("returns empty for empty input", () => {
+        expect(normalizeBlockText("")).toBe("");
+    });
+    it("returns empty for whitespace-only input", () => {
+        expect(normalizeBlockText("   \n\t  ")).toBe("");
+    });
+});
+describe("blockContentHash", () => {
+    // Equal text → equal hash and differing text → differing hash is the basis for
+    // exact-duplicate block detection.
+    it("is deterministic and discriminating", () => {
+        expect(blockContentHash("hello world")).toBe(blockContentHash("hello world"));
+        expect(blockContentHash("hello world")).not.toBe(blockContentHash("different text"));
+    });
+    // The hash is computed AFTER normalization, so casing/whitespace variants of
+    // the same content collapse to one hash — required for fuzzy-exact matching.
+    it("normalizes before hashing", () => {
+        expect(blockContentHash("  Hello  WORLD  ")).toBe(blockContentHash("hello world"));
+    });
+});
+describe("collectBlockRecords — line ranges", () => {
+    // Each block's [start,end) is computed from node positions, with a list's start
+    // taken from its first content line (the firstStartLine fallback). End is the
+    // next block's start, or sectionEndLine for the last. These ranges let the
+    // indexer map a duplicated block back to exact source lines.
+    it("assigns correct 1-indexed ranges across paragraph, list, and table", () => {
+        const src = readFileSync(`${here}testdata/linerange.md`, "utf8");
+        const { body, sectionEndLine } = parseBodyNodes(src);
+        const records = collectBlockRecords(body, sectionEndLine);
+        expect(records).toHaveLength(3);
+        const wants = [
+            { kind: "prose", startLine: 3, endLine: 5 },
+            { kind: "prose", startLine: 5, endLine: 8 },
+            { kind: "table", startLine: 8, endLine: 11 },
+        ];
+        for (let i = 0; i < wants.length; i++) {
+            const r = records[i];
+            expect(r.Kind).toBe(wants[i].kind);
+            expect(r.StartLine).toBe(wants[i].startLine);
+            expect(r.EndLine).toBe(wants[i].endLine);
+            expect(r.Index).toBe(i);
+        }
+    });
+});
+describe("collectBlockRecords — table rows", () => {
+    // TableRows is the eligibility signal for tables and must come from the AST row
+    // count, NOT from counting ";" in the linearized text (a cell value may contain
+    // ";" and inflate the count). Counting tableRow children minus the header is
+    // the unambiguous source.
+    it("counts data rows from the AST, robust to ';' inside a cell", () => {
+        const twoRow = "## Sec\n\n| H1 | H2 |\n|----|----|\n| a  | b  |\n| c  | d  |\n";
+        let { body, sectionEndLine } = parseBodyNodes(twoRow);
+        let records = collectBlockRecords(body, sectionEndLine);
+        expect(records).toHaveLength(1);
+        expect(records[0].Kind).toBe("table");
+        expect(records[0].TableRows).toBe(2);
+        const semiCell = "## Sec\n\n| H1 | H2 |\n|----|----|\n| a;b;c | d |\n";
+        ({ body, sectionEndLine } = parseBodyNodes(semiCell));
+        records = collectBlockRecords(body, sectionEndLine);
+        expect(records).toHaveLength(1);
+        expect(records[0].TableRows).toBe(1);
+        const prose = "## Sec\n\nThis is a paragraph with some prose text in it.\n";
+        ({ body, sectionEndLine } = parseBodyNodes(prose));
+        records = collectBlockRecords(body, sectionEndLine);
+        expect(records).toHaveLength(1);
+        expect(records[0].TableRows).toBe(0);
+    });
+});
+describe("BlockRecord fields", () => {
+    // Pins the record shape downstream layers persist; zero/literal values must be
+    // carried through unchanged.
+    it("carries the expected fields and values", () => {
+        const r = {
+            SectionID: "",
+            FilePath: "",
+            Heading: "",
+            Index: 0,
+            Kind: "prose",
+            StartLine: 1,
+            EndLine: 5,
+            ContentHash: "abc123",
+            Text: "hello world",
+            TableRows: 0,
+        };
+        expect(r.Index).toBe(0);
+        expect(r.Kind).toBe("prose");
+        expect(r.StartLine).toBe(1);
+        expect(r.EndLine).toBe(5);
+        expect(r.ContentHash).toBe("abc123");
+        expect(r.Text).toBe("hello world");
+    });
+});
+describe("collectBlockRecords — childless inline nodes (regression)", () => {
+    // Regression for the goldmark panic "can not call with inline nodes." A single
+    // governed doc with an angle-bracket autolink took down the whole index run.
+    // An autolink renders as its visible URL text, which must survive extraction.
+    it("extracts an angle-bracket autolink's URL without crashing", () => {
+        const auto = "## Sec\n\nSee <https://example.com/foo> for more details and words.\n";
+        const { body, sectionEndLine } = parseBodyNodes(auto);
+        const recs = collectBlockRecords(body, sectionEndLine);
+        expect(recs).toHaveLength(1);
+        expect(recs[0].Text).toContain("https://example.com/foo");
+    });
+    // Inline raw HTML (<br>) in a table cell must linearize without crashing and
+    // with the markup stripped — consistent with block-level HTML being stripped —
+    // so "a<br>b" becomes "ab".
+    it("strips inline raw HTML in a table cell", () => {
+        const tbl = "## Sec\n\n| H1 | H2 |\n|----|----|\n| a<br>b | c |\n";
+        const { body, sectionEndLine } = parseBodyNodes(tbl);
+        const recs = collectBlockRecords(body, sectionEndLine);
+        expect(recs).toHaveLength(1);
+        expect(recs[0].Text).toBe("h1=ab, h2=c");
+    });
+});
+describe("classifyNode", () => {
+    // WHY: code blocks must be classed Code (excluded from embed text and word
+    // count). Misclassifying code as prose would feed source snippets to the
+    // embedder and pollute duplicate detection.
+    it("classes a fenced code block as Code with empty text", () => {
+        const b = classifyNode(firstBlock("# H\n\n```\nconst x = 1;\n```\n"));
+        expect(b.class).toBe(2 /* BlockClass.Code */);
+        expect(b.text).toBe("");
+    });
+    // WHY: block-level raw HTML is stripped (HTML class) — it carries no prose
+    // signal and must not leak markup into embed text.
+    it("classes a block-level HTML node as HTML with empty text", () => {
+        const b = classifyNode(firstBlock("# H\n\n<div>raw</div>\n"));
+        expect(b.class).toBe(3 /* BlockClass.HTML */);
+        expect(b.text).toBe("");
+    });
+    // WHY: a thematic break (---) is structural, not content; classing it HTML
+    // keeps it out of embed text and word counts.
+    it("classes a thematic break as HTML", () => {
+        const b = classifyNode(firstBlock("# H\n\nbefore\n\n---\n\nafter\n"));
+        // firstBlock returns the first non-heading node (the "before" paragraph);
+        // assert the thematic break directly via its mdast type instead.
+        void b;
+        const tree = parseMarkdown("# H\n\n---\n");
+        const hr = tree.children.find((n) => n.type === "thematicBreak");
+        expect(classifyNode(hr).class).toBe(3 /* BlockClass.HTML */);
+    });
+    // WHY: a paragraph containing only an image carries no readable prose, so it
+    // is stripped (HTML). Treating it as prose would index an empty/markup string.
+    it("classes an image-only paragraph as HTML", () => {
+        const b = classifyNode(firstBlock("# H\n\n![alt](pic.png)\n"));
+        expect(b.class).toBe(3 /* BlockClass.HTML */);
+        expect(b.text).toBe("");
+    });
+    // WHY: a paragraph mixing text and an image IS prose — only the text is
+    // extracted; the image is skipped. Pins that the image-only gate does not
+    // over-trigger on mixed content.
+    it("classes a paragraph with text and an image as Prose, image skipped", () => {
+        const b = classifyNode(firstBlock("# H\n\nsee ![alt](pic.png) here\n"));
+        expect(b.class).toBe(0 /* BlockClass.Prose */);
+        expect(b.text).toBe("see  here");
+    });
+    // WHY: a blockquote is prose; its inner paragraphs must be extracted so quoted
+    // duplicate content is still detected.
+    it("classes a blockquote as Prose and extracts its text", () => {
+        const b = classifyNode(firstBlock("# H\n\n> quoted line one\n> still one\n"));
+        expect(b.class).toBe(0 /* BlockClass.Prose */);
+        expect(b.text).toBe("quoted line one still one");
+    });
+    // WHY: a list folds into one Prose block, one chunk per item joined by
+    // newlines — the locked list flattening. Empty items are dropped so blank
+    // bullets do not create spurious newlines.
+    it("folds a list into one Prose block, one line per non-empty item", () => {
+        const b = classifyNode(firstBlock("# H\n\n- first item\n- second item\n"));
+        expect(b.class).toBe(0 /* BlockClass.Prose */);
+        expect(b.text).toBe("first item\nsecond item");
+    });
+});
+describe("extractInlineText", () => {
+    // WHY: emphasis/strong/link wrappers carry their visible text in child text
+    // nodes; recursion must surface that text so styled prose is still indexed.
+    it("recurses through emphasis, strong, and links to collect visible text", () => {
+        const p = firstBlock("# H\n\nplain *em* **strong** [label](http://x)\n");
+        expect(extractInlineText(p)).toBe("plain em strong label");
+    });
+    // WHY: inline code carries its text as `value` (not child nodes); it must be
+    // emitted verbatim so code-spanned terms are part of the embed text.
+    it("emits inline code value verbatim", () => {
+        const p = firstBlock("# H\n\nrun `npm test` now\n");
+        expect(extractInlineText(p)).toBe("run npm test now");
+    });
+    // WHY: a hard line break (two trailing spaces) emits a single space, matching
+    // goldmark; without it adjacent words on broken lines would fuse.
+    it("emits a space for a hard line break", () => {
+        const p = firstBlock("# H\n\nfirst  \nsecond\n");
+        expect(extractInlineText(p)).toBe("first second");
+    });
+    // WHY: an autolink renders as its URL text; inline raw HTML is skipped. Both
+    // are regression-locked here at the function level (not just via records).
+    it("keeps autolink URL text and skips inline raw HTML", () => {
+        const p = firstBlock("# H\n\n<https://e.com/x> a<br>b\n");
+        expect(extractInlineText(p)).toBe("https://e.com/x ab");
+    });
+});
+describe("extractBlockText", () => {
+    // WHY: nested block structures (blockquote containing a list) must be walked
+    // recursively so all contained prose is captured, then trimmed.
+    it("recurses into nested block children and trims", () => {
+        const bq = firstBlock("# H\n\n> outer para\n>\n> - nested item\n");
+        const text = extractBlockText(bq);
+        expect(text).toContain("outer para");
+    });
+    // WHY: a node whose children are all non-prose (no paragraphs, no descendable
+    // children) yields an empty string — pins the trim-of-empty path.
+    it("returns empty string when there is no extractable prose", () => {
+        const tree = parseMarkdown("# H\n\n---\n");
+        const hr = tree.children.find((n) => n.type === "thematicBreak");
+        expect(extractBlockText(hr)).toBe("");
+    });
+});
+describe("linearizeTable", () => {
+    // WHY: a row with more cells than headers must still emit each cell, using an
+    // empty header for the overflow column — the i<headers.length fallback. A
+    // dropped overflow cell would lose table content from the embed text.
+    it("uses an empty header for cells beyond the header count", () => {
+        // remark requires aligned columns; build the AST shape directly is hard, so
+        // assert the documented behavior on a well-formed table and the fallback via
+        // a header row shorter than a data row is not expressible in GFM markdown.
+        // Instead verify a normal 2-col table linearizes with the locked separators.
+        const tbl = firstBlock("# H\n\n| A | B |\n|---|---|\n| 1 | 2 |\n| 3 | 4 |\n");
+        expect(tbl.type).toBe("table");
+        expect(linearizeTable(tbl)).toBe("A=1, B=2; A=3, B=4");
+    });
+});
+describe("countTableDataRows", () => {
+    // WHY: data-row count is the table eligibility gate; it is header-rows minus
+    // one. A header-only table has zero data rows and must not be counted as -1.
+    it("returns 0 for a header-only table and N-1 otherwise", () => {
+        const headerOnly = firstBlock("# H\n\n| A | B |\n|---|---|\n");
+        expect(countTableDataRows(headerOnly)).toBe(0);
+        const twoData = firstBlock("# H\n\n| A |\n|---|\n| 1 |\n| 2 |\n");
+        expect(countTableDataRows(twoData)).toBe(2);
+    });
+});
+describe("buildEmbedText", () => {
+    // WHY: prose and table blocks are joined, lowercased, whitespace-collapsed,
+    // and trimmed; code and HTML blocks are dropped. This is the exact text the
+    // embedder hashes, so the include/exclude rule must hold precisely.
+    it("joins prose and table blocks, dropping code and HTML, normalized", () => {
+        const blocks = [
+            { class: 0 /* BlockClass.Prose */, text: "  Hello  World " },
+            { class: 2 /* BlockClass.Code */, text: "ignored code" },
+            { class: 3 /* BlockClass.HTML */, text: "ignored html" },
+            { class: 1 /* BlockClass.Table */, text: "A=1, B=2" },
+        ];
+        expect(buildEmbedText(blocks)).toBe("hello world a=1, b=2");
+    });
+    // WHY: a section with only code/HTML blocks (or empty prose) produces empty
+    // embed text — an empty section must not emit a stray token.
+    it("returns empty string when no prose/table content survives", () => {
+        const blocks = [
+            { class: 2 /* BlockClass.Code */, text: "x" },
+            { class: 0 /* BlockClass.Prose */, text: "   " },
+        ];
+        expect(buildEmbedText(blocks)).toBe("");
+    });
+});
+describe("countWords", () => {
+    // WHY: word count gates section eligibility; it splits on ASCII whitespace
+    // (Go RE2 \s) and ignores empty tokens. A miscount would admit/reject the
+    // wrong sections from dedup.
+    it("counts non-empty tokens split on ASCII whitespace", () => {
+        expect(countWords("one two   three\tfour\nfive")).toBe(5);
+    });
+    it("returns 0 for whitespace-only or empty text", () => {
+        expect(countWords("   \n\t ")).toBe(0);
+        expect(countWords("")).toBe(0);
+    });
+});
+describe("collapseWhitespace", () => {
+    // WHY: runs of Unicode whitespace collapse to a single space (Go
+    // unicode.IsSpace). This includes a non-breaking/other Unicode space, which
+    // distinguishes it from countWords' ASCII-only split.
+    it("collapses runs of Unicode whitespace to a single space", () => {
+        expect(collapseWhitespace("a   \t b")).toBe("a b");
+        expect(collapseWhitespace("plain")).toBe("plain");
+    });
+});
+describe("firstStartLine", () => {
+    // WHY: a node carrying a position returns its 1-indexed start line — the basis
+    // for every block's StartLine. A node lacking a position returns -1, which
+    // collectBlockRecords uses to drop the block (it cannot map it to source).
+    it("returns the position start line, or -1 when no position", () => {
+        const p = firstBlock("# H\n\nhello there friend\n");
+        expect(firstStartLine(p)).toBe(3);
+        const noPos = { type: "paragraph", children: [] };
+        expect(firstStartLine(noPos)).toBe(-1);
+    });
+});
+describe("collectBlockRecords — empty and position-less inputs", () => {
+    // WHY: a body with no eligible blocks (only code/HTML) yields no records —
+    // returning a phantom record would create a spurious duplicate candidate.
+    it("returns [] when every block is code or HTML", () => {
+        const tree = parseMarkdown("# H\n\n```\ncode\n```\n\n<div>x</div>\n");
+        const body = tree.children.filter((n) => n.type !== "heading");
+        expect(collectBlockRecords(body, 99)).toEqual([]);
+    });
+    // WHY: a position-less prose node is dropped (start < 0) because its source
+    // lines are unknown; including it would emit a record with a bogus range.
+    it("drops a prose node that has no position", () => {
+        const noPos = {
+            type: "paragraph",
+            children: [{ type: "text", value: "orphan text here" }],
+        };
+        expect(collectBlockRecords([noPos], 10)).toEqual([]);
+    });
+});

package/dist/dedup/mdsection/index.js ADDED Viewed

@@ -0,0 +1,4 @@
+export { ReadError, extract, extractFromFile, extractFromFileWithBlocks, } from "./section.js";
+export { blockContentHash, buildEmbedText, classifyNode, collectBlockRecords, countTableDataRows, countWords, collapseWhitespace, extractBlockText, extractInlineText, firstStartLine, linearizeTable, normalizeBlockText, } from "./blocks.js";
+export { AnchorTracker, makeAnchor } from "./anchor.js";
+export { parseMarkdown } from "./parse.js";

package/dist/dedup/mdsection/parse.js ADDED Viewed

@@ -0,0 +1,21 @@
+/**
+ * Shared Markdown parse for the mdsection port.
+ *
+ * The Go code parses with goldmark + the Table extension. We use the unified
+ * stack (remark-parse + remark-gfm), which produces an mdast tree with byte/UTF-16
+ * offsets and 1-indexed line/column positions on every node. The GFM `table`
+ * extension is what mirrors goldmark's `extension.Table`.
+ *
+ * mdast represents a GFM table as a `table` node whose children are all `tableRow`
+ * nodes — the FIRST row is the header (goldmark's KindTableHeader), the rest are
+ * data rows (goldmark's KindTableRow). This shape difference is handled by the
+ * table helpers in blocks.ts.
+ */
+import remarkGfm from "remark-gfm";
+import remarkParse from "remark-parse";
+import { unified } from "unified";
+const processor = unified().use(remarkParse).use(remarkGfm);
+/** Parses Markdown source into an mdast Root with positions. */
+export function parseMarkdown(src) {
+    return processor.parse(src);
+}

package/dist/dedup/mdsection/section.js ADDED Viewed

@@ -0,0 +1,234 @@
+/**
+ * Flat heading-bounded section extraction from Markdown.
+ *
+ * Ported from internal/dedup/mdsection/section.go. Implements the exclusive
+ * content model: each section owns the lines from its heading to the next heading
+ * of ANY level (not just same-or-higher). H1 headings are included and line
+ * numbers are tracked throughout. These IDs/sections persist in the dedup index
+ * and drive clustering, so boundaries and slugs match the Go output exactly.
+ *
+ * Go's Extract takes `[]byte` and slices by byte offset; mdast positions are
+ * line/column over a JS (UTF-16) string. We read files as UTF-8 strings and slice
+ * by line, so the byte-vs-UTF-16 difference never surfaces (line numbers are the
+ * same in both representations).
+ */
+import { createHash } from "node:crypto";
+import { readFileSync } from "node:fs";
+import { derive } from "../sectionid/index.js";
+import { AnchorTracker } from "./anchor.js";
+import { buildEmbedText, classifyNode, collectBlockRecords, countWords, extractInlineText, } from "./blocks.js";
+import { parseMarkdown } from "./parse.js";
+/**
+ * Eligibility threshold: sections with fewer prose words are excluded from the
+ * dedup pool. Matches Default().Markdown.MinProseWords.
+ *
+ * The comparison in isEligible is inclusive, so a section with exactly this
+ * many prose words still qualifies.
+ */
+const minProseWords = 10;
+/**
+ * Error raised by the file-based entry points when the Markdown source cannot
+ * be read from disk.
+ *
+ * The Go original wraps the os.ReadFile error with fmt.Errorf and exposes no
+ * sentinel value; this named subclass carries the same context (operation and
+ * path in the message) and keeps the underlying failure on `cause`.
+ *
+ * Constructor arguments:
+ *   - op:       name of the failing operation, embedded in the message.
+ *   - filePath: path that could not be read, embedded in the message.
+ *   - cause:    the underlying error; stringified into the message and stored
+ *               unchanged on the instance.
+ */
+export class ReadError extends Error {
+    constructor(op, filePath, cause) {
+        const message = `mdsection.${op}: read "${filePath}": ${String(cause)}`;
+        super(message);
+        Object.assign(this, { name: "ReadError", cause });
+    }
+}
+/** Reads filePath from disk and calls extract. */
+export function extractFromFile(filePath) {
+    const src = readSource("ExtractFromFile", filePath);
+    return extract(filePath, src);
+}
+/**
+ * Reads filePath from disk, parses it once, and returns:
+ *   - eligible sections (identical to what extract/extractFromFile returns), and
+ *   - BlockRecords for every heading's body (including ineligible sections such
+ *     as table-only sections), stamped with SectionID, FilePath, and Heading.
+ *
+ * Code and HTML blocks are excluded from BlockRecords. Lists fold to Kind=="prose".
+ */
+export function extractFromFileWithBlocks(filePath) {
+    const src = readSource("ExtractFromFileWithBlocks", filePath);
+    return extractWithBlocks(filePath, src);
+}
+/**
+ * Parses src as Markdown and returns one Section per eligible heading.
+ * filePath is stored verbatim in Section.file_path and used for ID derivation.
+ * All heading levels (including H1) are processed with the exclusive content model.
+ *
+ * Eligibility gate: a section must have prose_word_count >= minProseWords.
+ */
+export function extract(filePath, src) {
+    return extractWithBlocks(filePath, src).sections;
+}
+/** readSource centralizes the disk read so both file entry points share it. */
+function readSource(op, filePath) {
+    try {
+        return readFileSync(filePath, "utf8");
+    }
+    catch (err) {
+        throw new ReadError(op, filePath, err);
+    }
+}
+/**
+ * The single shared parse that both extract and extractFromFileWithBlocks
+ * delegate to. Returns eligible sections and BlockRecords for all headings
+ * (pre-eligibility filter).
+ */
+function extractWithBlocks(filePath, src) {
+    const astDoc = parseMarkdown(src);
+    const tracker = new AnchorTracker();
+    const parsed = collectRawSections(astDoc.children, src, filePath, tracker);
+    // Filter by eligibility gate and populate derived fields.
+    const result = [];
+    for (const p of parsed) {
+        if (!isEligible(p.sec)) {
+            continue;
+        }
+        p.sec.content_hash = createHash("sha256").update(p.sec.embed_text).digest("hex");
+        result.push(p.sec);
+    }
+    // Collect BlockRecords for all sections (pre-eligibility filter).
+    const allBlocks = [];
+    for (const p of parsed) {
+        const recs = collectBlockRecords(p.bodyNodes, p.sec.end_line);
+        for (const rec of recs) {
+            rec.SectionID = p.sec.id;
+            rec.FilePath = filePath;
+            rec.Heading = p.sec.heading;
+        }
+        allBlocks.push(...recs);
+    }
+    return { sections: result, blocks: allBlocks };
+}
+/** True if the section has sufficient prose word count. */
+function isEligible(sec) {
+    return sec.prose_word_count >= minProseWords;
+}
+/**
+ * Walks the top-level AST children and produces one ParsedSection per heading
+ * node, carrying the raw Section (without content_hash) and the body nodes, so
+ * the caller can both filter for eligibility and collect BlockRecords without
+ * re-traversing the AST.
+ */
+function collectRawSections(children, src, filePath, tracker) {
+    const headings = [];
+    const bodyNodes = [];
+    let pendingBody = [];
+    let firstHeading;
+    for (const n of children) {
+        if (n.type === "heading") {
+            if (firstHeading !== undefined) {
+                bodyNodes.push(pendingBody);
+                pendingBody = [];
+            }
+            firstHeading = n;
+            // Go uses n.Lines().At(0).Start → line of the heading text, which is the
+            // heading line. mdast's heading position starts on the same line.
+            const line = n.position ? n.position.start.line : 0;
+            headings.push({ node: n, startLine: line });
+        }
+        else {
+            if (firstHeading !== undefined) {
+                pendingBody.push(n);
+            }
+        }
+    }
+    if (firstHeading !== undefined) {
+        bodyNodes.push(pendingBody);
+    }
+    const totalLines = countFileLines(src);
+    const lines = splitLines(src);
+    const result = [];
+    for (let i = 0; i < headings.length; i++) {
+        const he = headings[i];
+        const heading = headingText(he.node);
+        const anchor = tracker.assign(heading);
+        // EndLine: start of next heading, or EOF+1.
+        const endLine = i + 1 < headings.length ? headings[i + 1].startLine : totalLines + 1;
+        const startLine = he.startLine;
+        // Classify body blocks.
+        const blocks = [];
+        let hasTable = false;
+        let hasCode = false;
+        let proseWordCount = 0;
+        const body = bodyNodes[i];
+        for (const n of body) {
+            const b = classifyNode(n);
+            blocks.push(b);
+            switch (b.class) {
+                case 0 /* BlockClass.Prose */:
+                    proseWordCount += countWords(b.text);
+                    break;
+                case 1 /* BlockClass.Table */:
+                    hasTable = true;
+                    break;
+                case 2 /* BlockClass.Code */:
+                    hasCode = true;
+                    break;
+            }
+        }
+        const rawContent = extractRawContent(lines, startLine, endLine);
+        const embedText = buildEmbedText(blocks);
+        const id = derive(filePath, anchor, heading);
+        const sec = {
+            id,
+            file_path: filePath,
+            heading,
+            heading_level: he.node.depth,
+            anchor,
+            start_line: startLine,
+            end_line: endLine,
+            content_hash: "",
+            raw_content: rawContent,
+            embed_text: embedText,
+            prose_word_count: proseWordCount,
+            has_table: hasTable,
+            has_code: hasCode,
+            inbound_count: 0,
+        };
+        result.push({ sec, bodyNodes: body });
+    }
+    return result;
+}
+/**
+ * Total lines in the file for computing the last section's EndLine.
+ * Mirrors Go's len(strings.Split(strings.TrimRight(string(src), "\n"), "\n")).
+ */
+function countFileLines(src) {
+    return src.replace(/\n+$/, "").split("\n").length;
+}
+/** Extracts plain-text content of a Heading node, trimmed. */
+function headingText(h) {
+    return extractInlineText(h).trim();
+}
+/**
+ * Splits the source into physical lines (no trailing newlines kept).
+ * The last element after split keeps the final partial line; lines are joined
+ * with "\n" when reconstructing a section's raw_content.
+ */
+function splitLines(src) {
+    return src.split("\n");
+}
+/**
+ * Extracts the raw source text for a section given its [startLine, endLine)
+ * range (1-indexed), trimming the trailing newline of the last line.
+ *
+ * Go slices src by byte offset between line starts and TrimRight's "\n". Slicing
+ * by line and rejoining with "\n", then trimming trailing newlines, is the exact
+ * UTF-16-string equivalent.
+ */
+function extractRawContent(lines, startLine, endLine) {
+    if (startLine < 1 || startLine > lines.length) {
+        return "";
+    }
+    // [startLine, endLine) in 1-indexed lines → slice indices [startLine-1, endLine-1).
+    const endIdx = Math.min(endLine - 1, lines.length);
+    const slice = lines.slice(startLine - 1, endIdx);
+    // Go's byte slice spans from the start of startLine to the start of endLine
+    // (i.e. includes the trailing "\n" of the last included line), then TrimRight
+    // strips trailing "\n". Joining with "\n" reproduces the interior newlines;
+    // there is no trailing newline to strip after the join.
+    return slice.join("\n").replace(/\n+$/, "");
+}