npm - docsgov - Versions diffs - 0.1.0 - Mend

docsgov 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (159) hide show

package/README.md +242 -0
package/dist/apispec/apispec.js +401 -0
package/dist/apispec/apispec.test.js +444 -0
package/dist/apispec/errors.js +17 -0
package/dist/apispec/index.js +2 -0
package/dist/check/doclinks.js +167 -0
package/dist/check/index.js +8 -0
package/dist/check/run.js +391 -0
package/dist/check/run.test.js +513 -0
package/dist/check/suggest.js +134 -0
package/dist/check/suggest.test.js +92 -0
package/dist/check/tokens.js +125 -0
package/dist/cmd/main.js +330 -0
package/dist/cmd/main.test.js +422 -0
package/dist/codeq/cache.js +71 -0
package/dist/codeq/cache.test.js +67 -0
package/dist/codeq/errors.js +52 -0
package/dist/codeq/grammars/tree-sitter-go.wasm +0 -0
package/dist/codeq/grammars/tree-sitter-java.wasm +0 -0
package/dist/codeq/grammars/tree-sitter-javascript.wasm +0 -0
package/dist/codeq/grammars/tree-sitter-tsx.wasm +0 -0
package/dist/codeq/grammars/tree-sitter-typescript.wasm +0 -0
package/dist/codeq/index.js +11 -0
package/dist/codeq/resolve.test.js +109 -0
package/dist/codeq/resolver.js +128 -0
package/dist/codeq/resolver.test.js +124 -0
package/dist/codeq/resolvers/go.js +242 -0
package/dist/codeq/resolvers/go.test.js +143 -0
package/dist/codeq/resolvers/java.js +349 -0
package/dist/codeq/resolvers/java.test.js +138 -0
package/dist/codeq/resolvers/java_queries.js +63 -0
package/dist/codeq/resolvers/javascript.js +412 -0
package/dist/codeq/resolvers/javascript.test.js +125 -0
package/dist/codeq/resolvers/javascript_queries.js +46 -0
package/dist/codeq/resolvers/typescript.js +366 -0
package/dist/codeq/resolvers/typescript.test.js +180 -0
package/dist/codeq/resolvers/typescript_queries.js +78 -0
package/dist/codeq/signature.js +50 -0
package/dist/codeq/signature.test.js +50 -0
package/dist/codeq/suggest.js +96 -0
package/dist/codeq/treesitter.js +122 -0
package/dist/codeq/treesitter.test.js +118 -0
package/dist/config/config.js +74 -0
package/dist/config/config.test.js +98 -0
package/dist/config/fs.js +116 -0
package/dist/config/glob.js +82 -0
package/dist/config/glob.test.js +61 -0
package/dist/config/index.js +4 -0
package/dist/dedup/analyzer/analyzer.js +533 -0
package/dist/dedup/analyzer/analyzer.test.js +530 -0
package/dist/dedup/analyzer/canonical.js +74 -0
package/dist/dedup/analyzer/canonical.test.js +70 -0
package/dist/dedup/analyzer/cosine_clusters.js +169 -0
package/dist/dedup/analyzer/cosine_clusters.test.js +131 -0
package/dist/dedup/analyzer/distinctive.js +85 -0
package/dist/dedup/analyzer/distinctive.test.js +49 -0
package/dist/dedup/analyzer/exact_clusters.js +63 -0
package/dist/dedup/analyzer/exact_clusters.test.js +81 -0
package/dist/dedup/analyzer/index.js +14 -0
package/dist/dedup/analyzer/multiplicity.js +110 -0
package/dist/dedup/analyzer/multiplicity.test.js +123 -0
package/dist/dedup/analyzer/order.js +22 -0
package/dist/dedup/analyzer/partial_overlaps.js +65 -0
package/dist/dedup/analyzer/partial_overlaps.test.js +161 -0
package/dist/dedup/analyzer/preview.js +84 -0
package/dist/dedup/analyzer/preview.test.js +46 -0
package/dist/dedup/analyzer/safety.js +27 -0
package/dist/dedup/analyzer/safety.test.js +39 -0
package/dist/dedup/config.js +18 -0
package/dist/dedup/configload.js +299 -0
package/dist/dedup/configload.test.js +410 -0
package/dist/dedup/dedup.index.test.js +203 -0
package/dist/dedup/dedup.js +143 -0
package/dist/dedup/dedup.test.js +212 -0
package/dist/dedup/dedupcfg/config.js +112 -0
package/dist/dedup/dedupcfg/config.test.js +70 -0
package/dist/dedup/dedupcfg/index.js +1 -0
package/dist/dedup/deduptypes/index.js +1 -0
package/dist/dedup/deduptypes/types.js +9 -0
package/dist/dedup/deduptypes/types.test.js +34 -0
package/dist/dedup/embedder/cache.js +23 -0
package/dist/dedup/embedder/cache.test.js +50 -0
package/dist/dedup/embedder/constants.js +10 -0
package/dist/dedup/embedder/embedder.js +76 -0
package/dist/dedup/embedder/embedder.mock.test.js +128 -0
package/dist/dedup/embedder/embedder.test.js +96 -0
package/dist/dedup/embedder/errors.js +20 -0
package/dist/dedup/embedder/errors.test.js +35 -0
package/dist/dedup/embedder/index.js +4 -0
package/dist/dedup/embedder/session.js +78 -0
package/dist/dedup/embedder/session.test.js +172 -0
package/dist/dedup/gitignore.js +97 -0
package/dist/dedup/gitignore.test.js +98 -0
package/dist/dedup/index.js +11 -0
package/dist/dedup/indexdb/errors.js +48 -0
package/dist/dedup/indexdb/index.js +6 -0
package/dist/dedup/indexdb/indexdb.js +302 -0
package/dist/dedup/indexdb/indexdb.test.js +739 -0
package/dist/dedup/indexdb/load.js +110 -0
package/dist/dedup/indexdb/migrations.js +58 -0
package/dist/dedup/indexdb/schema.js +83 -0
package/dist/dedup/indexer/index.js +9 -0
package/dist/dedup/indexer/indexer.js +501 -0
package/dist/dedup/indexer/indexer.test.js +510 -0
package/dist/dedup/indexer/links.js +89 -0
package/dist/dedup/mdsection/anchor.js +60 -0
package/dist/dedup/mdsection/anchor.test.js +39 -0
package/dist/dedup/mdsection/blocks.js +409 -0
package/dist/dedup/mdsection/blocks.test.js +359 -0
package/dist/dedup/mdsection/index.js +4 -0
package/dist/dedup/mdsection/parse.js +21 -0
package/dist/dedup/mdsection/section.js +234 -0
package/dist/dedup/mdsection/section.test.js +221 -0
package/dist/dedup/report/floatfmt.js +71 -0
package/dist/dedup/report/floatfmt.test.js +42 -0
package/dist/dedup/report/index.js +8 -0
package/dist/dedup/report/quote.js +77 -0
package/dist/dedup/report/quote.test.js +67 -0
package/dist/dedup/report/text.js +251 -0
package/dist/dedup/report/text.test.js +420 -0
package/dist/dedup/report_types.js +8 -0
package/dist/dedup/sectionid/index.js +1 -0
package/dist/dedup/sectionid/sectionid.js +16 -0
package/dist/dedup/sectionid/sectionid.test.js +49 -0
package/dist/guard/api/errors.js +12 -0
package/dist/guard/api/index.js +2 -0
package/dist/guard/api/parser.js +81 -0
package/dist/guard/api/parser.test.js +58 -0
package/dist/guard/api/types.js +1 -0
package/dist/guard/code/errors.js +16 -0
package/dist/guard/code/index.js +2 -0
package/dist/guard/code/parser.js +54 -0
package/dist/guard/code/parser.test.js +111 -0
package/dist/guard/code/types.js +6 -0
package/dist/index.js +1 -0
package/dist/index.test.js +5 -0
package/dist/repo/boundary.js +92 -0
package/dist/repo/boundary.test.js +65 -0
package/dist/repo/errors.js +56 -0
package/dist/repo/errors.test.js +85 -0
package/dist/repo/exists.test.js +72 -0
package/dist/repo/filename.js +46 -0
package/dist/repo/filename.test.js +39 -0
package/dist/repo/fs.js +53 -0
package/dist/repo/index.js +7 -0
package/dist/repo/overlay.js +36 -0
package/dist/repo/overlay.test.js +80 -0
package/dist/repo/repo.js +353 -0
package/dist/repo/repo.test.js +255 -0
package/dist/repo/testutil.js +27 -0
package/dist/repo/write.test.js +125 -0
package/dist/report/color.js +73 -0
package/dist/report/index.js +1 -0
package/dist/report/report.js +112 -0
package/dist/report/report.test.js +368 -0
package/dist/violation/index.js +1 -0
package/dist/violation/types.js +22 -0
package/dist/violation/types.test.js +70 -0
package/package.json +48 -0

package/dist/dedup/indexer/indexer.js ADDED Viewed

@@ -0,0 +1,501 @@
+/**
+ * The two-pass dedup indexing pipeline:
+ *
+ *   - Pass 1: walk docs/, extract sections, embed new/changed ones, upsert rows.
+ *   - Pass 2: scan raw_content for markdown links, update inbound_count on every
+ *     section.
+ *
+ * Both passes run inside one transaction (Store.execTx). If anything fails, the
+ * entire run rolls back — the on-disk DB stays at the previous state.
+ *
+ * Ported from internal/dedup/indexer/{indexer.go,links.go}. Reconciling the
+ * mixed sync/async stack:
+ *   - the embedder is async (await emb.embed) — Go took ctx; the TS embedder
+ *     dropped it, so the Embedder interface here has no ctx parameter;
+ *   - indexdb (node:sqlite) is synchronous — execTx and all queries run without
+ *     await inside the transaction callback;
+ *   - mdsection.extractFromFileWithBlocks is synchronous (readFileSync);
+ *   - the docs/ walk uses node:fs/promises and is async.
+ * So Run is async overall: all embedding (the only async I/O) happens BEFORE the
+ * synchronous execTx, exactly mirroring Go where embed batches run before the one
+ * EXCLUSIVE transaction in applyChanges.
+ *
+ * Go ran the corpus walk + per-file parse in an errgroup of workers. node:sqlite
+ * and mdsection are synchronous and Run holds a single connection, so there is no
+ * concurrency to exploit on the parse side; the walk is a plain async traversal
+ * and results are sorted deterministically afterward, matching Go's post-sort.
+ */
+import * as fsp from "node:fs/promises";
+import * as path from "node:path";
+import { headingBlacklisted } from "../dedupcfg/index.js";
+import { extractFromFileWithBlocks } from "../mdsection/index.js";
+import { parseLinks } from "./links.js";
/**
 * hasMaxBatch reports whether the embedder exposes a callable `maxBatch`
 * member. It stands in for Go's `emb.(MaxBatcher)` type assertion.
 */
function hasMaxBatch(emb) {
    const candidate = emb.maxBatch;
    return typeof candidate === "function";
}
/**
 * blockKeyOf builds the in-memory key for a blocks-table row: the section ID
 * and the block index joined by a NUL character ("<sid>\x00<idx>").
 */
function blockKeyOf(sectionID, blockIndex) {
    return "".concat(sectionID, "\u0000", blockIndex);
}
/**
 * run indexes <repoRoot>/docs/ into db and returns run statistics.
 *
 * Everything asynchronous (the corpus walk and all embedder calls) happens
 * first; the resulting change set is then handed to applyChanges, which writes
 * it in a single transaction.
 *
 * @param db       open index store; only read here, written by applyChanges.
 * @param emb      embedder exposing `embed(texts)`; borrowed, never closed here.
 * @param repoRoot repository root; the corpus is <repoRoot>/docs/.
 * @param cfg      dedup configuration.
 * @param progress callback receiving one-line status messages (pass a no-op
 *                 for silence).
 * @returns `{ sections, embedded, pruned }` counts for this run.
 */
export async function run(db, emb, repoRoot, cfg, progress) {
    const batchSize = resolveBatchSize(emb, cfg);
    // Pass 1: gather the live sections and blocks from the corpus.
    const corpus = await collectSectionsAndBlocks(repoRoot, cfg);
    const sections = corpus.sections;
    const allBlocks = corpus.blocks;
    // Rows already stored, keyed by section ID (used for content_hash comparison).
    const existing = loadExistingSections(db);
    // Split live sections into "unchanged" (reuse the stored vector) and
    // "new or changed" (queue for embedding).
    const embeddings = new Map();
    const toEmbed = [];
    for (const section of sections) {
        const prior = existing.get(section.id);
        if (prior !== undefined && prior.contentHash === section.content_hash) {
            embeddings.set(section.id, prior.embedding);
        }
        else {
            toEmbed.push(section);
        }
    }
    if (toEmbed.length > 0) {
        progress(`Embedding ${toEmbed.length} sections…\n`);
        let offset = 0;
        while (offset < toEmbed.length) {
            const batch = toEmbed.slice(offset, Math.min(offset + batchSize, toEmbed.length));
            const vecs = await emb.embed(batch.map((section) => section.embed_text));
            batch.forEach((section, j) => {
                embeddings.set(section.id, Float32Array.from(vecs[j]));
            });
            offset += batchSize;
        }
    }
    // Pass 2: inbound link counts, computed in memory over the live sections.
    const inbound = computeInboundCounts(sections, cfg.Indexer.external_url_prefixes);
    // Stored sections that no longer exist in the corpus get pruned.
    const liveIDs = new Set(sections.map((section) => section.id));
    const toPrune = [...existing.keys()].filter((id) => !liveIDs.has(id));
    // Stored blocks: keys for pruning, vectors for content-hash reuse.
    const storedBlocks = loadExistingBlocks(db);
    const existingBlockKeys = storedBlocks.keys;
    const existingBlockVecs = storedBlocks.vecs;
    // Only blocks that clear the per-block gate are stored.
    const eligibleBlocks = allBlocks.filter((block) => blockEligible(block, cfg));
    const blockVecs = await embedBlocks(emb, eligibleBlocks, existingBlockVecs, cfg);
    // Stored block keys absent from the live eligible set get pruned.
    const liveBlockKeys = new Set(eligibleBlocks.map((block) => blockKeyOf(block.SectionID, block.Index)));
    const blocksToPrune = [...existingBlockKeys].filter((key) => !liveBlockKeys.has(key));
    // Write the whole change set in one transaction.
    applyChanges(db, sections, embeddings, inbound, toPrune, existing, eligibleBlocks, blockVecs, blocksToPrune, cfg);
    return {
        sections: sections.length,
        embedded: toEmbed.length,
        pruned: toPrune.length,
    };
}
/**
 * resolveBatchSize picks the embedding batch size: cfg.Embedder.batch_size,
 * falling back to 32 when that value is unusable, then capped by the embedder's
 * own limit when the embedder exposes `maxBatch()`.
 *
 * "Unusable" covers everything that cannot drive the batching loops: a missing
 * field (`undefined`), NaN, zero, a negative number, or a fraction below 1. Go
 * only had to handle `<= 0` because a missing int is 0 there; in JS a missing
 * field is `undefined`, which would otherwise turn the loop step into NaN and
 * make the callers embed nothing at all. Fractional values are floored so each
 * batch holds a whole number of items.
 *
 * @param emb embedder; `maxBatch()` is consulted only when it is a function.
 * @param cfg configuration; reads cfg.Embedder.batch_size.
 * @returns a positive batch size.
 */
function resolveBatchSize(emb, cfg) {
    const DEFAULT_BATCH_SIZE = 32;
    // Coerce to a whole number; anything that is not > 0 (including NaN) maps to 0.
    const toPositiveInt = (value) => {
        const n = Math.floor(Number(value));
        return n > 0 ? n : 0;
    };
    const configured = toPositiveInt(cfg.Embedder.batch_size);
    let batchSize = configured > 0 ? configured : DEFAULT_BATCH_SIZE;
    if (hasMaxBatch(emb)) {
        // A non-positive or unusable limit means "no cap".
        const cap = toPositiveInt(emb.maxBatch());
        if (cap > 0 && cap < batchSize) {
            batchSize = cap;
        }
    }
    return batchSize;
}
/**
 * loadExistingSections snapshots the stored sections as a Map keyed by section
 * ID. Each value carries the row's content_hash (as `contentHash`) and its
 * embedding BLOB decoded through decodeVec (as `embedding`).
 */
function loadExistingSections(db) {
    const toEntry = (row) => [
        row.id,
        { contentHash: row.content_hash, embedding: decodeVec(row.embedding) },
    ];
    return new Map(Array.from(db.querySections(), toEntry));
}
/**
 * loadExistingBlocks reads every stored block row and returns two views:
 *   - `keys`: the set of stored block keys (built with blockKeyOf from
 *     section_id and block_index), used to find rows to prune;
 *   - `vecs`: content_hash → decoded vector, used to reuse embeddings. Rows
 *     whose embedding decodes to null contribute a key but no vector.
 */
function loadExistingBlocks(db) {
    const keys = new Set();
    const vecs = new Map();
    for (const row of db.queryBlocks()) {
        keys.add(blockKeyOf(row.section_id, row.block_index));
        const vector = decodeVecOrNull(row.embedding);
        if (vector === null) {
            continue;
        }
        vecs.set(row.content_hash, vector);
    }
    return { keys, vecs };
}
/**
 * collectSectionsAndBlocks walks <repoRoot>/docs/ and gathers what the
 * extractor returns for each markdown file found there.
 *
 * Every returned section's `file_path` and every block's `FilePath` is
 * overwritten with the file's path relative to repoRoot, using forward
 * slashes. Files the extractor throws on are skipped without failing the run.
 *
 * @returns `{ sections, blocks }`; sections are sorted by (file_path,
 *          start_line), blocks keep traversal order.
 */
export async function collectSectionsAndBlocks(repoRoot, cfg) {
    const mdFiles = await walkMarkdown(path.join(repoRoot, "docs"), cfg);
    const sections = [];
    const blocks = [];
    for (const file of mdFiles) {
        let parsed;
        try {
            parsed = extractFromFileWithBlocks(file);
        }
        catch {
            // Best effort: an unreadable or unparseable file is skipped.
            continue;
        }
        const fileSections = parsed.sections;
        const fileBlocks = parsed.blocks;
        // Repo-relative path with "/" separators; falls back to the absolute
        // path if path.relative rejects its arguments.
        let relPath;
        try {
            relPath = path.relative(repoRoot, file);
        }
        catch {
            relPath = file;
        }
        relPath = relPath.split(path.sep).join("/");
        for (const section of fileSections) {
            section.file_path = relPath;
            sections.push(section);
        }
        for (const block of fileBlocks) {
            block.FilePath = relPath;
            blocks.push(block);
        }
    }
    // Deterministic order: by file path, then by starting line.
    sections.sort((a, b) => {
        if (a.file_path !== b.file_path) {
            return a.file_path < b.file_path ? -1 : 1;
        }
        return a.start_line - b.start_line;
    });
    return { sections, blocks };
}
/**
 * walkMarkdown lists every path under docsRoot whose name ends in ".md".
 *
 * Directories are skipped (not descended into) when their name starts with
 * cfg.Markdown.hidden_dir_prefix (an empty prefix disables that rule) or is
 * listed in cfg.Markdown.ignored_dirs. Entries of each directory are visited
 * in ascending name order, one directory at a time, so the result order is
 * deterministic. A readdir failure (for example a missing docsRoot) rejects
 * the returned promise.
 */
async function walkMarkdown(docsRoot, cfg) {
    const found = [];
    const byName = (a, b) => {
        if (a.name < b.name) {
            return -1;
        }
        return a.name > b.name ? 1 : 0;
    };
    // Config is read only when a directory is actually encountered.
    const isSkippedDir = (name) => {
        const prefix = cfg.Markdown.hidden_dir_prefix;
        if (prefix !== "" && name.startsWith(prefix)) {
            return true;
        }
        return cfg.Markdown.ignored_dirs.includes(name);
    };
    const visit = async (dir) => {
        const entries = await fsp.readdir(dir, { withFileTypes: true });
        entries.sort(byName);
        for (const entry of entries) {
            const full = path.join(dir, entry.name);
            if (!entry.isDirectory()) {
                if (full.endsWith(".md")) {
                    found.push(full);
                }
                continue;
            }
            if (isSkippedDir(entry.name)) {
                continue;
            }
            await visit(full);
        }
    };
    await visit(docsRoot);
    return found;
}
/**
 * computeInboundCounts tallies, per section ID, how many links in other
 * sections point at it.
 *
 * Links are obtained by running parseLinks over each section's raw_content
 * (externalPrefixes is passed straight through). A link with an anchor targets
 * the section registered under that (file, anchor) pair; a link without an
 * anchor targets the first section seen for that file in `sections` order.
 * Links that resolve to nothing, to an empty ID, or back to the section that
 * contains them are not counted.
 *
 * @returns Map of section ID → inbound link count; sections with no inbound
 *          links are absent.
 */
export function computeInboundCounts(sections, externalPrefixes) {
    const anchorKey = (file, anchor) => `${file}\u0000${anchor}`;
    const sectionByAnchor = new Map(); // "<file>\x00<anchor>" → section ID
    const firstSectionByFile = new Map(); // file → ID of the first section seen
    for (const section of sections) {
        sectionByAnchor.set(anchorKey(section.file_path, section.anchor), section.id);
        if (!firstSectionByFile.has(section.file_path)) {
            firstSectionByFile.set(section.file_path, section.id);
        }
    }
    const counts = new Map();
    for (const source of sections) {
        for (const link of parseLinks(source.raw_content, source.file_path, externalPrefixes)) {
            const targetID = link.anchor !== ""
                ? sectionByAnchor.get(anchorKey(link.filePath, link.anchor))
                : firstSectionByFile.get(link.filePath);
            const resolved = targetID !== undefined && targetID !== "";
            if (!resolved || targetID === source.id) {
                continue;
            }
            counts.set(targetID, (counts.get(targetID) ?? 0) + 1);
        }
    }
    return counts;
}
/**
 * applyChanges writes the whole change set inside one db.execTx callback:
 *
 *   1. live sections: a section whose stored content_hash is unchanged only
 *      gets its inbound_count/updated_at refreshed; any other section is
 *      written in full with INSERT OR REPLACE;
 *   2. section IDs in toPrune are deleted;
 *   3. eligible blocks are written with INSERT OR REPLACE; a block that is
 *      not embeddable (see blockEmbeddable) is stored with a NULL embedding;
 *   4. block keys in blocksToPrune are deleted.
 *
 * @param db             store exposing `execTx(callback)`; the callback gets a
 *                       connection with `prepare(sql)`.
 * @param sections       live sections.
 * @param embeddings     Map section ID → vector.
 * @param inbound        Map section ID → inbound link count (missing means 0).
 * @param toPrune        section IDs to delete.
 * @param existing       Map section ID → `{ contentHash, embedding }` as stored
 *                       before this run.
 * @param eligibleBlocks blocks to store.
 * @param blockVecs      Map block ContentHash → vector.
 * @param blocksToPrune  block keys (as built by blockKeyOf) to delete.
 * @param cfg            configuration, forwarded to blockEmbeddable.
 */
function applyChanges(db, sections, embeddings, inbound, toPrune, existing, eligibleBlocks, blockVecs, blocksToPrune, cfg) {
    // ISO-8601 UTC timestamp with the milliseconds dropped.
    const now = new Date().toISOString().replace(/\.\d{3}Z$/, "Z");
    db.execTx((conn) => {
        const updateInbound = conn.prepare(`UPDATE sections SET inbound_count=?, updated_at=? WHERE id=?`);
        const upsertSection = conn.prepare(`
      INSERT OR REPLACE INTO sections
      (id, file_path, heading, heading_level, anchor, start_line, end_line,
       content_hash, raw_content, embed_text, prose_word_count,
       has_table, has_code, inbound_count, embedding, updated_at)
      VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)`);
        const deleteSection = conn.prepare(`DELETE FROM sections WHERE id=?`);
        const upsertBlock = conn.prepare(`
      INSERT OR REPLACE INTO blocks
      (section_id, block_index, file_path, heading, kind,
       start_line, end_line, content_hash, embedding)
      VALUES (?,?,?,?,?,?,?,?,?)`);
        const deleteBlock = conn.prepare(`DELETE FROM blocks WHERE section_id=? AND block_index=?`);
        // Write live sections.
        for (const s of sections) {
            const inboundCount = inbound.get(s.id) ?? 0;
            const old = existing.get(s.id);
            if (old !== undefined && old.contentHash === s.content_hash) {
                // Content unchanged: the stored row (including its embedding)
                // stays as is; only the link count and timestamp are refreshed.
                updateInbound.run(inboundCount, now, s.id);
                continue;
            }
            // Encode only for rows that are actually rewritten, so unchanged
            // sections do not pay for a BLOB that would be thrown away.
            const blob = encodeVec(embeddings.get(s.id));
            upsertSection.run(s.id, s.file_path, s.heading, s.heading_level, s.anchor, s.start_line, s.end_line, s.content_hash, s.raw_content, s.embed_text, s.prose_word_count, boolInt(s.has_table), boolInt(s.has_code), inboundCount, blob, now);
        }
        // Delete sections that no longer exist in the corpus.
        for (const id of toPrune) {
            deleteSection.run(id);
        }
        // Write eligible blocks; non-embeddable ones carry a NULL embedding.
        for (const b of eligibleBlocks) {
            const blob = blockEmbeddable(b, cfg) ? encodeVec(blockVecs.get(b.ContentHash)) : null;
            upsertBlock.run(b.SectionID, b.Index, b.FilePath, b.Heading, b.Kind, b.StartLine, b.EndLine, b.ContentHash, blob);
        }
        // Delete stale block rows. The key is "<sid>\x00<idx>" and the index
        // part never contains NUL, so splitting at the LAST NUL recovers the
        // section ID exactly even if the ID itself contains one.
        for (const k of blocksToPrune) {
            const sep = k.lastIndexOf("\u0000");
            deleteBlock.run(k.slice(0, sep), Number(k.slice(sep + 1)));
        }
    });
}
/**
 * blockEligible is the per-block gate deciding whether a block is stored.
 *
 *   - "prose" blocks pass when the word count of b.Text (per countFields) is
 *     at least cfg.Block.min_words;
 *   - "table" blocks pass when b.TableRows is at least cfg.Block.table_min_rows;
 *   - every other kind is rejected.
 */
export function blockEligible(b, cfg) {
    const kind = b.Kind;
    if (kind === "prose") {
        return countFields(b.Text) >= cfg.Block.min_words;
    }
    if (kind === "table") {
        return b.TableRows >= cfg.Block.table_min_rows;
    }
    return false;
}
/**
 * countFields counts whitespace-delimited tokens the way Go's strings.Fields
 * does: the string is split on runs of characters for which unicode.IsSpace is
 * true, and empty tokens are discarded.
 *
 * The separator set is spelled out rather than using the regex class `\s` or
 * String.prototype.trim, because JavaScript's whitespace definition is not the
 * same as Go's: JavaScript counts U+FEFF (BOM) as whitespace and does not
 * count U+0085 (NEL), while Go does the opposite. Using Go's set keeps word
 * counts, and therefore the min_words eligibility gate, identical to the Go
 * indexer this was ported from.
 *
 * @param s text to tokenize.
 * @returns number of non-empty tokens (0 for an empty or all-whitespace string).
 */
function countFields(s) {
    // Maximal runs of characters that are NOT in Unicode's White_Space set.
    const tokens = s.match(/[^\t\n\v\f\r \u0085\u00A0\u1680\u2000-\u200A\u2028\u2029\u202F\u205F\u3000]+/g);
    return tokens === null ? 0 : tokens.length;
}
/**
 * blockEmbeddable decides whether a block gets an embedding vector: it must be
 * a "prose" block and its heading must not be rejected by headingBlacklisted
 * (called with cfg.Analyzer and b.Heading).
 *
 * The same predicate is used when choosing what to send to the embedder and
 * when writing block rows, so a block that fails it is stored with a NULL
 * embedding rather than dropped.
 */
export function blockEmbeddable(b, cfg) {
    if (b.Kind !== "prose") {
        return false;
    }
    return !headingBlacklisted(cfg.Analyzer, b.Heading);
}
/**
 * embedBlocks produces the ContentHash → vector map for this run.
 *
 * The result starts as a copy of `existing` (the caller's map is not
 * modified). Each embeddable block (see blockEmbeddable) whose ContentHash is
 * not yet in the result is sent to the embedder, once per distinct hash, in
 * first-seen order and in batches sized by resolveBatchSize. Blocks that are
 * not embeddable add nothing to the result.
 *
 * @param emb      embedder exposing `embed(texts)`.
 * @param eligible blocks that passed the eligibility gate.
 * @param existing Map ContentHash → vector of already-stored embeddings.
 * @param cfg      configuration (batch size, embeddability rules).
 * @returns Map ContentHash → vector: reused entries plus new Float32Arrays.
 */
export async function embedBlocks(emb, eligible, existing, cfg) {
    const batchSize = resolveBatchSize(emb, cfg);
    const vectors = new Map(existing);
    // Distinct hashes still needing a vector; Map keeps first-seen order so
    // batching is deterministic.
    const pending = new Map();
    for (const block of eligible) {
        if (!blockEmbeddable(block, cfg)) {
            continue;
        }
        const hash = block.ContentHash;
        if (vectors.has(hash) || pending.has(hash)) {
            continue;
        }
        pending.set(hash, block);
    }
    const queue = [...pending.values()];
    for (let start = 0; start < queue.length; start += batchSize) {
        const batch = queue.slice(start, Math.min(start + batchSize, queue.length));
        const vecs = await emb.embed(batch.map((block) => block.Text));
        batch.forEach((block, j) => {
            vectors.set(block.ContentHash, Float32Array.from(vecs[j]));
        });
    }
    return vectors;
}
/**
 * encodeVec serialises a vector as consecutive little-endian float32 values
 * (4 bytes per element) for BLOB storage.
 *
 * @param v vector to encode (anything indexable with a `length`).
 * @returns a Uint8Array of `v.length * 4` bytes, or null when v is undefined
 *          or has no elements.
 */
export function encodeVec(v) {
    const isEmpty = v === undefined || v.length === 0;
    if (isEmpty) {
        return null;
    }
    const bytes = new Uint8Array(v.length * 4);
    const writer = new DataView(bytes.buffer);
    let i = 0;
    while (i < v.length) {
        // `true` selects little-endian byte order regardless of platform.
        writer.setFloat32(i * 4, v[i], true);
        i += 1;
    }
    return bytes;
}
/**
 * decodeVec decodes a float32 BLOB via decodeVecOrNull, substituting a
 * zero-length Float32Array when that helper reports "no vector". Callers
 * therefore always receive a Float32Array.
 */
export function decodeVec(buf) {
    const decoded = decodeVecOrNull(buf);
    if (decoded === null || decoded === undefined) {
        return new Float32Array(0);
    }
    return decoded;
}
/**
 * decodeVecOrNull decodes a BLOB of little-endian float32 values into a
 * Float32Array, or returns null when there is no vector to decode.
 *
 * "No vector" means: the value is null or undefined (mirroring encodeVec,
 * which accepts undefined), or the blob holds fewer than 4 bytes and so
 * cannot contain even one float. Returning null in the last case keeps a
 * truncated blob from being treated as a reusable (but empty) embedding.
 * Trailing bytes beyond the last complete 4-byte group are ignored.
 *
 * @param buf byte view (e.g. Uint8Array) or null/undefined.
 * @returns the decoded vector, or null.
 */
function decodeVecOrNull(buf) {
    if (buf == null) {
        return null;
    }
    const count = Math.floor(buf.length / 4);
    if (count === 0) {
        return null;
    }
    // Honour byteOffset/byteLength: buf may be a view into a larger buffer.
    const view = new DataView(buf.buffer, buf.byteOffset, buf.byteLength);
    const v = new Float32Array(count);
    for (let i = 0; i < count; i++) {
        v[i] = view.getFloat32(i * 4, true);
    }
    return v;
}
/** boolInt maps a truthy value to 1 and a falsy value to 0 for SQLite storage. */
function boolInt(b) {
    return Number(Boolean(b));
}