npm - @oscharko-dev/keiko-local-knowledge - Versions diffs - 0.2.0 - Mend

@oscharko-dev/keiko-local-knowledge 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (290) hide show

package/dist/.tsbuildinfo +1 -0
package/dist/bounded-document-extraction.d.ts +27 -0
package/dist/bounded-document-extraction.d.ts.map +1 -0
package/dist/bounded-document-extraction.js +214 -0
package/dist/capsule-lifecycle.d.ts +33 -0
package/dist/capsule-lifecycle.d.ts.map +1 -0
package/dist/capsule-lifecycle.js +292 -0
package/dist/capsule-set-lifecycle.d.ts +15 -0
package/dist/capsule-set-lifecycle.d.ts.map +1 -0
package/dist/capsule-set-lifecycle.js +158 -0
package/dist/chunking/chunker-persist.d.ts +36 -0
package/dist/chunking/chunker-persist.d.ts.map +1 -0
package/dist/chunking/chunker-persist.js +74 -0
package/dist/chunking/chunker-runner.d.ts +9 -0
package/dist/chunking/chunker-runner.d.ts.map +1 -0
package/dist/chunking/chunker-runner.js +218 -0
package/dist/chunking/chunker.d.ts +7 -0
package/dist/chunking/chunker.d.ts.map +1 -0
package/dist/chunking/chunker.js +139 -0
package/dist/chunking/citation-mapper.d.ts +4 -0
package/dist/chunking/citation-mapper.d.ts.map +1 -0
package/dist/chunking/citation-mapper.js +180 -0
package/dist/chunking/index.d.ts +6 -0
package/dist/chunking/index.d.ts.map +1 -0
package/dist/chunking/index.js +8 -0
package/dist/chunking/token-estimator.d.ts +3 -0
package/dist/chunking/token-estimator.d.ts.map +1 -0
package/dist/chunking/token-estimator.js +26 -0
package/dist/chunking/types.d.ts +49 -0
package/dist/chunking/types.d.ts.map +1 -0
package/dist/chunking/types.js +26 -0
package/dist/composition.d.ts +57 -0
package/dist/composition.d.ts.map +1 -0
package/dist/composition.js +310 -0
package/dist/conversation/citation-attacher.d.ts +8 -0
package/dist/conversation/citation-attacher.d.ts.map +1 -0
package/dist/conversation/citation-attacher.js +55 -0
package/dist/conversation/citation-excerpts.d.ts +4 -0
package/dist/conversation/citation-excerpts.d.ts.map +1 -0
package/dist/conversation/citation-excerpts.js +41 -0
package/dist/conversation/grounded-answer-runner.d.ts +9 -0
package/dist/conversation/grounded-answer-runner.d.ts.map +1 -0
package/dist/conversation/grounded-answer-runner.js +61 -0
package/dist/conversation/index.d.ts +5 -0
package/dist/conversation/index.d.ts.map +1 -0
package/dist/conversation/index.js +7 -0
package/dist/conversation/model-gateway-answer-generator.d.ts +28 -0
package/dist/conversation/model-gateway-answer-generator.d.ts.map +1 -0
package/dist/conversation/model-gateway-answer-generator.js +105 -0
package/dist/conversation/types.d.ts +35 -0
package/dist/conversation/types.d.ts.map +1 -0
package/dist/conversation/types.js +24 -0
package/dist/discovery/discovery-runner.d.ts +23 -0
package/dist/discovery/discovery-runner.d.ts.map +1 -0
package/dist/discovery/discovery-runner.js +109 -0
package/dist/discovery/extract-progressive.d.ts +17 -0
package/dist/discovery/extract-progressive.d.ts.map +1 -0
package/dist/discovery/extract-progressive.js +522 -0
package/dist/discovery/extract.d.ts +26 -0
package/dist/discovery/extract.d.ts.map +1 -0
package/dist/discovery/extract.js +906 -0
package/dist/discovery/glob.d.ts +10 -0
package/dist/discovery/glob.d.ts.map +1 -0
package/dist/discovery/glob.js +72 -0
package/dist/discovery/index.d.ts +6 -0
package/dist/discovery/index.d.ts.map +1 -0
package/dist/discovery/index.js +8 -0
package/dist/discovery/media-type.d.ts +4 -0
package/dist/discovery/media-type.d.ts.map +1 -0
package/dist/discovery/media-type.js +62 -0
package/dist/discovery/persist.d.ts +63 -0
package/dist/discovery/persist.d.ts.map +1 -0
package/dist/discovery/persist.js +345 -0
package/dist/discovery/test-support.d.ts +16 -0
package/dist/discovery/test-support.d.ts.map +1 -0
package/dist/discovery/test-support.js +127 -0
package/dist/discovery/types.d.ts +63 -0
package/dist/discovery/types.d.ts.map +1 -0
package/dist/discovery/types.js +28 -0
package/dist/discovery/walk.d.ts +12 -0
package/dist/discovery/walk.d.ts.map +1 -0
package/dist/discovery/walk.js +302 -0
package/dist/errors.d.ts +13 -0
package/dist/errors.d.ts.map +1 -0
package/dist/errors.js +22 -0
package/dist/evaluations/dimensions.d.ts +14 -0
package/dist/evaluations/dimensions.d.ts.map +1 -0
package/dist/evaluations/dimensions.js +191 -0
package/dist/evaluations/fixtures.d.ts +18 -0
package/dist/evaluations/fixtures.d.ts.map +1 -0
package/dist/evaluations/fixtures.js +858 -0
package/dist/evaluations/index.d.ts +7 -0
package/dist/evaluations/index.d.ts.map +1 -0
package/dist/evaluations/index.js +10 -0
package/dist/evaluations/report.d.ts +3 -0
package/dist/evaluations/report.d.ts.map +1 -0
package/dist/evaluations/report.js +31 -0
package/dist/evaluations/runner-seed.d.ts +12 -0
package/dist/evaluations/runner-seed.d.ts.map +1 -0
package/dist/evaluations/runner-seed.js +175 -0
package/dist/evaluations/runner.d.ts +8 -0
package/dist/evaluations/runner.d.ts.map +1 -0
package/dist/evaluations/runner.js +205 -0
package/dist/evaluations/scripted-embedding-adapter.d.ts +13 -0
package/dist/evaluations/scripted-embedding-adapter.d.ts.map +1 -0
package/dist/evaluations/scripted-embedding-adapter.js +163 -0
package/dist/evaluations/types.d.ts +116 -0
package/dist/evaluations/types.d.ts.map +1 -0
package/dist/evaluations/types.js +27 -0
package/dist/index.d.ts +23 -0
package/dist/index.d.ts.map +1 -0
package/dist/index.js +41 -0
package/dist/indexing/bounded-indexing.d.ts +41 -0
package/dist/indexing/bounded-indexing.d.ts.map +1 -0
package/dist/indexing/bounded-indexing.js +240 -0
package/dist/indexing/checkpoint-persist.d.ts +8 -0
package/dist/indexing/checkpoint-persist.d.ts.map +1 -0
package/dist/indexing/checkpoint-persist.js +135 -0
package/dist/indexing/checkpoint-resume.d.ts +20 -0
package/dist/indexing/checkpoint-resume.d.ts.map +1 -0
package/dist/indexing/checkpoint-resume.js +50 -0
package/dist/indexing/embedding-batcher.d.ts +3 -0
package/dist/indexing/embedding-batcher.d.ts.map +1 -0
package/dist/indexing/embedding-batcher.js +390 -0
package/dist/indexing/index.d.ts +7 -0
package/dist/indexing/index.d.ts.map +1 -0
package/dist/indexing/index.js +11 -0
package/dist/indexing/job-persist.d.ts +46 -0
package/dist/indexing/job-persist.d.ts.map +1 -0
package/dist/indexing/job-persist.js +157 -0
package/dist/indexing/job-resume.d.ts +4 -0
package/dist/indexing/job-resume.d.ts.map +1 -0
package/dist/indexing/job-resume.js +14 -0
package/dist/indexing/orchestrator.d.ts +3 -0
package/dist/indexing/orchestrator.d.ts.map +1 -0
package/dist/indexing/orchestrator.js +1151 -0
package/dist/indexing/types.d.ts +156 -0
package/dist/indexing/types.d.ts.map +1 -0
package/dist/indexing/types.js +30 -0
package/dist/indexing/vector-persist.d.ts +32 -0
package/dist/indexing/vector-persist.d.ts.map +1 -0
package/dist/indexing/vector-persist.js +105 -0
package/dist/parsers/_internal.d.ts +20 -0
package/dist/parsers/_internal.d.ts.map +1 -0
package/dist/parsers/_internal.js +122 -0
package/dist/parsers/csv-parser.d.ts +3 -0
package/dist/parsers/csv-parser.d.ts.map +1 -0
package/dist/parsers/csv-parser.js +202 -0
package/dist/parsers/docx-parser.d.ts +3 -0
package/dist/parsers/docx-parser.d.ts.map +1 -0
package/dist/parsers/docx-parser.js +390 -0
package/dist/parsers/html-parser.d.ts +3 -0
package/dist/parsers/html-parser.d.ts.map +1 -0
package/dist/parsers/html-parser.js +310 -0
package/dist/parsers/index.d.ts +15 -0
package/dist/parsers/index.d.ts.map +1 -0
package/dist/parsers/index.js +41 -0
package/dist/parsers/json-parser.d.ts +3 -0
package/dist/parsers/json-parser.d.ts.map +1 -0
package/dist/parsers/json-parser.js +192 -0
package/dist/parsers/large-document/capability-discovery.d.ts +27 -0
package/dist/parsers/large-document/capability-discovery.d.ts.map +1 -0
package/dist/parsers/large-document/capability-discovery.js +76 -0
package/dist/parsers/large-document/diagnostics.d.ts +3 -0
package/dist/parsers/large-document/diagnostics.d.ts.map +1 -0
package/dist/parsers/large-document/diagnostics.js +11 -0
package/dist/parsers/large-document/index.d.ts +15 -0
package/dist/parsers/large-document/index.d.ts.map +1 -0
package/dist/parsers/large-document/index.js +10 -0
package/dist/parsers/large-document/legacy-format.d.ts +5 -0
package/dist/parsers/large-document/legacy-format.d.ts.map +1 -0
package/dist/parsers/large-document/legacy-format.js +25 -0
package/dist/parsers/large-document/preflight.d.ts +9 -0
package/dist/parsers/large-document/preflight.d.ts.map +1 -0
package/dist/parsers/large-document/preflight.js +43 -0
package/dist/parsers/large-document/progressive-extraction.d.ts +55 -0
package/dist/parsers/large-document/progressive-extraction.d.ts.map +1 -0
package/dist/parsers/large-document/progressive-extraction.js +123 -0
package/dist/parsers/large-document/progressive-pdf.d.ts +20 -0
package/dist/parsers/large-document/progressive-pdf.d.ts.map +1 -0
package/dist/parsers/large-document/progressive-pdf.js +145 -0
package/dist/parsers/large-document/synthetic-source.d.ts +9 -0
package/dist/parsers/large-document/synthetic-source.d.ts.map +1 -0
package/dist/parsers/large-document/synthetic-source.js +101 -0
package/dist/parsers/large-document/window-builder.d.ts +24 -0
package/dist/parsers/large-document/window-builder.d.ts.map +1 -0
package/dist/parsers/large-document/window-builder.js +75 -0
package/dist/parsers/ocr/index.d.ts +4 -0
package/dist/parsers/ocr/index.d.ts.map +1 -0
package/dist/parsers/ocr/index.js +4 -0
package/dist/parsers/ocr/null-ocr-adapter.d.ts +3 -0
package/dist/parsers/ocr/null-ocr-adapter.d.ts.map +1 -0
package/dist/parsers/ocr/null-ocr-adapter.js +14 -0
package/dist/parsers/ocr/ocr-pipeline-parser.d.ts +8 -0
package/dist/parsers/ocr/ocr-pipeline-parser.d.ts.map +1 -0
package/dist/parsers/ocr/ocr-pipeline-parser.js +147 -0
package/dist/parsers/ocr/types.d.ts +16 -0
package/dist/parsers/ocr/types.d.ts.map +1 -0
package/dist/parsers/ocr/types.js +4 -0
package/dist/parsers/parser-test-fixtures.d.ts +28 -0
package/dist/parsers/parser-test-fixtures.d.ts.map +1 -0
package/dist/parsers/parser-test-fixtures.js +139 -0
package/dist/parsers/pdf-parser.d.ts +43 -0
package/dist/parsers/pdf-parser.d.ts.map +1 -0
package/dist/parsers/pdf-parser.js +388 -0
package/dist/parsers/registry.d.ts +8 -0
package/dist/parsers/registry.d.ts.map +1 -0
package/dist/parsers/registry.js +57 -0
package/dist/parsers/text-parser.d.ts +3 -0
package/dist/parsers/text-parser.d.ts.map +1 -0
package/dist/parsers/text-parser.js +214 -0
package/dist/parsers/types.d.ts +53 -0
package/dist/parsers/types.d.ts.map +1 -0
package/dist/parsers/types.js +21 -0
package/dist/parsers/unsupported-parser.d.ts +4 -0
package/dist/parsers/unsupported-parser.d.ts.map +1 -0
package/dist/parsers/unsupported-parser.js +97 -0
package/dist/parsers/xlsx-parser.d.ts +3 -0
package/dist/parsers/xlsx-parser.d.ts.map +1 -0
package/dist/parsers/xlsx-parser.js +425 -0
package/dist/privacy/audit-emitter.d.ts +5 -0
package/dist/privacy/audit-emitter.d.ts.map +1 -0
package/dist/privacy/audit-emitter.js +93 -0
package/dist/privacy/diagnostic-redactor.d.ts +2 -0
package/dist/privacy/diagnostic-redactor.d.ts.map +1 -0
package/dist/privacy/diagnostic-redactor.js +153 -0
package/dist/privacy/index.d.ts +5 -0
package/dist/privacy/index.d.ts.map +1 -0
package/dist/privacy/index.js +6 -0
package/dist/privacy/retention-applier.d.ts +5 -0
package/dist/privacy/retention-applier.d.ts.map +1 -0
package/dist/privacy/retention-applier.js +88 -0
package/dist/privacy/types.d.ts +98 -0
package/dist/privacy/types.d.ts.map +1 -0
package/dist/privacy/types.js +12 -0
package/dist/qualityIntelligence/capsuleCorpus.d.ts +27 -0
package/dist/qualityIntelligence/capsuleCorpus.d.ts.map +1 -0
package/dist/qualityIntelligence/capsuleCorpus.js +58 -0
package/dist/qualityIntelligence/index.d.ts +3 -0
package/dist/qualityIntelligence/index.d.ts.map +1 -0
package/dist/qualityIntelligence/index.js +5 -0
package/dist/qualityIntelligence/qiHandoff.d.ts +36 -0
package/dist/qualityIntelligence/qiHandoff.d.ts.map +1 -0
package/dist/qualityIntelligence/qiHandoff.js +82 -0
package/dist/retrieval/answer-grounding.d.ts +9 -0
package/dist/retrieval/answer-grounding.d.ts.map +1 -0
package/dist/retrieval/answer-grounding.js +31 -0
package/dist/retrieval/context-pack-assembler.d.ts +24 -0
package/dist/retrieval/context-pack-assembler.d.ts.map +1 -0
package/dist/retrieval/context-pack-assembler.js +50 -0
package/dist/retrieval/index.d.ts +6 -0
package/dist/retrieval/index.d.ts.map +1 -0
package/dist/retrieval/index.js +9 -0
package/dist/retrieval/retrieval-runner.d.ts +10 -0
package/dist/retrieval/retrieval-runner.d.ts.map +1 -0
package/dist/retrieval/retrieval-runner.js +163 -0
package/dist/retrieval/scoped-vector-search.d.ts +24 -0
package/dist/retrieval/scoped-vector-search.d.ts.map +1 -0
package/dist/retrieval/scoped-vector-search.js +864 -0
package/dist/retrieval/types.d.ts +28 -0
package/dist/retrieval/types.d.ts.map +1 -0
package/dist/retrieval/types.js +33 -0
package/dist/section-path-hash.d.ts +3 -0
package/dist/section-path-hash.d.ts.map +1 -0
package/dist/section-path-hash.js +9 -0
package/dist/source-lifecycle.d.ts +14 -0
package/dist/source-lifecycle.d.ts.map +1 -0
package/dist/source-lifecycle.js +155 -0
package/dist/source-routing-validation.d.ts +11 -0
package/dist/source-routing-validation.d.ts.map +1 -0
package/dist/source-routing-validation.js +140 -0
package/dist/store-content-cipher.d.ts +11 -0
package/dist/store-content-cipher.d.ts.map +1 -0
package/dist/store-content-cipher.js +67 -0
package/dist/store-content-encryption.d.ts +12 -0
package/dist/store-content-encryption.d.ts.map +1 -0
package/dist/store-content-encryption.js +275 -0
package/dist/store-paths.d.ts +6 -0
package/dist/store-paths.d.ts.map +1 -0
package/dist/store-paths.js +61 -0
package/dist/store.d.ts +30 -0
package/dist/store.d.ts.map +1 -0
package/dist/store.js +219 -0
package/dist/testing.d.ts +47 -0
package/dist/testing.d.ts.map +1 -0
package/dist/testing.js +170 -0
package/dist/version.d.ts +2 -0
package/dist/version.d.ts.map +1 -0
package/dist/version.js +4 -0
package/package.json +43 -0

package/dist/chunking/chunker-runner.js ADDED Viewed

@@ -0,0 +1,218 @@
+// Per-document chunker orchestrator (Epic #189, Issue #195).
+//
+// Reads parsed_units for the document, runs the pure `chunkParsedUnit` per unit, and
+// persists chunks inside a single transaction so a mid-document failure (or AbortSignal
+// cancellation) rolls back ALL chunks for the document — never half-chunked state.
+//
+// Idempotency: with `force: false` (default), existing chunks that are not flagged stale
+// for the current chunking strategy key make the runner a no-op that returns
+// `skippedExisting: true`. With `force: true`, or when the existing chunks are flagged
+// stale, prior chunks are deleted at the start of the transaction and re-created.
+import { chunkParsedUnit, chunkingStrategyKey, resolveChunkingOptions } from "./chunker.js";
+import { countChunksForDocument, deleteChunksForDocument, hasStaleChunksForDocument, insertChunkRow, selectDocumentSourceId, selectParsedUnitsForDocument, } from "./chunker-persist.js";
+import { ChunkingError } from "./types.js";
+// ─── Row → ParsedUnit reconstitution ──────────────────────────────────────────
+// The parsed_units table is the canonical write surface for #194. We re-hydrate the
+// discriminant union here so the pure chunker stays unaware of SQLite. Defensive: any
+// row with a missing field for its kind raises a ChunkingError rather than producing a
+// partially-typed value that crashes the slicer later.
+function expectNumber(value, field, unitId) {
+    if (value === null) {
+        throw new ChunkingError(`parsed_unit ${unitId} is missing required field ${field}`);
+    }
+    return value;
+}
+function parseStringArrayField(raw, field, unitId, cipher) {
+    if (raw === null) {
+        throw new ChunkingError(`parsed_unit ${unitId} is missing required field ${field}`);
+    }
+    const parsed = JSON.parse(cipher.openText(raw));
+    if (!Array.isArray(parsed) || !parsed.every((entry) => typeof entry === "string")) {
+        throw new ChunkingError(`parsed_unit ${unitId} field ${field} did not deserialise to string[]`);
+    }
+    return parsed;
+}
+function rowToPageUnit(row, documentId) {
+    return {
+        kind: "page",
+        documentId,
+        pageNumber: expectNumber(row.page_number, "page_number", row.id),
+        ...(row.page_label !== null ? { pageLabel: row.page_label } : {}),
+        characterStart: expectNumber(row.character_start, "character_start", row.id),
+        characterEnd: expectNumber(row.character_end, "character_end", row.id),
+    };
+}
+function rowToSectionUnit(row, documentId, cipher) {
+    return {
+        kind: "section",
+        documentId,
+        sectionPath: parseStringArrayField(row.section_path_json, "section_path_json", row.id, cipher),
+        characterStart: expectNumber(row.character_start, "character_start", row.id),
+        characterEnd: expectNumber(row.character_end, "character_end", row.id),
+    };
+}
+function rowToJsonPathUnit(row, documentId) {
+    if (row.json_pointer === null) {
+        throw new ChunkingError(`parsed_unit ${row.id} missing json_pointer`);
+    }
+    return {
+        kind: "json-path",
+        documentId,
+        jsonPointer: row.json_pointer,
+        characterStart: expectNumber(row.character_start, "character_start", row.id),
+        characterEnd: expectNumber(row.character_end, "character_end", row.id),
+    };
+}
+function rowToCsvRowUnit(row, documentId) {
+    if (row.table_name === null) {
+        throw new ChunkingError(`parsed_unit ${row.id} missing table_name`);
+    }
+    return {
+        kind: "csv-row",
+        documentId,
+        tableName: row.table_name,
+        rowIndex: expectNumber(row.row_index, "row_index", row.id),
+        characterStart: expectNumber(row.character_start, "character_start", row.id),
+        characterEnd: expectNumber(row.character_end, "character_end", row.id),
+    };
+}
+function rowToHtmlBlockUnit(row, documentId, cipher) {
+    const heading = row.heading_path_json === null
+        ? undefined
+        : parseStringArrayField(row.heading_path_json, "heading_path_json", row.id, cipher);
+    return {
+        kind: "html-block",
+        documentId,
+        ...(heading !== undefined ? { headingPath: heading } : {}),
+        characterStart: expectNumber(row.character_start, "character_start", row.id),
+        characterEnd: expectNumber(row.character_end, "character_end", row.id),
+    };
+}
+export function rowToParsedUnit(row, documentId, cipher) {
+    switch (row.kind) {
+        case "page":
+            return rowToPageUnit(row, documentId);
+        case "section":
+            return rowToSectionUnit(row, documentId, cipher);
+        case "json-path":
+            return rowToJsonPathUnit(row, documentId);
+        case "csv-row":
+            return rowToCsvRowUnit(row, documentId);
+        case "html-block":
+            return rowToHtmlBlockUnit(row, documentId, cipher);
+        case "unsupported-media":
+            return {
+                kind: "unsupported-media",
+                documentId,
+                reason: row.unsupported_reason ?? "unknown",
+            };
+        default:
+            throw new ChunkingError(`parsed_unit ${row.id} has unknown kind ${row.kind}`);
+    }
+}
+// ─── Cancellation helper ─────────────────────────────────────────────────────
+function throwIfAborted(signal) {
+    if (signal?.aborted === true) {
+        throw new ChunkingError("chunkDocument aborted via AbortSignal");
+    }
+}
+// ─── ID composition ──────────────────────────────────────────────────────────
+// Chunk IDs are deterministic on (documentId, parsedUnitRowId, orderIndex). Using a
+// composite scheme — rather than UUIDs — keeps the chunks table re-runnable: a
+// re-chunk with force=true reproduces byte-identical row IDs, which makes the audit /
+// evidence-manifest layer's row-equality assertions hold across runs.
+export function composeChunkId(documentId, parsedUnitRowId, orderIndex) {
+    return `${String(documentId)}#${parsedUnitRowId}#c${String(orderIndex)}`;
+}
+function documentMaxChunks(options) {
+    return resolveChunkingOptions(options).maxChunks;
+}
+function optionsWithRemainingChunkBudget(options, remaining) {
+    return options === undefined ? { maxChunks: remaining } : { ...options, maxChunks: remaining };
+}
+function persistAllChunks(store, ctx, rows, options, signal) {
+    const db = store._internal.db;
+    const chunkIds = [];
+    const maxChunks = documentMaxChunks(options);
+    const strategyKey = chunkingStrategyKey(options);
+    let orderIndex = 0;
+    for (const row of rows) {
+        throwIfAborted(signal);
+        const remaining = maxChunks - chunkIds.length;
+        if (remaining <= 0) {
+            throw new ChunkingError(`chunkDocument exceeded maxChunks ${String(maxChunks)}`);
+        }
+        const unit = rowToParsedUnit(row, ctx.documentId, store._internal.contentCipher);
+        const chunks = chunkParsedUnit(unit, ctx.sourceText, optionsWithRemainingChunkBudget(options, remaining));
+        for (const chunk of chunks) {
+            const id = composeChunkId(ctx.documentId, row.id, orderIndex);
+            insertChunkRow(db, {
+                id,
+                capsuleId: ctx.capsuleId,
+                sourceId: ctx.sourceId,
+                documentId: ctx.documentId,
+                parsedUnitId: row.id,
+                orderIndex,
+                tokenCount: chunk.tokenCount,
+                safeExcerptHash: chunk.safeExcerptHash,
+                chunkingStrategyVersion: strategyKey,
+                characterStart: chunk.characterStart,
+                characterEnd: chunk.characterEnd,
+            });
+            chunkIds.push(id);
+            orderIndex += 1;
+        }
+    }
+    return chunkIds;
+}
+function loadChunkingPreflight(store, capsuleId, documentId, options) {
+    const db = store._internal.db;
+    const existingCount = countChunksForDocument(db, capsuleId, documentId);
+    return {
+        existingCount,
+        staleChunks: existingCount > 0 &&
+            hasStaleChunksForDocument(db, capsuleId, documentId, chunkingStrategyKey(options)),
+    };
+}
+function assertDocumentSourceMatches(store, capsuleId, documentId, sourceId) {
+    const documentSourceId = selectDocumentSourceId(store._internal.db, capsuleId, documentId);
+    if (documentSourceId !== undefined && String(documentSourceId) !== String(sourceId)) {
+        throw new ChunkingError(`chunkDocument sourceId ${String(sourceId)} does not match document ${String(documentId)} source ${String(documentSourceId)}`);
+    }
+}
+function shouldReuseExistingChunks(preflight, force) {
+    return preflight.existingCount > 0 && force !== true && !preflight.staleChunks;
+}
+function shouldDeleteExistingChunks(preflight, force) {
+    return (force === true || preflight.staleChunks) && preflight.existingCount > 0;
+}
+export function chunkDocument(store, params, options) {
+    const { capsuleId, sourceId, documentId, sourceText, force, signal } = params;
+    throwIfAborted(signal);
+    const db = store._internal.db;
+    const preflight = loadChunkingPreflight(store, capsuleId, documentId, options);
+    assertDocumentSourceMatches(store, capsuleId, documentId, sourceId);
+    if (shouldReuseExistingChunks(preflight, force)) {
+        return { capsuleId, documentId, chunkIds: [], skippedExisting: true };
+    }
+    const rows = selectParsedUnitsForDocument(db, capsuleId, documentId);
+    if (rows.length === 0) {
+        return { capsuleId, documentId, chunkIds: [], skippedExisting: false };
+    }
+    db.exec("BEGIN");
+    try {
+        if (shouldDeleteExistingChunks(preflight, force)) {
+            deleteChunksForDocument(db, capsuleId, documentId);
+        }
+        const ctx = { capsuleId, sourceId, documentId, sourceText };
+        const chunkIds = persistAllChunks(store, ctx, rows, options, signal);
+        throwIfAborted(signal);
+        db.exec("COMMIT");
+        return { capsuleId, documentId, chunkIds, skippedExisting: false };
+    }
+    catch (cause) {
+        db.exec("ROLLBACK");
+        if (cause instanceof ChunkingError)
+            throw cause;
+        throw new ChunkingError(`chunkDocument failed for document ${String(documentId)}`, cause === undefined ? undefined : { cause });
+    }
+}

package/dist/chunking/chunker.d.ts ADDED Viewed

@@ -0,0 +1,7 @@
+import type { ParsedUnit } from "@oscharko-dev/keiko-contracts";
+import type { ChunkingOptions, ChunkingResult, ResolvedChunkingOptions } from "./types.js";
+/**
+ * Validates the caller's chunking options and fills in defaults.
+ * @returns the resolved `maxTokens`, `minTokens`, `overlapTokens`, `maxChunks` and `tokenEstimator`.
+ * @throws ChunkingError when a numeric option is not finite or is below its minimum.
+ */
+export declare function resolveChunkingOptions(options: ChunkingOptions | undefined): ResolvedChunkingOptions;
+/**
+ * Returns the strategy key recorded with each chunk: the default key when `options` is
+ * `undefined`, otherwise a `|`-joined string built from the strategy version, the
+ * resolved numeric options and whether a custom token estimator was supplied.
+ */
+export declare function chunkingStrategyKey(options: ChunkingOptions | undefined): string;
+/**
+ * Returns a SHA-256 hex digest of `text` after NFKC normalisation and whitespace
+ * collapsing, or `undefined` when the normalised text is empty or contains no letter
+ * or digit.
+ */
+export declare function chunkDedupeKey(text: string): string | undefined;
+/**
+ * Slices one parsed unit's character span of `sourceText` into chunk results.
+ * Returns an empty array for units without a usable span.
+ * @throws ChunkingError when the unit needs more chunks than the resolved `maxChunks`.
+ */
+export declare function chunkParsedUnit(unit: ParsedUnit, sourceText: string, options?: ChunkingOptions): readonly ChunkingResult[];
+//# sourceMappingURL=chunker.d.ts.map

package/dist/chunking/chunker.d.ts.map ADDED Viewed

@@ -0,0 +1 @@

+ {"version":3,"file":"chunker.d.ts","sourceRoot":"","sources":["../../src/chunking/chunker.ts"],"names":[],"mappings":"AAuBA,OAAO,KAAK,EAAE,UAAU,EAAE,MAAM,+BAA+B,CAAC;AAGhE,OAAO,KAAK,EACV,eAAe,EACf,cAAc,EACd,uBAAuB,EAExB,MAAM,YAAY,CAAC;AAiCpB,wBAAgB,sBAAsB,CACpC,OAAO,EAAE,eAAe,GAAG,SAAS,GACnC,uBAAuB,CAgBzB;AAED,wBAAgB,mBAAmB,CAAC,OAAO,EAAE,eAAe,GAAG,SAAS,GAAG,MAAM,CAWhF;AAuBD,wBAAgB,cAAc,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,GAAG,SAAS,CAK/D;AAuDD,wBAAgB,eAAe,CAC7B,IAAI,EAAE,UAAU,EAChB,UAAU,EAAE,MAAM,EAClB,OAAO,CAAC,EAAE,eAAe,GACxB,SAAS,cAAc,EAAE,CAyB3B"}

package/dist/chunking/chunker.js ADDED Viewed

@@ -0,0 +1,139 @@
+// Pure chunking function (Epic #189, Issue #195).
+//
+// Given a parsed unit + the document's full source text + chunking options, produce one
+// or more `ChunkingResult` slices. The function is deliberately pure (no IO, no clock, no
+// hashing of external state) so it can be unit-tested without a SQLite store.
+//
+// Algorithm:
+//   1. Resolve the unit's character span. For unit kinds that carry `characterStart/end`
+//      (page/section/json-path/csv-row/html-block), slice the source text by those
+//      offsets. For unsupported-media units (no offsets), emit nothing — these units
+//      are tracked for diagnostics, not for retrieval.
+//   2. If the slice's estimated tokens <= maxTokens, emit a single chunk over the entire
+//      slice, even when it is shorter than minTokens — never drop content.
+//   3. Otherwise walk forward by `maxChars - overlapChars` per step, emitting chunks of
+//      length `maxChars`. The last chunk includes whatever trailing text remains, even
+//      if it is shorter than minTokens — never drop content.
+//   4. Hostile fallback: when no whitespace appears inside the maxChars window (single
+//      very long line), the algorithm still produces chunks because slicing is purely
+//      character-bounded. Token boundaries become advisory, not authoritative; that
+//      tradeoff is intentional and documented.
+import { createHash } from "node:crypto";
+import { charsForTokenBudget } from "./token-estimator.js";
+import { ChunkingError, DEFAULT_CHUNKING_STRATEGY_KEY, DEFAULT_MAX_CHUNKS, DEFAULT_MAX_TOKENS, DEFAULT_MIN_TOKENS, DEFAULT_OVERLAP_TOKENS, MAX_CHUNK_TOKENS, MAX_OVERLAP_TOKENS, CHUNKING_STRATEGY_VERSION, } from "./types.js";
+import { defaultTokenEstimator } from "./token-estimator.js";
+const WHITESPACE_PATTERN = /\s+/gu;
+const INFORMATIVE_CHARACTER_PATTERN = /[\p{L}\p{N}]/u;
+function positiveInteger(raw, fallback, field) {
+    const value = raw ?? fallback;
+    if (!Number.isFinite(value) || value < 1) {
+        throw new ChunkingError(`${field} must be a positive finite integer`);
+    }
+    return Math.floor(value);
+}
+function nonNegativeInteger(raw, fallback, field) {
+    const value = raw ?? fallback;
+    if (!Number.isFinite(value) || value < 0) {
+        throw new ChunkingError(`${field} must be a non-negative finite integer`);
+    }
+    return Math.floor(value);
+}
+export function resolveChunkingOptions(options) {
+    const maxTokens = Math.min(positiveInteger(options?.maxTokens, DEFAULT_MAX_TOKENS, "maxTokens"), MAX_CHUNK_TOKENS);
+    const minTokens = nonNegativeInteger(options?.minTokens, DEFAULT_MIN_TOKENS, "minTokens");
+    const overlapTokens = Math.min(nonNegativeInteger(options?.overlapTokens, DEFAULT_OVERLAP_TOKENS, "overlapTokens"), MAX_OVERLAP_TOKENS);
+    const maxChunks = Math.min(positiveInteger(options?.maxChunks, DEFAULT_MAX_CHUNKS, "maxChunks"), DEFAULT_MAX_CHUNKS);
+    const tokenEstimator = options?.tokenEstimator ?? defaultTokenEstimator;
+    return { maxTokens, minTokens, overlapTokens, maxChunks, tokenEstimator };
+}
+export function chunkingStrategyKey(options) {
+    if (options === undefined)
+        return DEFAULT_CHUNKING_STRATEGY_KEY;
+    const resolved = resolveChunkingOptions(options);
+    return [
+        CHUNKING_STRATEGY_VERSION,
+        `max=${String(resolved.maxTokens)}`,
+        `min=${String(resolved.minTokens)}`,
+        `overlap=${String(resolved.overlapTokens)}`,
+        `limit=${String(resolved.maxChunks)}`,
+        options.tokenEstimator === undefined ? "estimator=default" : "estimator=custom",
+    ].join("|");
+}
+function spanForUnit(unit, sourceLength) {
+    if (unit.kind === "unsupported-media")
+        return undefined;
+    const start = Math.max(0, Math.min(unit.characterStart, sourceLength));
+    const end = Math.max(start, Math.min(unit.characterEnd, sourceLength));
+    if (end <= start)
+        return undefined;
+    return { start, end };
+}
+function hashExcerpt(text) {
+    return createHash("sha256").update(text, "utf8").digest("hex");
+}
+function normaliseChunkText(text) {
+    return text.normalize("NFKC").replace(WHITESPACE_PATTERN, " ").trim();
+}
+export function chunkDedupeKey(text) {
+    const normalised = normaliseChunkText(text);
+    if (normalised.length === 0)
+        return undefined;
+    if (!INFORMATIVE_CHARACTER_PATTERN.test(normalised))
+        return undefined;
+    return hashExcerpt(normalised);
+}
+function buildChunk(sourceText, start, end, estimator) {
+    const excerpt = sourceText.slice(start, end);
+    if (chunkDedupeKey(excerpt) === undefined)
+        return undefined;
+    return {
+        characterStart: start,
+        characterEnd: end,
+        tokenCount: estimator(excerpt),
+        safeExcerptHash: hashExcerpt(excerpt),
+    };
+}
+function computeStepSizes(resolved) {
+    const maxChars = Math.max(1, charsForTokenBudget(resolved.maxTokens));
+    // Clamp overlap to [0, maxChars-1] so stride is always at least 1 — otherwise an
+    // overlap >= maxChars would produce an infinite loop.
+    const overlapChars = Math.max(0, Math.min(charsForTokenBudget(resolved.overlapTokens), maxChars - 1));
+    const stride = maxChars - overlapChars;
+    return { maxChars, overlapChars, stride };
+}
+function shouldEmitSingleChunk(excerpt, resolved) {
+    // The unit fits in one chunk when its estimated token count does not exceed maxTokens.
+    // The `minTokens` lower bound is a *floor* on chunk size, not a gate — a tiny unit still
+    // produces one chunk so we never drop content (spec edge case: "Single tiny unit").
+    return resolved.tokenEstimator(excerpt) <= resolved.maxTokens;
+}
+function pushChunk(chunks, chunk, maxChunks) {
+    if (chunk === undefined)
+        return;
+    if (chunks.length >= maxChunks) {
+        throw new ChunkingError(`chunkParsedUnit exceeded maxChunks ${String(maxChunks)}`);
+    }
+    chunks.push(chunk);
+}
+export function chunkParsedUnit(unit, sourceText, options) {
+    const resolved = resolveChunkingOptions(options);
+    const span = spanForUnit(unit, sourceText.length);
+    if (span === undefined)
+        return [];
+    const excerpt = sourceText.slice(span.start, span.end);
+    if (shouldEmitSingleChunk(excerpt, resolved)) {
+        const chunk = buildChunk(sourceText, span.start, span.end, resolved.tokenEstimator);
+        return chunk === undefined ? [] : [chunk];
+    }
+    const { maxChars, stride } = computeStepSizes(resolved);
+    const chunks = [];
+    let cursor = span.start;
+    while (cursor < span.end) {
+        const end = Math.min(cursor + maxChars, span.end);
+        pushChunk(chunks, buildChunk(sourceText, cursor, end, resolved.tokenEstimator), resolved.maxChunks);
+        if (end >= span.end)
+            break;
+        cursor += stride;
+    }
+    return chunks;
+}

package/dist/chunking/citation-mapper.d.ts ADDED Viewed

@@ -0,0 +1,4 @@
+import type { CapsuleSetId, ChunkId, CitationReference, KnowledgeCapsuleId } from "@oscharko-dev/keiko-contracts";
+import type { KnowledgeStore } from "../store.js";
+/**
+ * Resolves a chunk inside one capsule to a citation reference.
+ *
+ * @param store - Knowledge store to read from.
+ * @param capsuleId - Capsule that scopes the lookup.
+ * @param chunkId - Chunk to cite.
+ * @param _capsuleSetId - Optional. NOTE(review): the underscore suggests it is currently unused — confirm against the implementation.
+ * @returns The citation, or `null` when one cannot be produced.
+ */
+export declare function mapChunkToCitation(store: KnowledgeStore, capsuleId: KnowledgeCapsuleId, chunkId: ChunkId, _capsuleSetId?: CapsuleSetId): CitationReference | null;
+//# sourceMappingURL=citation-mapper.d.ts.map

package/dist/chunking/citation-mapper.d.ts.map ADDED Viewed

@@ -0,0 +1 @@

+ {"version":3,"file":"citation-mapper.d.ts","sourceRoot":"","sources":["../../src/chunking/citation-mapper.ts"],"names":[],"mappings":"AAuBA,OAAO,KAAK,EACV,YAAY,EACZ,OAAO,EACP,iBAAiB,EAEjB,kBAAkB,EAEnB,MAAM,+BAA+B,CAAC;AAGvC,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAiPlD,wBAAgB,kBAAkB,CAChC,KAAK,EAAE,cAAc,EACrB,SAAS,EAAE,kBAAkB,EAC7B,OAAO,EAAE,OAAO,EAChB,aAAa,CAAC,EAAE,YAAY,GAC3B,iBAAiB,GAAG,IAAI,CAc1B"}

package/dist/chunking/citation-mapper.js ADDED Viewed

@@ -0,0 +1,180 @@
+// Citation hop (Epic #189, Issue #195).
+//
+// Given a (capsuleId, chunkId), produce a `CitationReference` by walking
+// chunk → parsed_unit → document → page/section. The function is read-only and pure
+// with respect to the store: it never mutates rows.
+//
+// Hop strategy:
+//   1. Look up the chunk row. Returns null when the chunk is absent — distinct from
+//      throwing, because retrieval callers (#199) treat missing-chunk as "stale index
+//      pointer" and recover by re-running chunking, not by surfacing an error.
+//   2. Look up its parsed_unit row in the same capsule scope.
+//   3. From the parsed_unit's kind, hop:
+//        - kind=page: copy pageNumber/pageLabel + characterStart/End directly.
+//        - kind=section: copy sectionPath + characterStart/End directly.
+//        - kind=html-block: copy headingPath (surfaced as sectionPath) + characterStart/End.
+//        - kind=json-path: copy jsonPointer + characterStart/End.
+//        - kind=csv-row: copy tableName/rowIndex + characterStart/End.
+//        - other kinds (unsupported-media): characterStart/End only.
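+//      THEN: the chunk's own characterStart/End (when present) replace the unit's span,
+//      and if no page number has been set and that span is fully contained in a
+//      persisted `pages` row, attach the lowest-numbered such page's
+//      pageNumber/pageLabel — section units inside a paged document still surface a
+//      citation page number.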
+//   4. Document row provides safeDisplayName + sourceId.
+// Row lookups for the citation hop. Each statement filters on capsule (:c) and row id
+// (:id) through named parameters.
+// Chunk row: owning source / document / parsed-unit ids plus the chunk's character span.
+const SELECT_CHUNK_SQL = "SELECT id, capsule_id, source_id, document_id, parsed_unit_id, character_start, character_end FROM chunks WHERE capsule_id = :c AND id = :id";
+// Parsed-unit row: `kind` plus the kind-specific locator columns and the unit's character
+// span. The fragments below are joined with single spaces into one statement.
+const SELECT_PARSED_UNIT_SQL = [
+    "SELECT kind, page_number, page_label, section_path_json,",
+    "  heading_path_json, json_pointer, table_name, row_index, character_start, character_end",
+    "FROM parsed_units",
+    "WHERE capsule_id = :c AND id = :id",
+].join(" ");
+// Document row: owning source id and the safe display name copied into the citation.
+const SELECT_DOCUMENT_SQL = "SELECT source_id, safe_display_name FROM documents WHERE capsule_id = :c AND id = :id";
+// Page-hop query: find a page row whose [character_start, character_end] range fully
+// contains the requested span (:s..:e) within one document (:d) of the capsule (:c).
+// A page that only partially overlaps the span does not match.
+// Used to attach a page number to non-page units (e.g. sections / html-blocks inside a
+// paged document). Ordered by page_number ascending with LIMIT 1 — citations point at
+// the lowest-numbered containing page.
+const SELECT_PAGE_FOR_RANGE_SQL = [
+    "SELECT page_number, page_label FROM pages",
+    "WHERE capsule_id = :c AND document_id = :d",
+    "  AND character_start <= :s AND character_end >= :e",
+    "ORDER BY page_number ASC LIMIT 1",
+].join(" ");
+// Looks up one chunk row by capsule and chunk id (the id is stringified before binding).
+// Returns whatever the prepared statement's `get` yields; callers treat `undefined` as
+// "no such chunk".
+function fetchChunkRow(db, capsuleId, chunkId) {
+    const row = db.prepare(SELECT_CHUNK_SQL).get({ c: capsuleId, id: String(chunkId) });
+    // Both branches return `row` unchanged — presumably residue of a type cast in the
+    // TypeScript source.
+    return row === undefined ? undefined : row;
+}
+// Looks up one parsed-unit row by capsule and parsed-unit id (the id is bound as given,
+// without stringification). Callers treat an `undefined` result as "no such unit".
+function fetchParsedUnitRow(db, capsuleId, parsedUnitId) {
+    const row = db.prepare(SELECT_PARSED_UNIT_SQL).get({ c: capsuleId, id: parsedUnitId });
+    // Both branches return `row` unchanged — presumably residue of a type cast in the
+    // TypeScript source.
+    return row === undefined ? undefined : row;
+}
+// Looks up one document row by capsule and document id (the id is stringified before
+// binding). Callers treat an `undefined` result as "no such document".
+function fetchDocumentRow(db, capsuleId, documentId) {
+    const row = db.prepare(SELECT_DOCUMENT_SQL).get({ c: capsuleId, id: String(documentId) });
+    // Both branches return `row` unchanged — presumably residue of a type cast in the
+    // TypeScript source.
+    return row === undefined ? undefined : row;
+}
+// Finds the lowest-numbered page of `documentId` whose character range fully contains
+// [characterStart, characterEnd] (see SELECT_PAGE_FOR_RANGE_SQL). The document id is
+// stringified before binding. Callers treat an `undefined` result as "no containing page".
+function fetchPageForRange(db, capsuleId, documentId, characterStart, characterEnd) {
+    const row = db
+        .prepare(SELECT_PAGE_FOR_RANGE_SQL)
+        .get({ c: capsuleId, d: String(documentId), s: characterStart, e: characterEnd });
+    // Both branches return `row` unchanged — presumably residue of a type cast in the
+    // TypeScript source.
+    return row === undefined ? undefined : row;
+}
+// Decodes a stored JSON column into an array of strings.
+//   raw    - stored column value; `null` means "no value".
+//   cipher - object whose `openText(raw)` returns the JSON text to parse.
+// Returns the parsed array when it is an array containing only strings, otherwise
+// `undefined` (also for a `null` column).
+// NOTE(review): `JSON.parse` and `cipher.openText` are not guarded here, so malformed
+// JSON or an `openText` failure propagates as an exception rather than yielding
+// `undefined` — confirm that is intended for the citation hop.
+function parseStringArray(raw, cipher) {
+    if (raw === null)
+        return undefined;
+    const parsed = JSON.parse(cipher.openText(raw));
+    // Reject any payload that is not strictly string[].
+    if (!Array.isArray(parsed) || !parsed.every((entry) => typeof entry === "string")) {
+        return undefined;
+    }
+    return parsed;
+}
+// Builds the starting hop record for a parsed unit: every locator field is `undefined`
+// and only the character span is copied from the unit row. A null/undefined
+// `character_start` / `character_end` becomes `undefined`.
+function baseHopFields(unit) {
+    return {
+        pageNumber: undefined,
+        pageLabel: undefined,
+        sectionPath: undefined,
+        jsonPointer: undefined,
+        tableName: undefined,
+        rowIndex: undefined,
+        characterStart: unit.character_start ?? undefined,
+        characterEnd: unit.character_end ?? undefined,
+    };
+}
+// Dispatch table from parsed-unit `kind` to a builder `(unit, base, cipher) => hop`.
+// Each builder spreads the base hop record and fills in the locator fields for its kind;
+// null column values are normalised to `undefined`. Kinds without an entry are handled
+// by the caller.
+const HOP_FIELDS_BY_KIND = new Map([
+    [
+        "page",
+        (unit, base) => ({
+            ...base,
+            pageNumber: unit.page_number ?? undefined,
+            pageLabel: unit.page_label ?? undefined,
+        }),
+    ],
+    [
+        "section",
+        (unit, base, cipher) => ({
+            ...base,
+            sectionPath: parseStringArray(unit.section_path_json, cipher),
+        }),
+    ],
+    [
+        // html-block heading paths are surfaced through the same `sectionPath` field as
+        // section units.
+        "html-block",
+        (unit, base, cipher) => ({
+            ...base,
+            sectionPath: parseStringArray(unit.heading_path_json, cipher),
+        }),
+    ],
+    [
+        "json-path",
+        (unit, base) => ({
+            ...base,
+            jsonPointer: unit.json_pointer ?? undefined,
+        }),
+    ],
+    [
+        "csv-row",
+        (unit, base) => ({
+            ...base,
+            tableName: unit.table_name ?? undefined,
+            rowIndex: unit.row_index ?? undefined,
+        }),
+    ],
+]);
+// Produces the hop record for a parsed unit by dispatching on `unit.kind`.
+// When no builder is registered for the kind, the base record (character span only) is
+// returned.
+function hopFieldsForUnit(unit, cipher) {
+    const base = baseHopFields(unit);
+    // Optional call: an unknown kind yields undefined, which `??` replaces with `base`.
+    return HOP_FIELDS_BY_KIND.get(unit.kind)?.(unit, base, cipher) ?? base;
+}
+// Returns a copy of `hop` whose character span is taken from the chunk row. The chunk's
+// `character_start` / `character_end` win whenever they are not null/undefined;
+// otherwise the span already on `hop` is kept. `hop` itself is not mutated.
+function applyChunkSpan(hop, chunk) {
+    return {
+        ...hop,
+        characterStart: chunk.character_start ?? hop.characterStart,
+        characterEnd: chunk.character_end ?? hop.characterEnd,
+    };
+}
+// Adds page information to a hop record that does not have any yet.
+// Returns `hop` unchanged when it already carries a page number, when its character
+// span is incomplete, or when no page row contains the span. Otherwise returns a copy
+// with `pageNumber` / `pageLabel` taken from the page found by `fetchPageForRange`.
+function attachPageHop(db, capsuleId, documentId, hop) {
+    // A page number that is already set is kept as is.
+    if (hop.pageNumber !== undefined)
+        return hop;
+    // Without both ends of the span there is nothing to match pages against.
+    if (hop.characterStart === undefined || hop.characterEnd === undefined)
+        return hop;
+    const page = fetchPageForRange(db, capsuleId, documentId, hop.characterStart, hop.characterEnd);
+    if (page === undefined)
+        return hop;
+    return {
+        ...hop,
+        pageNumber: page.page_number,
+        pageLabel: page.page_label ?? undefined,
+    };
+}
+// Builds an `exactOptionalPropertyTypes`-friendly CitationReference: optional fields are
+// only present when defined. Spreading conditional objects keeps tsc happy under that
+// strict option.
+//   chunk     - chunk row; supplies `document_id`.
+//   document  - document row; supplies `source_id` and `safe_display_name`.
+//   hop       - locator fields gathered for the chunk; `undefined` entries are omitted.
+//   chunkId / capsuleId - copied through from the caller unchanged.
+function buildCitation(chunk, document, hop, chunkId, capsuleId) {
+    return {
+        chunkId,
+        capsuleId,
+        sourceId: document.source_id,
+        documentId: chunk.document_id,
+        safeDisplayName: document.safe_display_name,
+        // Each spread contributes its key only when the hop value is defined, so the
+        // result never carries an explicit `undefined` property.
+        ...(hop.pageNumber !== undefined ? { pageNumber: hop.pageNumber } : {}),
+        ...(hop.pageLabel !== undefined ? { pageLabel: hop.pageLabel } : {}),
+        ...(hop.sectionPath !== undefined ? { sectionPath: hop.sectionPath } : {}),
+        ...(hop.jsonPointer !== undefined ? { jsonPointer: hop.jsonPointer } : {}),
+        ...(hop.tableName !== undefined ? { tableName: hop.tableName } : {}),
+        ...(hop.rowIndex !== undefined ? { rowIndex: hop.rowIndex } : {}),
+        ...(hop.characterStart !== undefined ? { characterStart: hop.characterStart } : {}),
+        ...(hop.characterEnd !== undefined ? { characterEnd: hop.characterEnd } : {}),
+    };
+}
+// Maps a (capsuleId, chunkId) pair to a citation by reading the chunk row, its parsed
+// unit and its document, all within the same capsule.
+// Returns null when the chunk, its parsed unit, or its document row is missing;
+// otherwise returns the citation built by `buildCitation`. Only SELECT statements are
+// issued against the store.
+//
+// `_capsuleSetId` is reserved for the future capsule-set-scoped lookup that retrieval
+// (#199) will need — for now the citation hop is strictly capsule-scoped so we keep
+// the API stable but ignore the parameter. The signature is exported via the barrel.
+export function mapChunkToCitation(store, capsuleId, chunkId, _capsuleSetId) {
+    const db = store._internal.db;
+    const chunk = fetchChunkRow(db, capsuleId, chunkId);
+    if (chunk === undefined)
+        return null;
+    const unit = fetchParsedUnitRow(db, capsuleId, chunk.parsed_unit_id);
+    if (unit === undefined)
+        return null;
+    const document = fetchDocumentRow(db, capsuleId, chunk.document_id);
+    if (document === undefined)
+        return null;
+    // Kind-specific locator fields first, then the chunk's own span takes precedence over
+    // the unit's span.
+    const baseHop = applyChunkSpan(hopFieldsForUnit(unit, store._internal.contentCipher), chunk);
+    // Page lookup therefore runs against the chunk span (or the unit span as fallback).
+    const hop = attachPageHop(db, capsuleId, chunk.document_id, baseHop);
+    return buildCitation(chunk, document, hop, chunkId, capsuleId);
+}

package/dist/chunking/index.d.ts ADDED Viewed

@@ -0,0 +1,6 @@
+export { chunkDedupeKey, chunkParsedUnit, chunkingStrategyKey, resolveChunkingOptions, } from "./chunker.js";
+export { chunkDocument } from "./chunker-runner.js";
+export { mapChunkToCitation } from "./citation-mapper.js";
+export { defaultTokenEstimator, charsForTokenBudget } from "./token-estimator.js";
+export { CHUNKING_STRATEGY_VERSION, ChunkingError, DEFAULT_CHUNKING_STRATEGY_KEY, DEFAULT_MAX_CHUNKS, DEFAULT_MAX_TOKENS, DEFAULT_MIN_TOKENS, DEFAULT_OVERLAP_TOKENS, MAX_CHUNK_TOKENS, MAX_OVERLAP_TOKENS, type ChunkDocumentParams, type ChunkDocumentResult, type ChunkingOptions, type ChunkingResult, type ResolvedChunkingOptions, type TokenEstimator, } from "./types.js";
+//# sourceMappingURL=index.d.ts.map

package/dist/chunking/index.d.ts.map ADDED Viewed

@@ -0,0 +1 @@

+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/chunking/index.ts"],"names":[],"mappings":"AAIA,OAAO,EACL,cAAc,EACd,eAAe,EACf,mBAAmB,EACnB,sBAAsB,GACvB,MAAM,cAAc,CAAC;AACtB,OAAO,EAAE,aAAa,EAAE,MAAM,qBAAqB,CAAC;AACpD,OAAO,EAAE,kBAAkB,EAAE,MAAM,sBAAsB,CAAC;AAC1D,OAAO,EAAE,qBAAqB,EAAE,mBAAmB,EAAE,MAAM,sBAAsB,CAAC;AAClF,OAAO,EACL,yBAAyB,EACzB,aAAa,EACb,6BAA6B,EAC7B,kBAAkB,EAClB,kBAAkB,EAClB,kBAAkB,EAClB,sBAAsB,EACtB,gBAAgB,EAChB,kBAAkB,EAClB,KAAK,mBAAmB,EACxB,KAAK,mBAAmB,EACxB,KAAK,eAAe,EACpB,KAAK,cAAc,EACnB,KAAK,uBAAuB,EAC5B,KAAK,cAAc,GACpB,MAAM,YAAY,CAAC"}

package/dist/chunking/index.js ADDED Viewed

@@ -0,0 +1,8 @@
+// Barrel for the chunking layer (Epic #189, Issue #195). Composed by the package barrel
+// in ../index.ts; consumers outside the package never import from this subdirectory
+// directly (ADR-0019 direction rule 3e + the trust-8 test-support naming convention).
+export { chunkDedupeKey, chunkParsedUnit, chunkingStrategyKey, resolveChunkingOptions, } from "./chunker.js";
+export { chunkDocument } from "./chunker-runner.js";
+export { mapChunkToCitation } from "./citation-mapper.js";
+export { defaultTokenEstimator, charsForTokenBudget } from "./token-estimator.js";
+export { CHUNKING_STRATEGY_VERSION, ChunkingError, DEFAULT_CHUNKING_STRATEGY_KEY, DEFAULT_MAX_CHUNKS, DEFAULT_MAX_TOKENS, DEFAULT_MIN_TOKENS, DEFAULT_OVERLAP_TOKENS, MAX_CHUNK_TOKENS, MAX_OVERLAP_TOKENS, } from "./types.js";

package/dist/chunking/token-estimator.d.ts ADDED Viewed

@@ -0,0 +1,3 @@
+/**
+ * Heuristic token estimate for `text`, used when no tokenizer is supplied.
+ *
+ * @param text - Text to measure.
+ * @returns Estimated number of tokens.
+ */
+export declare function defaultTokenEstimator(text: string): number;
+/**
+ * Converts a token budget into a character budget.
+ *
+ * @param tokenBudget - Budget expressed in tokens.
+ * @returns Budget expressed in characters.
+ */
+export declare function charsForTokenBudget(tokenBudget: number): number;
+//# sourceMappingURL=token-estimator.d.ts.map

package/dist/chunking/token-estimator.d.ts.map ADDED Viewed

@@ -0,0 +1 @@

+ {"version":3,"file":"token-estimator.d.ts","sourceRoot":"","sources":["../../src/chunking/token-estimator.ts"],"names":[],"mappings":"AAgBA,wBAAgB,qBAAqB,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAG1D;AAID,wBAAgB,mBAAmB,CAAC,WAAW,EAAE,MAAM,GAAG,MAAM,CAG/D"}

package/dist/chunking/token-estimator.js ADDED Viewed

@@ -0,0 +1,26 @@
+// Deterministic fallback token estimator (Epic #189, Issue #195).
+//
+// LIMITATION: this is a crude ~4-chars-per-token heuristic that mirrors what the OpenAI
+// cookbook documents as a rough rule of thumb for English text using cl100k_base. It is
+// NOT a real tokenizer — it over-estimates for CJK / code, under-estimates for languages
+// with long words, and ignores subword boundaries entirely. The point of the seam is so a
+// downstream consumer (#196 indexing orchestrator, #199 retrieval) can inject a real
+// tokenizer (e.g. `js-tiktoken`) without forcing this package to ship the dependency.
+//
+// Why not zero or one-char-per-token? Zero would let `maxTokens` produce empty chunks;
+// one-char-per-token would force absurdly small chunks (every page splits into 400-char
+// fragments). Four matches the order-of-magnitude expectation that callers seeing a
+// `maxTokens: 400` chunk get a ~1.5 KB excerpt rather than a 400-byte one.
+// Characters assumed per token by the heuristic in this module.
+const CHARS_PER_TOKEN = 4;
+// Estimates the token count of `text` as its length divided by CHARS_PER_TOKEN, rounded
+// up. Empty text is 0 tokens; any non-empty text is at least 1 token. Length is counted
+// in UTF-16 code units (`String.prototype.length`).
+export function defaultTokenEstimator(text) {
+    if (text.length === 0)
+        return 0;
+    return Math.ceil(text.length / CHARS_PER_TOKEN);
+}
+// Inverse helper used by the chunker to translate a token budget into a character budget.
+// Kept here so a future tokenizer swap can override it consistently with the estimator.
+// Returns `tokenBudget * CHARS_PER_TOKEN`, or 0 for a zero or negative budget. Text of
+// exactly that length is estimated by `defaultTokenEstimator` at `tokenBudget` tokens for
+// whole-number budgets.
+export function charsForTokenBudget(tokenBudget) {
+    if (tokenBudget <= 0)
+        return 0;
+    return tokenBudget * CHARS_PER_TOKEN;
+}

package/dist/chunking/types.d.ts ADDED Viewed

@@ -0,0 +1,49 @@
+import type { ChunkId, DocumentId, KnowledgeCapsuleId, KnowledgeSourceId } from "@oscharko-dev/keiko-contracts";
+import { KnowledgeStoreError } from "../errors.js";
+/** Function that maps a piece of text to a token count. */
+export type TokenEstimator = (text: string) => number;
+/**
+ * Caller-supplied chunking options; every field is optional.
+ *
+ * NOTE(review): omitted fields presumably fall back to the `DEFAULT_*` constants declared
+ * in this file — confirm against `resolveChunkingOptions`.
+ */
+export interface ChunkingOptions {
+    /** Token budget per chunk. */
+    readonly maxTokens?: number;
+    /** Lower bound on chunk size in tokens. */
+    readonly minTokens?: number;
+    /** Tokens shared between consecutive chunks. */
+    readonly overlapTokens?: number;
+    /** Cap on the number of chunks produced. */
+    readonly maxChunks?: number;
+    /** Custom token estimator. */
+    readonly tokenEstimator?: TokenEstimator;
+}
+// Chunking defaults and limits.
+// NOTE(review): the DEFAULT_* values presumably back the matching optional fields of
+// ChunkingOptions, and the MAX_* values presumably bound what callers may request —
+// confirm against the option-resolution code, which is not part of this declaration file.
+export declare const DEFAULT_MAX_TOKENS = 400;
+export declare const DEFAULT_MIN_TOKENS = 64;
+export declare const DEFAULT_OVERLAP_TOKENS = 32;
+export declare const DEFAULT_MAX_CHUNKS = 50000;
+export declare const MAX_CHUNK_TOKENS = 2048;
+export declare const MAX_OVERLAP_TOKENS = 1024;
+/** Version tag of the chunking strategy; it is the leading segment of the strategy key below. */
+export declare const CHUNKING_STRATEGY_VERSION: "issue-195-v2";
+/** Strategy key for the default options: version, max/min/overlap/limit values, and the `default` estimator marker, separated by `|`. */
+export declare const DEFAULT_CHUNKING_STRATEGY_KEY: `issue-195-v2|max=${string}|min=${string}|overlap=${string}|limit=${string}|estimator=default`;
+/**
+ * Chunking options with every field present — the same fields as `ChunkingOptions`, none
+ * of them optional.
+ */
+export interface ResolvedChunkingOptions {
+    readonly maxTokens: number;
+    readonly minTokens: number;
+    readonly overlapTokens: number;
+    readonly maxChunks: number;
+    readonly tokenEstimator: TokenEstimator;
+}
+/** Input for chunking one document. */
+export interface ChunkDocumentParams {
+    /** Capsule the document belongs to. */
+    readonly capsuleId: KnowledgeCapsuleId;
+    /** Source the document belongs to. */
+    readonly sourceId: KnowledgeSourceId;
+    /** Document to chunk. */
+    readonly documentId: DocumentId;
+    /** Text of the document. */
+    readonly sourceText: string;
+    /** NOTE(review): presumably forces re-chunking even when chunks already exist — confirm. */
+    readonly force?: boolean;
+    /** NOTE(review): presumably allows the caller to abort the run — confirm. */
+    readonly signal?: AbortSignal;
+}
+/** Outcome of chunking one document. */
+export interface ChunkDocumentResult {
+    readonly capsuleId: KnowledgeCapsuleId;
+    readonly documentId: DocumentId;
+    /** Ids of the document's chunks. */
+    readonly chunkIds: readonly ChunkId[];
+    /** NOTE(review): presumably true when existing chunks were reused instead of recomputed — confirm. */
+    readonly skippedExisting: boolean;
+}
+/** Error type of the chunking layer; a subclass of `KnowledgeStoreError`. */
+export declare class ChunkingError extends KnowledgeStoreError {
+    readonly name: string;
+}
+/**
+ * One chunk described by its character span, token count and excerpt hash.
+ *
+ * NOTE(review): presumably the element type returned by `chunkParsedUnit` — confirm.
+ */
+export interface ChunkingResult {
+    /** Start offset of the chunk. NOTE(review): presumably an index into the source text — confirm. */
+    readonly characterStart: number;
+    /** End offset of the chunk. NOTE(review): presumably exclusive — confirm. */
+    readonly characterEnd: number;
+    /** Token count of the chunk. */
+    readonly tokenCount: number;
+    /** Hash of the chunk's excerpt. NOTE(review): hashing scheme not visible here. */
+    readonly safeExcerptHash: string;
+}
+//# sourceMappingURL=types.d.ts.map