@oscharko-dev/keiko-local-knowledge 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/.tsbuildinfo +1 -0
- package/dist/bounded-document-extraction.d.ts +27 -0
- package/dist/bounded-document-extraction.d.ts.map +1 -0
- package/dist/bounded-document-extraction.js +214 -0
- package/dist/capsule-lifecycle.d.ts +33 -0
- package/dist/capsule-lifecycle.d.ts.map +1 -0
- package/dist/capsule-lifecycle.js +292 -0
- package/dist/capsule-set-lifecycle.d.ts +15 -0
- package/dist/capsule-set-lifecycle.d.ts.map +1 -0
- package/dist/capsule-set-lifecycle.js +158 -0
- package/dist/chunking/chunker-persist.d.ts +36 -0
- package/dist/chunking/chunker-persist.d.ts.map +1 -0
- package/dist/chunking/chunker-persist.js +74 -0
- package/dist/chunking/chunker-runner.d.ts +9 -0
- package/dist/chunking/chunker-runner.d.ts.map +1 -0
- package/dist/chunking/chunker-runner.js +218 -0
- package/dist/chunking/chunker.d.ts +7 -0
- package/dist/chunking/chunker.d.ts.map +1 -0
- package/dist/chunking/chunker.js +139 -0
- package/dist/chunking/citation-mapper.d.ts +4 -0
- package/dist/chunking/citation-mapper.d.ts.map +1 -0
- package/dist/chunking/citation-mapper.js +180 -0
- package/dist/chunking/index.d.ts +6 -0
- package/dist/chunking/index.d.ts.map +1 -0
- package/dist/chunking/index.js +8 -0
- package/dist/chunking/token-estimator.d.ts +3 -0
- package/dist/chunking/token-estimator.d.ts.map +1 -0
- package/dist/chunking/token-estimator.js +26 -0
- package/dist/chunking/types.d.ts +49 -0
- package/dist/chunking/types.d.ts.map +1 -0
- package/dist/chunking/types.js +26 -0
- package/dist/composition.d.ts +57 -0
- package/dist/composition.d.ts.map +1 -0
- package/dist/composition.js +310 -0
- package/dist/conversation/citation-attacher.d.ts +8 -0
- package/dist/conversation/citation-attacher.d.ts.map +1 -0
- package/dist/conversation/citation-attacher.js +55 -0
- package/dist/conversation/citation-excerpts.d.ts +4 -0
- package/dist/conversation/citation-excerpts.d.ts.map +1 -0
- package/dist/conversation/citation-excerpts.js +41 -0
- package/dist/conversation/grounded-answer-runner.d.ts +9 -0
- package/dist/conversation/grounded-answer-runner.d.ts.map +1 -0
- package/dist/conversation/grounded-answer-runner.js +61 -0
- package/dist/conversation/index.d.ts +5 -0
- package/dist/conversation/index.d.ts.map +1 -0
- package/dist/conversation/index.js +7 -0
- package/dist/conversation/model-gateway-answer-generator.d.ts +28 -0
- package/dist/conversation/model-gateway-answer-generator.d.ts.map +1 -0
- package/dist/conversation/model-gateway-answer-generator.js +105 -0
- package/dist/conversation/types.d.ts +35 -0
- package/dist/conversation/types.d.ts.map +1 -0
- package/dist/conversation/types.js +24 -0
- package/dist/discovery/discovery-runner.d.ts +23 -0
- package/dist/discovery/discovery-runner.d.ts.map +1 -0
- package/dist/discovery/discovery-runner.js +109 -0
- package/dist/discovery/extract-progressive.d.ts +17 -0
- package/dist/discovery/extract-progressive.d.ts.map +1 -0
- package/dist/discovery/extract-progressive.js +522 -0
- package/dist/discovery/extract.d.ts +26 -0
- package/dist/discovery/extract.d.ts.map +1 -0
- package/dist/discovery/extract.js +906 -0
- package/dist/discovery/glob.d.ts +10 -0
- package/dist/discovery/glob.d.ts.map +1 -0
- package/dist/discovery/glob.js +72 -0
- package/dist/discovery/index.d.ts +6 -0
- package/dist/discovery/index.d.ts.map +1 -0
- package/dist/discovery/index.js +8 -0
- package/dist/discovery/media-type.d.ts +4 -0
- package/dist/discovery/media-type.d.ts.map +1 -0
- package/dist/discovery/media-type.js +62 -0
- package/dist/discovery/persist.d.ts +63 -0
- package/dist/discovery/persist.d.ts.map +1 -0
- package/dist/discovery/persist.js +345 -0
- package/dist/discovery/test-support.d.ts +16 -0
- package/dist/discovery/test-support.d.ts.map +1 -0
- package/dist/discovery/test-support.js +127 -0
- package/dist/discovery/types.d.ts +63 -0
- package/dist/discovery/types.d.ts.map +1 -0
- package/dist/discovery/types.js +28 -0
- package/dist/discovery/walk.d.ts +12 -0
- package/dist/discovery/walk.d.ts.map +1 -0
- package/dist/discovery/walk.js +302 -0
- package/dist/errors.d.ts +13 -0
- package/dist/errors.d.ts.map +1 -0
- package/dist/errors.js +22 -0
- package/dist/evaluations/dimensions.d.ts +14 -0
- package/dist/evaluations/dimensions.d.ts.map +1 -0
- package/dist/evaluations/dimensions.js +191 -0
- package/dist/evaluations/fixtures.d.ts +18 -0
- package/dist/evaluations/fixtures.d.ts.map +1 -0
- package/dist/evaluations/fixtures.js +858 -0
- package/dist/evaluations/index.d.ts +7 -0
- package/dist/evaluations/index.d.ts.map +1 -0
- package/dist/evaluations/index.js +10 -0
- package/dist/evaluations/report.d.ts +3 -0
- package/dist/evaluations/report.d.ts.map +1 -0
- package/dist/evaluations/report.js +31 -0
- package/dist/evaluations/runner-seed.d.ts +12 -0
- package/dist/evaluations/runner-seed.d.ts.map +1 -0
- package/dist/evaluations/runner-seed.js +175 -0
- package/dist/evaluations/runner.d.ts +8 -0
- package/dist/evaluations/runner.d.ts.map +1 -0
- package/dist/evaluations/runner.js +205 -0
- package/dist/evaluations/scripted-embedding-adapter.d.ts +13 -0
- package/dist/evaluations/scripted-embedding-adapter.d.ts.map +1 -0
- package/dist/evaluations/scripted-embedding-adapter.js +163 -0
- package/dist/evaluations/types.d.ts +116 -0
- package/dist/evaluations/types.d.ts.map +1 -0
- package/dist/evaluations/types.js +27 -0
- package/dist/index.d.ts +23 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +41 -0
- package/dist/indexing/bounded-indexing.d.ts +41 -0
- package/dist/indexing/bounded-indexing.d.ts.map +1 -0
- package/dist/indexing/bounded-indexing.js +240 -0
- package/dist/indexing/checkpoint-persist.d.ts +8 -0
- package/dist/indexing/checkpoint-persist.d.ts.map +1 -0
- package/dist/indexing/checkpoint-persist.js +135 -0
- package/dist/indexing/checkpoint-resume.d.ts +20 -0
- package/dist/indexing/checkpoint-resume.d.ts.map +1 -0
- package/dist/indexing/checkpoint-resume.js +50 -0
- package/dist/indexing/embedding-batcher.d.ts +3 -0
- package/dist/indexing/embedding-batcher.d.ts.map +1 -0
- package/dist/indexing/embedding-batcher.js +390 -0
- package/dist/indexing/index.d.ts +7 -0
- package/dist/indexing/index.d.ts.map +1 -0
- package/dist/indexing/index.js +11 -0
- package/dist/indexing/job-persist.d.ts +46 -0
- package/dist/indexing/job-persist.d.ts.map +1 -0
- package/dist/indexing/job-persist.js +157 -0
- package/dist/indexing/job-resume.d.ts +4 -0
- package/dist/indexing/job-resume.d.ts.map +1 -0
- package/dist/indexing/job-resume.js +14 -0
- package/dist/indexing/orchestrator.d.ts +3 -0
- package/dist/indexing/orchestrator.d.ts.map +1 -0
- package/dist/indexing/orchestrator.js +1151 -0
- package/dist/indexing/types.d.ts +156 -0
- package/dist/indexing/types.d.ts.map +1 -0
- package/dist/indexing/types.js +30 -0
- package/dist/indexing/vector-persist.d.ts +32 -0
- package/dist/indexing/vector-persist.d.ts.map +1 -0
- package/dist/indexing/vector-persist.js +105 -0
- package/dist/parsers/_internal.d.ts +20 -0
- package/dist/parsers/_internal.d.ts.map +1 -0
- package/dist/parsers/_internal.js +122 -0
- package/dist/parsers/csv-parser.d.ts +3 -0
- package/dist/parsers/csv-parser.d.ts.map +1 -0
- package/dist/parsers/csv-parser.js +202 -0
- package/dist/parsers/docx-parser.d.ts +3 -0
- package/dist/parsers/docx-parser.d.ts.map +1 -0
- package/dist/parsers/docx-parser.js +390 -0
- package/dist/parsers/html-parser.d.ts +3 -0
- package/dist/parsers/html-parser.d.ts.map +1 -0
- package/dist/parsers/html-parser.js +310 -0
- package/dist/parsers/index.d.ts +15 -0
- package/dist/parsers/index.d.ts.map +1 -0
- package/dist/parsers/index.js +41 -0
- package/dist/parsers/json-parser.d.ts +3 -0
- package/dist/parsers/json-parser.d.ts.map +1 -0
- package/dist/parsers/json-parser.js +192 -0
- package/dist/parsers/large-document/capability-discovery.d.ts +27 -0
- package/dist/parsers/large-document/capability-discovery.d.ts.map +1 -0
- package/dist/parsers/large-document/capability-discovery.js +76 -0
- package/dist/parsers/large-document/diagnostics.d.ts +3 -0
- package/dist/parsers/large-document/diagnostics.d.ts.map +1 -0
- package/dist/parsers/large-document/diagnostics.js +11 -0
- package/dist/parsers/large-document/index.d.ts +15 -0
- package/dist/parsers/large-document/index.d.ts.map +1 -0
- package/dist/parsers/large-document/index.js +10 -0
- package/dist/parsers/large-document/legacy-format.d.ts +5 -0
- package/dist/parsers/large-document/legacy-format.d.ts.map +1 -0
- package/dist/parsers/large-document/legacy-format.js +25 -0
- package/dist/parsers/large-document/preflight.d.ts +9 -0
- package/dist/parsers/large-document/preflight.d.ts.map +1 -0
- package/dist/parsers/large-document/preflight.js +43 -0
- package/dist/parsers/large-document/progressive-extraction.d.ts +55 -0
- package/dist/parsers/large-document/progressive-extraction.d.ts.map +1 -0
- package/dist/parsers/large-document/progressive-extraction.js +123 -0
- package/dist/parsers/large-document/progressive-pdf.d.ts +20 -0
- package/dist/parsers/large-document/progressive-pdf.d.ts.map +1 -0
- package/dist/parsers/large-document/progressive-pdf.js +145 -0
- package/dist/parsers/large-document/synthetic-source.d.ts +9 -0
- package/dist/parsers/large-document/synthetic-source.d.ts.map +1 -0
- package/dist/parsers/large-document/synthetic-source.js +101 -0
- package/dist/parsers/large-document/window-builder.d.ts +24 -0
- package/dist/parsers/large-document/window-builder.d.ts.map +1 -0
- package/dist/parsers/large-document/window-builder.js +75 -0
- package/dist/parsers/ocr/index.d.ts +4 -0
- package/dist/parsers/ocr/index.d.ts.map +1 -0
- package/dist/parsers/ocr/index.js +4 -0
- package/dist/parsers/ocr/null-ocr-adapter.d.ts +3 -0
- package/dist/parsers/ocr/null-ocr-adapter.d.ts.map +1 -0
- package/dist/parsers/ocr/null-ocr-adapter.js +14 -0
- package/dist/parsers/ocr/ocr-pipeline-parser.d.ts +8 -0
- package/dist/parsers/ocr/ocr-pipeline-parser.d.ts.map +1 -0
- package/dist/parsers/ocr/ocr-pipeline-parser.js +147 -0
- package/dist/parsers/ocr/types.d.ts +16 -0
- package/dist/parsers/ocr/types.d.ts.map +1 -0
- package/dist/parsers/ocr/types.js +4 -0
- package/dist/parsers/parser-test-fixtures.d.ts +28 -0
- package/dist/parsers/parser-test-fixtures.d.ts.map +1 -0
- package/dist/parsers/parser-test-fixtures.js +139 -0
- package/dist/parsers/pdf-parser.d.ts +43 -0
- package/dist/parsers/pdf-parser.d.ts.map +1 -0
- package/dist/parsers/pdf-parser.js +388 -0
- package/dist/parsers/registry.d.ts +8 -0
- package/dist/parsers/registry.d.ts.map +1 -0
- package/dist/parsers/registry.js +57 -0
- package/dist/parsers/text-parser.d.ts +3 -0
- package/dist/parsers/text-parser.d.ts.map +1 -0
- package/dist/parsers/text-parser.js +214 -0
- package/dist/parsers/types.d.ts +53 -0
- package/dist/parsers/types.d.ts.map +1 -0
- package/dist/parsers/types.js +21 -0
- package/dist/parsers/unsupported-parser.d.ts +4 -0
- package/dist/parsers/unsupported-parser.d.ts.map +1 -0
- package/dist/parsers/unsupported-parser.js +97 -0
- package/dist/parsers/xlsx-parser.d.ts +3 -0
- package/dist/parsers/xlsx-parser.d.ts.map +1 -0
- package/dist/parsers/xlsx-parser.js +425 -0
- package/dist/privacy/audit-emitter.d.ts +5 -0
- package/dist/privacy/audit-emitter.d.ts.map +1 -0
- package/dist/privacy/audit-emitter.js +93 -0
- package/dist/privacy/diagnostic-redactor.d.ts +2 -0
- package/dist/privacy/diagnostic-redactor.d.ts.map +1 -0
- package/dist/privacy/diagnostic-redactor.js +153 -0
- package/dist/privacy/index.d.ts +5 -0
- package/dist/privacy/index.d.ts.map +1 -0
- package/dist/privacy/index.js +6 -0
- package/dist/privacy/retention-applier.d.ts +5 -0
- package/dist/privacy/retention-applier.d.ts.map +1 -0
- package/dist/privacy/retention-applier.js +88 -0
- package/dist/privacy/types.d.ts +98 -0
- package/dist/privacy/types.d.ts.map +1 -0
- package/dist/privacy/types.js +12 -0
- package/dist/qualityIntelligence/capsuleCorpus.d.ts +27 -0
- package/dist/qualityIntelligence/capsuleCorpus.d.ts.map +1 -0
- package/dist/qualityIntelligence/capsuleCorpus.js +58 -0
- package/dist/qualityIntelligence/index.d.ts +3 -0
- package/dist/qualityIntelligence/index.d.ts.map +1 -0
- package/dist/qualityIntelligence/index.js +5 -0
- package/dist/qualityIntelligence/qiHandoff.d.ts +36 -0
- package/dist/qualityIntelligence/qiHandoff.d.ts.map +1 -0
- package/dist/qualityIntelligence/qiHandoff.js +82 -0
- package/dist/retrieval/answer-grounding.d.ts +9 -0
- package/dist/retrieval/answer-grounding.d.ts.map +1 -0
- package/dist/retrieval/answer-grounding.js +31 -0
- package/dist/retrieval/context-pack-assembler.d.ts +24 -0
- package/dist/retrieval/context-pack-assembler.d.ts.map +1 -0
- package/dist/retrieval/context-pack-assembler.js +50 -0
- package/dist/retrieval/index.d.ts +6 -0
- package/dist/retrieval/index.d.ts.map +1 -0
- package/dist/retrieval/index.js +9 -0
- package/dist/retrieval/retrieval-runner.d.ts +10 -0
- package/dist/retrieval/retrieval-runner.d.ts.map +1 -0
- package/dist/retrieval/retrieval-runner.js +163 -0
- package/dist/retrieval/scoped-vector-search.d.ts +24 -0
- package/dist/retrieval/scoped-vector-search.d.ts.map +1 -0
- package/dist/retrieval/scoped-vector-search.js +864 -0
- package/dist/retrieval/types.d.ts +28 -0
- package/dist/retrieval/types.d.ts.map +1 -0
- package/dist/retrieval/types.js +33 -0
- package/dist/section-path-hash.d.ts +3 -0
- package/dist/section-path-hash.d.ts.map +1 -0
- package/dist/section-path-hash.js +9 -0
- package/dist/source-lifecycle.d.ts +14 -0
- package/dist/source-lifecycle.d.ts.map +1 -0
- package/dist/source-lifecycle.js +155 -0
- package/dist/source-routing-validation.d.ts +11 -0
- package/dist/source-routing-validation.d.ts.map +1 -0
- package/dist/source-routing-validation.js +140 -0
- package/dist/store-content-cipher.d.ts +11 -0
- package/dist/store-content-cipher.d.ts.map +1 -0
- package/dist/store-content-cipher.js +67 -0
- package/dist/store-content-encryption.d.ts +12 -0
- package/dist/store-content-encryption.d.ts.map +1 -0
- package/dist/store-content-encryption.js +275 -0
- package/dist/store-paths.d.ts +6 -0
- package/dist/store-paths.d.ts.map +1 -0
- package/dist/store-paths.js +61 -0
- package/dist/store.d.ts +30 -0
- package/dist/store.d.ts.map +1 -0
- package/dist/store.js +219 -0
- package/dist/testing.d.ts +47 -0
- package/dist/testing.d.ts.map +1 -0
- package/dist/testing.js +170 -0
- package/dist/version.d.ts +2 -0
- package/dist/version.d.ts.map +1 -0
- package/dist/version.js +4 -0
- package/package.json +43 -0
|
@@ -0,0 +1,1151 @@
|
|
|
1
|
+
// Indexing orchestrator (Epic #189, Issue #196). Composes #194 discovery, #195 chunking,
|
|
2
|
+
// and #192 embedding into a single streaming pipeline that produces `vectors` rows for a
|
|
3
|
+
// capsule. Every state change emits one `IndexingEvent`; consumers drive the AsyncIterable
|
|
4
|
+
// to back-pressure the pipeline.
|
|
5
|
+
//
|
|
6
|
+
// Pipeline shape per source:
|
|
7
|
+
//
|
|
8
|
+
// discoverAndExtract() ── (per file) ──┐
|
|
9
|
+
// ├─ document-discovered
|
|
10
|
+
// ├─ extraction skipped (unchanged): document-skipped
|
|
11
|
+
// ├─ extraction persisted: document-extracted →
|
|
12
|
+
// │ chunkDocument → document-chunked →
|
|
13
|
+
// │ embedChunkBatch* → document-embedded
|
|
14
|
+
// └─ extraction failed: document-failed
|
|
15
|
+
//
|
|
16
|
+
// Cancellation: a single `AbortSignal` flows into discovery, chunking, AND the embedding
|
|
17
|
+
// batcher. Aborting mid-document terminates the run with a `job-cancelled` event; rows
|
|
18
|
+
// already persisted for completed documents are kept (the source-of-truth for resume is
|
|
19
|
+
// the chunks/vectors tables, not the in-flight buffer).
|
|
20
|
+
//
|
|
21
|
+
// Force mode: passes `force=true` into the chunker per document so existing chunks are
|
|
22
|
+
// replaced from the current source text, then re-embeds. Discovery's incremental
|
|
23
|
+
// fast-path is bypassed (the skipped outcome is re-shaped to persisted in
|
|
24
|
+
// handleFileExtracted) so chunk-and-embed re-runs even for unchanged file hashes.
|
|
25
|
+
// Recovery mode (partial vector coverage, non-force): re-embeds using existing chunks
|
|
26
|
+
// only — the chunker runs with force=false so it reuses the already-correct chunk rows.
|
|
27
|
+
import { randomUUID } from "node:crypto";
|
|
28
|
+
import { checkpointCompatibility, DEFAULT_LARGE_DOCUMENT_RESOURCE_POLICY, LARGE_DOCUMENT_DIAGNOSTIC_CODES, largeDocumentPolicyFingerprint, } from "@oscharko-dev/keiko-contracts";
|
|
29
|
+
import { assertCompatibleEmbeddingIdentity, verifyEmbeddingCapability, } from "@oscharko-dev/keiko-model-gateway";
|
|
30
|
+
import { chunkDocument } from "../chunking/chunker-runner.js";
|
|
31
|
+
import { countChunksForDocument, deleteChunksForDocument, hasStaleChunksForDocument, } from "../chunking/chunker-persist.js";
|
|
32
|
+
import { chunkingStrategyKey } from "../chunking/index.js";
|
|
33
|
+
import { getCapsule, updateCapsuleState } from "../capsule-lifecycle.js";
|
|
34
|
+
import { discoverAndExtract } from "../discovery/discovery-runner.js";
|
|
35
|
+
import { DEFAULT_DISCOVERY_OPTIONS } from "../discovery/index.js";
|
|
36
|
+
import { deleteDocumentRow, insertDiagnosticRow, listPersistedDocumentsForSource, readDocumentTextRow, updateDocumentStatusRow, } from "../discovery/persist.js";
|
|
37
|
+
import { listCapsuleSources } from "../source-lifecycle.js";
|
|
38
|
+
import { BoundedIndexingCancelledError, BoundedIndexingPolicyError, chunkDocumentBounded, embedDocumentChunksBounded, } from "./bounded-indexing.js";
|
|
39
|
+
import { selectExtractionCheckpoint, upsertExtractionCheckpoint } from "./checkpoint-persist.js";
|
|
40
|
+
import { finalizeJobRow, isJobCancellationRequested, insertJobRow, updateJobCounters, } from "./job-persist.js";
|
|
41
|
+
import { embedChunkBatch } from "./embedding-batcher.js";
|
|
42
|
+
import { countVectorsForDocument, deleteVectorsForDocument, selectChunksForDocument, } from "./vector-persist.js";
|
|
43
|
+
import { DEFAULT_INDEXING_BATCH_SIZE, DEFAULT_INDEXING_CONCURRENCY, IndexingError, } from "./types.js";
|
|
44
|
+
// ─── Abort helper ─────────────────────────────────────────────────────────────
|
|
45
|
+
// Reads `signal?.aborted` through a function call so TypeScript's control-flow analysis
|
|
46
|
+
// does NOT narrow the optional chain after the first false branch. Mirrors the pattern in
|
|
47
|
+
// `discovery/discovery-runner.ts` and `discovery/walk.ts`.
|
|
48
|
+
// Reports whether an abort signal is present and already aborted.
// A missing signal (undefined or null) counts as "not aborted".
function aborted(signal) {
    if (signal === undefined || signal === null) {
        return false;
    }
    return signal.aborted === true;
}
|
|
51
|
+
// True when the run should stop: either the caller's AbortSignal has fired or the
// persisted job row reports a cancellation request. The signal is checked first so
// the database is only consulted when the signal has not been aborted.
function cancellationRequested(state) {
    if (aborted(state.options.signal)) {
        return true;
    }
    const { db } = state.options.store._internal;
    return isJobCancellationRequested(db, state.jobId);
}
|
|
55
|
+
// ─── Bounded options ──────────────────────────────────────────────────────────
|
|
56
|
+
// Resolves the embedding batch size for a run.
//   raw: caller-supplied batch size; undefined/null selects the default.
//   returns: an integer in [1, DEFAULT_INDEXING_BATCH_SIZE].
// A value that floors to NaN falls back to the default instead of leaking NaN into
// the batching loop, matching how clampDiscoveryInteger treats non-finite input.
function clampBatchSize(raw) {
    const requested = raw ?? DEFAULT_INDEXING_BATCH_SIZE;
    const floored = Math.floor(requested);
    if (Number.isNaN(floored)) {
        return DEFAULT_INDEXING_BATCH_SIZE;
    }
    return Math.max(1, Math.min(DEFAULT_INDEXING_BATCH_SIZE, floored));
}
|
|
60
|
+
// Resolves the embedding concurrency for a run.
//   raw: caller-supplied concurrency; undefined/null selects the default.
//   returns: an integer in [1, DEFAULT_INDEXING_CONCURRENCY].
// A value that floors to NaN falls back to the default rather than propagating NaN
// to the embedding batcher, matching how clampDiscoveryInteger treats non-finite input.
function clampConcurrency(raw) {
    const requested = raw ?? DEFAULT_INDEXING_CONCURRENCY;
    const floored = Math.floor(requested);
    if (Number.isNaN(floored)) {
        return DEFAULT_INDEXING_CONCURRENCY;
    }
    return Math.max(1, Math.min(DEFAULT_INDEXING_CONCURRENCY, floored));
}
|
|
64
|
+
// Clamps a caller-supplied discovery limit into [1, fallback] after flooring it.
// `fallback` doubles as the upper bound and is returned unchanged whenever `raw` is
// undefined or not a finite number.
function clampDiscoveryInteger(raw, fallback) {
    const usable = raw !== undefined && Number.isFinite(raw);
    if (!usable) {
        return fallback;
    }
    const floored = Math.floor(raw);
    const capped = Math.min(fallback, floored);
    return Math.max(1, capped);
}
|
|
69
|
+
// Builds the options object handed to discovery: depth and file limits are clamped
// against DEFAULT_DISCOVERY_OPTIONS, and a `signal` key is added only when a signal
// is available. A signal on the discovery options wins over the run-level signal.
function resolvedDiscoveryOptions(state) {
    const requested = state.options.discoveryOptions;
    const resolved = {
        maxDepth: clampDiscoveryInteger(requested?.maxDepth, DEFAULT_DISCOVERY_OPTIONS.maxDepth),
        maxFiles: clampDiscoveryInteger(requested?.maxFiles, DEFAULT_DISCOVERY_OPTIONS.maxFiles),
    };
    const signal = requested?.signal ?? state.options.signal;
    if (signal !== undefined) {
        resolved.signal = signal;
    }
    return resolved;
}
|
|
78
|
+
// ─── Source resolution ────────────────────────────────────────────────────────
|
|
79
|
+
// Picks the sources to index for a capsule.
//   options.sourceIds undefined -> every attached source.
//   options.sourceIds provided  -> the attached sources whose id is listed, in
//                                  attachment order.
// Throws IndexingError("INVALID_OPTIONS") when the capsule has no attached sources,
// when the id list is empty, or when a listed id is not attached to the capsule.
function resolveSources(options, capsule) {
    const attached = listCapsuleSources(options.store, capsule.id);
    if (attached.length === 0) {
        throw new IndexingError("INVALID_OPTIONS", `Capsule ${String(capsule.id)} has no attached sources to index.`);
    }
    const requested = options.sourceIds;
    if (requested === undefined) {
        return attached;
    }
    const wanted = new Set(requested.map((id) => String(id)));
    if (wanted.size === 0) {
        throw new IndexingError("INVALID_OPTIONS", "sourceIds must contain at least one source id.");
    }
    const matches = attached.filter((candidate) => wanted.has(String(candidate.id)));
    // Every distinct requested id must have matched an attached source.
    if (matches.length !== wanted.size) {
        throw new IndexingError("INVALID_OPTIONS", "sourceIds must reference sources attached to the target capsule.");
    }
    return matches;
}
|
|
96
|
+
// Snapshots the run's document counters in the shape passed to updateJobCounters.
// The resume token is stringified unless it is null.
function buildCounters(state) {
    const { totalDocuments, processedDocuments, failedDocuments, skippedDocuments, lastResumeToken } = state;
    let resumeToken = null;
    if (lastResumeToken !== null) {
        resumeToken = String(lastResumeToken);
    }
    return {
        total: totalDocuments,
        processed: processedDocuments,
        failed: failedDocuments,
        skipped: skippedDocuments,
        resumeToken,
    };
}
|
|
105
|
+
// Writes the current counter snapshot for this job through updateJobCounters.
function persistJobProgress(state) {
    const { db } = state.options.store._internal;
    const counters = buildCounters(state);
    updateJobCounters(db, state.jobId, counters);
}
|
|
108
|
+
// Forwards one event to the caller-provided `options.progress` callback, if any.
//   options: run options; `progress` is optional.
//   event:   the event object to deliver.
// The callback is isolated so a faulty consumer cannot affect run correctness:
//   - a synchronous throw is swallowed;
//   - if the callback returns a thenable (e.g. it is an async function), a no-op
//     rejection handler is attached so a later rejection cannot surface as an
//     unhandled rejection.
// Run state is never mutated here, so a caller's bug stays decoupled from run accounting.
function emitProgress(options, event) {
    if (options.progress === undefined)
        return;
    try {
        const outcome = options.progress(event);
        if (typeof outcome?.then === "function") {
            // Fire-and-forget: the result is not awaited, only defused.
            outcome.then(undefined, () => {
                // intentionally swallowed — progress sinks must not affect run correctness
            });
        }
    }
    catch {
        // intentionally swallowed — progress sinks must not affect run correctness
    }
}
|
|
122
|
+
// Removes derived rows for one document: vectors always, chunks only when
// `options.deleteChunks` is truthy.
function clearDocumentArtifacts(state, documentId, options) {
    const capsuleId = state.capsule.id;
    deleteVectorsForDocument(state.options.store._internal.db, capsuleId, documentId);
    if (!options.deleteChunks) {
        return;
    }
    deleteChunksForDocument(state.options.store._internal.db, capsuleId, documentId);
}
|
|
128
|
+
// Sets the persisted status of one document in this capsule to "failed".
function markDocumentFailed(state, documentId) {
    const { db } = state.options.store._internal;
    const capsuleId = state.capsule.id;
    updateDocumentStatusRow(db, capsuleId, documentId, "failed");
}
|
|
131
|
+
// Query for a document's chunks in order_index order. Character offsets come from
// the chunk row when present, otherwise from the joined parsed unit.
// Named parameters: `:c` = capsule id, `:d` = document id.
const SELECT_CHUNKS_WITH_OFFSETS_SQL = [
    "SELECT c.id, c.capsule_id, c.source_id, c.document_id, c.parsed_unit_id, c.order_index,",
    " COALESCE(c.character_start, pu.character_start) AS char_start,",
    " COALESCE(c.character_end, pu.character_end) AS char_end",
    "FROM chunks AS c",
    "JOIN parsed_units AS pu ON pu.capsule_id = c.capsule_id AND pu.id = c.parsed_unit_id",
    "WHERE c.capsule_id = :c AND c.document_id = :d",
    "ORDER BY c.order_index ASC",
].join(" ");
// Runs the chunk/offset query for one document and returns the raw rows.
function selectChunkProjections(state, documentId) {
    const { db } = state.options.store._internal;
    const statement = db.prepare(SELECT_CHUNKS_WITH_OFFSETS_SQL);
    const bindings = { c: state.capsule.id, d: documentId };
    return statement.all(bindings);
}
|
|
146
|
+
// Turns the stored chunk rows of a document into embedding inputs by slicing each
// chunk's text out of `sourceText`. A missing start offset defaults to 0 and a
// missing end offset defaults to the end of the text.
function projectChunksToEmbed(state, documentId, sourceText) {
    return selectChunkProjections(state, documentId).map((projection) => {
        const from = projection.char_start ?? 0;
        const to = projection.char_end ?? sourceText.length;
        return {
            id: projection.id,
            capsuleId: projection.capsule_id,
            sourceId: projection.source_id,
            documentId,
            text: sourceText.slice(from, to),
        };
    });
}
|
|
163
|
+
// Returns the root path of a source's scope: `repositoryRoot` for "repository"
// scopes, `rootPath` for "folder" scopes and for every other scope kind.
function scopeRootOf(source) {
    const { scope } = source;
    const absoluteRoot = scope.kind === "repository" ? scope.repositoryRoot : scope.rootPath;
    return { absoluteRoot };
}
|
|
171
|
+
// Joins a root and a relative path with exactly one "/" between them when the root
// does not already end in "/". No other normalisation is performed.
function joinAbs(root, rel) {
    const separator = root.endsWith("/") ? "" : "/";
    return `${root}${separator}${rel}`;
}
|
|
176
|
+
// Replaces every backslash in a path with a forward slash.
function normaliseSep(p) {
    return p.split("\\").join("/");
}
|
|
179
|
+
// Decides whether `absolutePath` is the root itself or lies beneath it. Both paths
// are compared after separator normalisation, and the root is matched as a whole
// path segment so "/data" does not contain "/database".
function isContained(absoluteRoot, absolutePath) {
    const root = normaliseSep(absoluteRoot);
    const candidate = normaliseSep(absolutePath);
    const rootWithSlash = root.endsWith("/") ? root : `${root}/`;
    return candidate === root || candidate.startsWith(rootWithSlash);
}
|
|
187
|
+
// Reads a source file's UTF-8 text through the workspace filesystem.
// The path is resolved with realPath first and must stay inside the source's scope
// root; a resolved path outside the root is rejected before any read.
// Throws IndexingError("PERSISTENCE_FAILED") when the path cannot be resolved, when
// it escapes the scope root, or when the read fails (the cause is attached for the
// resolve/read failures).
function readSourceText(state, source, relativePath) {
    const unreadable = (cause) => new IndexingError("PERSISTENCE_FAILED", "source text could not be read before embedding", { cause });
    const root = scopeRootOf(source).absoluteRoot;
    const candidate = joinAbs(root, relativePath);
    let canonical;
    try {
        canonical = state.options.workspaceFs.realPath(candidate);
    }
    catch (cause) {
        throw unreadable(cause);
    }
    const insideScope = isContained(root, canonical);
    if (!insideScope) {
        throw new IndexingError("PERSISTENCE_FAILED", `source realpath escapes scope root before embedding: ${relativePath}`);
    }
    try {
        return state.options.workspaceFs.readFileUtf8(normaliseSep(canonical));
    }
    catch (cause) {
        throw unreadable(cause);
    }
}
|
|
207
|
+
// Returns the text that chunk offsets are sliced from: the persisted document text
// when readDocumentTextRow has one, otherwise the file content read via readSourceText.
function resolveChunkSourceText(state, documentId, source, relativePath) {
    const internal = state.options.store._internal;
    const stored = readDocumentTextRow(internal.db, internal.contentCipher, state.capsule.id, documentId);
    return stored === undefined ? readSourceText(state, source, relativePath) : stored;
}
|
|
214
|
+
// ─── Batch boundaries ─────────────────────────────────────────────────────────
|
|
215
|
+
// Splits `items` into consecutive batches of at most `batchSize` elements.
//   items:     array to split; an empty array yields [] without further checks.
//   batchSize: step between batch starts; must be a number >= 1.
//   returns:   array of batches in original order; the last batch may be shorter.
//   throws:    RangeError when `items` is non-empty and `batchSize` is not >= 1.
// The guard is required because the loop advances by `batchSize`: zero or a negative
// value would never terminate, and NaN would silently yield a single empty batch.
function sliceIntoBatches(items, batchSize) {
    if (items.length === 0)
        return [];
    // Written as a negated comparison so NaN (which fails every comparison) is rejected too.
    if (!(batchSize >= 1)) {
        throw new RangeError(`batchSize must be a number >= 1, received ${String(batchSize)}`);
    }
    const out = [];
    for (let i = 0; i < items.length; i += batchSize) {
        out.push(items.slice(i, i + batchSize));
    }
    return out;
}
|
|
224
|
+
// Creates a fresh error record describing a cancelled run.
function cancellationError() {
    const code = "CANCELLED";
    const message = "indexing aborted via AbortSignal";
    return { code, message };
}
|
|
227
|
+
// Appends a single CANCELLED error to `errors` when cancellation has been requested
// and the list does not already contain one. Mutates `errors` in place.
function recordCancellationIfRequested(state, errors) {
    if (!cancellationRequested(state)) {
        return;
    }
    const alreadyRecorded = errors.some((entry) => entry.code === "CANCELLED");
    if (alreadyRecorded) {
        return;
    }
    errors.push(cancellationError());
}
|
|
232
|
+
// Embeds every chunk of one document, batch by batch, and reports the outcome.
//   returns: { vectorCount, errors, lastChunkId } where lastChunkId is the chunk id of
//            the last vector returned by the most recent non-empty batch, or null.
// Chunk text is sliced from the persisted document text when one exists, otherwise
// from the source file. Batching stops early when cancellation is requested (checked
// before and after each batch) or when a batch reports
// INCOMPATIBLE_EMBEDDING_IDENTITY, so the caller can fail the whole job.
async function embedDocumentChunks(state, documentId, source, relativePath) {
    const text = resolveChunkSourceText(state, documentId, source, relativePath);
    const pending = projectChunksToEmbed(state, documentId, text);
    if (pending.length === 0) {
        return { vectorCount: 0, errors: [], lastChunkId: null };
    }
    const batches = sliceIntoBatches(pending, state.batchSize);
    const errors = [];
    let vectorCount = 0;
    let lastChunkId = null;
    for (let index = 0; index < batches.length; index += 1) {
        if (cancellationRequested(state)) {
            break;
        }
        // Only forward a `signal` key when the run actually has a signal.
        const signalOption = state.options.signal !== undefined ? { signal: state.options.signal } : {};
        const { vectors, errors: batchErrors } = await embedChunkBatch(batches[index], {
            adapter: state.options.embeddingAdapter,
            store: state.options.store,
            pinnedIdentity: state.capsule.embeddingModelIdentity,
            concurrency: state.concurrency,
            ...signalOption,
            now: state.now,
            idSource: state.idSource,
        });
        vectorCount += vectors.length;
        errors.push(...batchErrors);
        const tail = vectors.length > 0 ? vectors[vectors.length - 1] : undefined;
        if (tail !== undefined) {
            lastChunkId = tail.chunkId;
        }
        // An identity mismatch is detected by the batcher; further batches for this
        // document would be pointless.
        const identityMismatch = batchErrors.some((entry) => entry.code === "INCOMPATIBLE_EMBEDDING_IDENTITY");
        if (identityMismatch) {
            break;
        }
        if (cancellationRequested(state)) {
            break;
        }
    }
    recordCancellationIfRequested(state, errors);
    return { vectorCount, errors, lastChunkId };
}
|
|
274
|
+
// ─── Document handlers ────────────────────────────────────────────────────────
|
|
275
|
+
// Counts a skipped extraction and builds the matching `document-skipped` event.
// The document id is taken from the outcome only when its kind is "skipped";
// any other outcome kind yields an empty id.
function handleExtractionSkipped(state, result) {
    state.skippedDocuments += 1;
    const { outcome } = result;
    let documentId = "";
    if (outcome.kind === "skipped") {
        documentId = outcome.document.id;
    }
    return {
        kind: "document-skipped",
        jobId: state.jobId,
        capsuleId: state.capsule.id,
        sourceId: result.sourceId,
        documentId,
        reason: "unchanged",
    };
}
|
|
286
|
+
// GRD-010: transient IO failure codes that must NOT destroy a previously-good index on an
|
|
287
|
+
// incremental refresh. Mirrors the gate in discovery/extract.ts buildFailureResult.
|
|
288
|
+
const TRANSIENT_DISCOVERY_CODES = new Set(["READ_FAILED", "STAT_FAILED"]);
// Converts an extraction result into an event and updates the run counters.
//   - Outcome kind "failed" with a transient code while the document still has
//     chunks: counted as skipped and reported as `document-skipped`, leaving the
//     existing chunks and vectors untouched (GRD-010).
//   - Any other "failed" outcome: counted as failed, the document's vectors and
//     chunks are cleared, its status is set to failed, and `document-failed` is
//     returned with the document id.
//   - Outcome of any other kind: counted as failed and reported as
//     `document-failed` without a document id, using READ_FAILED as the code.
// Every failure path also records the error on `state.lastError`.
function handleExtractionFailed(state, result) {
    const { outcome } = result;
    const isFailure = outcome.kind === "failed";
    const message = isFailure ? outcome.error.message : "extraction failed";
    const code = isFailure ? outcome.error.code : "READ_FAILED";
    const buildError = () => ({ code: `DISCOVERY_FAILED:${code}`, message });
    if (!isFailure) {
        state.failedDocuments += 1;
        const error = buildError();
        state.lastError = error;
        return {
            kind: "document-failed",
            jobId: state.jobId,
            capsuleId: state.capsule.id,
            sourceId: result.sourceId,
            relativePath: result.relativePath,
            error,
        };
    }
    const documentId = outcome.document.id;
    // The chunk count is only queried for transient codes (short-circuit).
    const keepsPriorIndex = TRANSIENT_DISCOVERY_CODES.has(code) &&
        countChunksForDocument(state.options.store._internal.db, state.capsule.id, documentId) > 0;
    if (keepsPriorIndex) {
        state.skippedDocuments += 1;
        return {
            kind: "document-skipped",
            jobId: state.jobId,
            capsuleId: state.capsule.id,
            sourceId: result.sourceId,
            documentId,
            reason: "unchanged",
        };
    }
    state.failedDocuments += 1;
    clearDocumentArtifacts(state, documentId, { deleteChunks: true });
    markDocumentFailed(state, documentId);
    const error = buildError();
    state.lastError = error;
    return {
        kind: "document-failed",
        jobId: state.jobId,
        capsuleId: state.capsule.id,
        sourceId: result.sourceId,
        documentId,
        relativePath: result.relativePath,
        error,
    };
}
|
|
336
|
+
// Returns the chunk count to report for a document. A fresh chunking run reports the
// number of ids it produced; when chunking was skipped because rows already existed,
// the stored rows are loaded and counted instead so the event stays accurate.
function resolveChunkCount(state, documentId, skippedExisting, freshChunkIds) {
    if (skippedExisting) {
        const storedChunks = selectChunksForDocument(state.options.store._internal.db, state.capsule.id, documentId);
        return storedChunks.length;
    }
    return freshChunkIds.length;
}
|
|
344
|
+
// Reads how many chunks and how many vectors are currently stored for a document
// and returns them as { chunkCount, vectorCount }.
function embeddingCoverage(state, documentId) {
    const db = state.options.store._internal.db;
    const capsuleId = state.capsule.id;
    const chunkCount = countChunksForDocument(db, capsuleId, documentId);
    const vectorCount = countVectorsForDocument(db, capsuleId, documentId);
    return { chunkCount, vectorCount };
}
|
|
350
|
+
// True only when the document has at least one chunk and exactly as many vectors
// as chunks, according to embeddingCoverage.
function hasCompleteVectorCoverage(state, documentId) {
    const { chunkCount, vectorCount } = embeddingCoverage(state, documentId);
    return chunkCount > 0 && vectorCount === chunkCount;
}
|
|
354
|
+
// Returns the document id of a "persisted" extraction result.
// Throws IndexingError("INVALID_OPTIONS") for any other outcome kind.
function persistedDocumentId(result) {
    const { outcome } = result;
    if (outcome.kind === "persisted") {
        return outcome.document.id;
    }
    throw new IndexingError("INVALID_OPTIONS", "chunkPersistedDocument called with non-persisted result");
}
|
|
360
|
+
// Chunks one persisted document and describes what was produced.
// Steps: resolve the document id (persistedDocumentId throws for non-persisted
// results), resolve the text to chunk, run chunkDocument, then derive the chunk count.
// Returns { events, documentId, chunkCount }, where events come from chunkedDocumentEvents.
function chunkPersistedDocument(state, result) {
    const documentId = persistedDocumentId(result);
    const source = sourceForResult(state, result);
    const sourceText = resolveChunkSourceText(state, documentId, source, result.relativePath);
    const { options } = state;
    const request = {
        capsuleId: state.capsule.id,
        sourceId: result.sourceId,
        documentId,
        sourceText,
        force: options.force === true,
    };
    // The abort signal is only forwarded when the caller supplied one.
    if (options.signal !== undefined) {
        request.signal = options.signal;
    }
    const { skippedExisting, chunkIds } = chunkDocument(options.store, request, options.chunkingOptions);
    const chunkCount = resolveChunkCount(state, documentId, skippedExisting, chunkIds);
    const events = chunkedDocumentEvents(state, result.sourceId, documentId, result.relativePath, chunkCount);
    return { events, documentId, chunkCount };
}
|
|
378
|
+
// Builds the pair of progress events emitted once a document has been chunked:
// first "document-extracted" (carrying relativePath), then "document-chunked"
// (carrying chunkCount). Both share the job, capsule, source and document ids.
function chunkedDocumentEvents(state, sourceId, documentId, relativePath, chunkCount) {
    const shared = {
        jobId: state.jobId,
        capsuleId: state.capsule.id,
        sourceId,
        documentId,
    };
    const extracted = { kind: "document-extracted", ...shared, relativePath };
    const chunked = { kind: "document-chunked", ...shared, chunkCount };
    return [extracted, chunked];
}
|
|
398
|
+
// Looks up the source a result belongs to in the per-run cache (state.sourcesById,
// keyed by the stringified source id; populated by buildInitialState), so no
// per-document query is needed. Throws IndexingError("INVALID_OPTIONS") when the
// result names a source that is not in the cache.
function sourceForResult(state, result) {
    const cacheKey = String(result.sourceId);
    const cached = state.sourcesById.get(cacheKey);
    if (cached !== undefined) {
        return cached;
    }
    throw new IndexingError("INVALID_OPTIONS", `result references unknown source ${String(result.sourceId)}`);
}
|
|
408
|
+
// Incremental fast-path: skips embedding when vectors already exist (non-force run), or
|
|
409
|
+
// deletes existing vectors when coverage is incomplete or chunks are stale, so embedding restarts.
|
|
410
|
+
// Returns a PersistedHandling to short-circuit when already-embedded, undefined to continue.
|
|
411
|
+
// Decides whether a persisted document can skip the chunk/embed pipeline.
//  - Force run: always returns undefined (continue); nothing is deleted here.
//  - Non-force run, every chunk has a vector and no chunk is stale: counts a skip and
//    returns a handling with one "document-skipped" event (reason "already-embedded").
//  - Non-force run otherwise: removes any vectors the document still has and returns
//    undefined so the caller re-chunks and re-embeds.
function applyIncrementalFastPath(state, sourceId, documentId) {
    const strategyKey = chunkingStrategyKey(state.options.chunkingOptions);
    const staleChunks = hasStaleChunksForDocument(state.options.store._internal.db, state.capsule.id, documentId, strategyKey);
    if (state.options.force === true) {
        return undefined;
    }
    const { chunkCount, vectorCount } = embeddingCoverage(state, documentId);
    const fullyEmbedded = chunkCount > 0 && vectorCount === chunkCount && !staleChunks;
    if (!fullyEmbedded) {
        if (vectorCount > 0) {
            deleteVectorsForDocument(state.options.store._internal.db, state.capsule.id, documentId);
        }
        return undefined;
    }
    state.skippedDocuments += 1;
    const skipped = {
        kind: "document-skipped",
        jobId: state.jobId,
        capsuleId: state.capsule.id,
        sourceId,
        documentId,
        reason: "already-embedded",
    };
    return { events: [skipped] };
}
|
|
437
|
+
// Runs the chunker and returns its result, or a PersistedHandling failure event on throw.
|
|
438
|
+
// Runs chunkPersistedDocument and wraps its result as { chunked }.
// If chunking throws, the document's artifacts (chunks included) are cleared and:
//  - when cancellation was requested, an empty handling ({ events: [] }) is returned;
//  - otherwise the failure is counted, the document is marked failed, state.lastError
//    is set to a fixed CHUNKING_FAILED error and a "document-failed" event is returned.
// The thrown value itself is not inspected or reported.
function tryChunkDocument(state, result, documentId) {
    let chunked;
    try {
        chunked = chunkPersistedDocument(state, result);
    }
    catch {
        const cancelled = cancellationRequested(state);
        if (!cancelled) {
            state.failedDocuments += 1;
        }
        clearDocumentArtifacts(state, documentId, { deleteChunks: true });
        if (cancelled) {
            return { events: [] };
        }
        markDocumentFailed(state, documentId);
        const error = {
            code: "CHUNKING_FAILED",
            message: "document chunking failed",
        };
        state.lastError = error;
        const failure = {
            kind: "document-failed",
            jobId: state.jobId,
            capsuleId: state.capsule.id,
            sourceId: result.sourceId,
            documentId,
            relativePath: result.relativePath,
            error,
        };
        return { events: [failure] };
    }
    return { chunked };
}
|
|
470
|
+
// Records a document failure: bumps the failure counter, clears the document's
// artifacts (options is passed straight to clearDocumentArtifacts), marks the
// document failed, remembers the error on state.lastError and appends a
// "document-failed" event to the caller's events array (mutated in place).
// Returns { events } wrapping that same array.
function appendDocumentFailure(state, events, sourceId, documentId, relativePath, error, options) {
    state.failedDocuments += 1;
    clearDocumentArtifacts(state, documentId, options);
    markDocumentFailed(state, documentId);
    state.lastError = error;
    const failure = {
        kind: "document-failed",
        jobId: state.jobId,
        capsuleId: state.capsule.id,
        sourceId,
        documentId,
        relativePath,
        error,
    };
    events.push(failure);
    return { events };
}
|
|
486
|
+
// Records a successfully embedded document: bumps processedDocuments, adds the
// vector count to vectorsPersisted, advances lastResumeToken unless lastChunkId is
// null, and appends a "document-embedded" event to the caller's events array.
// The event's resumeToken falls back to "<documentId>#empty" when there is no
// last chunk id. Returns { events } wrapping the same array.
function completeEmbeddedDocument(state, events, sourceId, documentId, embedResult) {
    state.processedDocuments += 1;
    const { vectorCount, lastChunkId } = embedResult;
    state.vectorsPersisted += vectorCount;
    if (lastChunkId !== null) {
        state.lastResumeToken = lastChunkId;
    }
    const embedded = {
        kind: "document-embedded",
        jobId: state.jobId,
        capsuleId: state.capsule.id,
        sourceId,
        documentId,
        vectorCount,
        resumeToken: lastChunkId ?? `${String(documentId)}#empty`,
    };
    events.push(embedded);
    return { events };
}
|
|
502
|
+
// Tells whether an embed result failed purely because of cancellation: cancellation
// must have been requested, and the result must carry at least one error with every
// error coded "CANCELLED". The errors are not inspected unless cancellation was requested.
function isCancellationOnlyEmbedResult(state, embedResult) {
    const onlyCancellationErrors = () => embedResult.errors.length > 0 &&
        embedResult.errors.every(({ code }) => code === "CANCELLED");
    return cancellationRequested(state) && onlyCancellationErrors();
}
|
|
507
|
+
// Maps an EmbedDocumentResult into PersistedHandling events, mutating run-state counters.
|
|
508
|
+
// Translates an embed result into a handling ({ events, ... }), updating run counters
// through the helpers it calls. priorEvents is copied, never mutated.
//  - Cancellation-only result: returns the copied events unchanged.
//  - An INCOMPATIBLE_EMBEDDING_IDENTITY error: recorded as a document failure (chunks
//    kept) and additionally exposed as `identityFailure` on the returned handling.
//  - Any other error: the first error is recorded as a document failure (chunks kept).
//  - No errors: the document is completed via completeEmbeddedDocument.
function applyEmbedResult(state, sourceId, documentId, relativePath, priorEvents, embedResult) {
    const events = [...priorEvents];
    if (isCancellationOnlyEmbedResult(state, embedResult)) {
        return { events };
    }
    // Embedding failures keep the chunks so a later run can embed them again.
    const recordFailure = (error) => appendDocumentFailure(state, events, sourceId, documentId, relativePath, error, {
        deleteChunks: false,
    });
    const { errors } = embedResult;
    const identityErr = errors.find(({ code }) => code === "INCOMPATIBLE_EMBEDDING_IDENTITY");
    if (identityErr !== undefined) {
        const handling = recordFailure(identityErr);
        return { ...handling, identityFailure: identityErr };
    }
    if (errors.length > 0) {
        const fallback = {
            code: "EMBEDDING_ADAPTER_FAILED",
            message: "embedding adapter failed",
        };
        return recordFailure(errors[0] ?? fallback);
    }
    return completeEmbeddedDocument(state, events, sourceId, documentId, embedResult);
}
|
|
533
|
+
// Generator that yields each event of a handling, in order.
function* persistedEvents(handling) {
    yield* handling.events;
}
|
|
538
|
+
// ─── Bounded large-document chunk + embed + resume (Epic #1160, Issue #1286) ─────
|
|
539
|
+
// Builds the fingerprint that describes the current run: a copy of the checkpoint's
// stored fingerprint with the policy fingerprint, chunking strategy key and
// embedding identity replaced by the values in effect for this run.
function boundedCurrentFingerprint(state, checkpoint) {
    const activePolicy = boundedPolicy(state);
    const current = { ...checkpoint.fingerprint };
    current.policyFingerprint = largeDocumentPolicyFingerprint(activePolicy);
    current.chunkingStrategyVersion = chunkingStrategyKey(state.options.chunkingOptions);
    current.embeddingIdentity = state.capsule.embeddingModelIdentity;
    return current;
}
|
|
548
|
+
// Returns the large-document policy configured on the run options, or
// DEFAULT_LARGE_DOCUMENT_RESOURCE_POLICY when none (null/undefined) was supplied.
function boundedPolicy(state) {
    const { largeDocumentPolicy } = state.options;
    return largeDocumentPolicy ?? DEFAULT_LARGE_DOCUMENT_RESOURCE_POLICY;
}
|
|
551
|
+
// Upserts the extraction checkpoint row for a document. Extraction-side fields
// (strategy, cursors, extracted bytes, retry count, coverage, createdAt) are copied
// from the existing checkpoint; the phase, chunk/embedding cursors and fingerprint
// come from the arguments, jobId from the run state and updatedAt from state.now().
// lastEmbeddedChunkId is left out of the row when it is null.
// terminalDiagnostics defaults to the checkpoint's existing diagnostics.
function writeBoundedCheckpoint(state, checkpoint, fingerprint, phase, chunkCursor, embeddedChunkCursor, lastEmbeddedChunkId, terminalDiagnostics = checkpoint.terminalDiagnostics) {
    const db = state.options.store._internal.db;
    const lastEmbedded = lastEmbeddedChunkId === null ? {} : { lastEmbeddedChunkId };
    const row = {
        capsuleId: checkpoint.capsuleId,
        documentId: checkpoint.documentId,
        jobId: state.jobId,
        strategy: checkpoint.strategy,
        phase,
        pageCursor: checkpoint.pageCursor,
        sectionCursor: checkpoint.sectionCursor,
        objectCursor: checkpoint.objectCursor,
        extractedTextBytes: checkpoint.extractedTextBytes,
        chunkCursor,
        embeddedChunkCursor,
        ...lastEmbedded,
        retryCount: checkpoint.retryCount,
        coverage: checkpoint.coverage,
        fingerprint,
        terminalDiagnostics,
        createdAt: checkpoint.createdAt,
        updatedAt: state.now(),
    };
    upsertExtractionCheckpoint(db, row);
}
|
|
573
|
+
// Stores a warning diagnostic saying that resuming from the document's checkpoint was
// refused and the document restarted. The message lists the changed fingerprint parts
// (reasons joined by ", "); the row id is "<documentId>#checkpoint-incompatible".
function persistCheckpointIncompatibleDiagnostic(state, documentId, reasons) {
    const db = state.options.store._internal.db;
    const diagnostic = {
        severity: "warning",
        code: LARGE_DOCUMENT_DIAGNOSTIC_CODES.CHECKPOINT_INCOMPATIBLE,
        message: `resume refused and restarted; changed: ${reasons.join(", ")}`,
        documentId,
    };
    const row = {
        id: `${String(documentId)}#checkpoint-incompatible`,
        capsuleId: state.capsule.id,
        diagnostic,
        createdAt: state.now(),
    };
    insertDiagnosticRow(db, row);
}
|
|
586
|
+
// Decides whether a bounded document must be chunked again. Checked in order, stopping
// at the first hit: the checkpoint was incompatible, the run is forced, the document
// has no chunks, or its chunks are stale for the fingerprint's chunking strategy version.
function boundedNeedsRechunk(state, documentId, fingerprint, incompatible) {
    const db = state.options.store._internal.db;
    if (incompatible) {
        return incompatible;
    }
    if (state.options.force === true) {
        return true;
    }
    if (countChunksForDocument(db, state.capsule.id, documentId) === 0) {
        return true;
    }
    return hasStaleChunksForDocument(db, state.capsule.id, documentId, fingerprint.chunkingStrategyVersion);
}
|
|
593
|
+
// Assembles the dependency object handed to the bounded embedder: store, ids, adapter,
// embedding identity, batching settings, clock, id source and the active policy.
// The abort signal is included only when one was configured. onBatch writes an
// "embedding"-phase checkpoint with the cursor and last chunk id it is called with.
function boundedEmbedDeps(state, documentId, fingerprint, checkpoint, chunkCount) {
    const { options } = state;
    const abort = options.signal !== undefined ? { signal: options.signal } : {};
    const onBatch = (cursor, lastId) => {
        writeBoundedCheckpoint(state, checkpoint, fingerprint, "embedding", chunkCount, cursor, lastId);
    };
    return {
        store: options.store,
        capsuleId: state.capsule.id,
        documentId,
        adapter: options.embeddingAdapter,
        identity: state.capsule.embeddingModelIdentity,
        batchSize: state.batchSize,
        concurrency: state.concurrency,
        now: state.now,
        idSource: state.idSource,
        policy: boundedPolicy(state),
        ...abort,
        onBatch,
    };
}
|
|
611
|
+
// handleBoundedDocument is the bounded path for a progressively-extracted document: it resumes a compatible checkpoint, restarts an
|
|
612
|
+
// incompatible one with a CHECKPOINT_INCOMPATIBLE diagnostic, chunks + embeds through SUBSTR-backed
|
|
613
|
+
// readers, and advances the durable checkpoint between batches.
|
|
614
|
+
// Reconciles existing chunks/vectors against the current fingerprint: refuses an incompatible
|
|
615
|
+
// checkpoint (diagnostic + delete vectors) and re-chunks when stale/forced/missing. Returns the
|
|
616
|
+
// chunk count.
|
|
617
|
+
// Brings the stored chunks/vectors of a bounded document in line with the current
// fingerprint and returns the resulting chunk count.
//  - Incompatible checkpoint: a CHECKPOINT_INCOMPATIBLE diagnostic is stored and the
//    document's vectors are deleted.
//  - When boundedNeedsRechunk says so: the document is re-chunked through
//    chunkDocumentBounded and its vectors are deleted afterwards.
function prepareBoundedChunks(state, result, documentId, checkpoint, fingerprint) {
    const db = state.options.store._internal.db;
    const compat = checkpointCompatibility(checkpoint.fingerprint, fingerprint);
    const incompatible = !compat.compatible;
    if (incompatible) {
        persistCheckpointIncompatibleDiagnostic(state, documentId, compat.reasons);
        deleteVectorsForDocument(db, state.capsule.id, documentId);
    }
    if (boundedNeedsRechunk(state, documentId, fingerprint, incompatible)) {
        const target = { capsuleId: state.capsule.id, sourceId: result.sourceId, documentId };
        chunkDocumentBounded(state.options.store, target, state.options.chunkingOptions, state.options.signal, boundedPolicy(state));
        deleteVectorsForDocument(db, state.capsule.id, documentId);
    }
    return countChunksForDocument(db, state.capsule.id, documentId);
}
|
|
630
|
+
// Handles an exception thrown while preparing bounded chunks.
//  - Cancellation (the cause is a BoundedIndexingCancelledError, or cancellation was
//    requested): the document status is set back to "pending", a "cancelled" checkpoint
//    is written with the current chunk/vector counts, and no events are produced.
//  - Otherwise: a "failed" checkpoint carrying an error diagnostic is written and the
//    document is recorded as failed with its chunks deleted. Policy errors keep their
//    own code/message via toIndexingError(); anything else becomes CHUNKING_FAILED.
function boundedChunkPreparationFailure(state, result, documentId, checkpoint, fingerprint, cause) {
    const db = state.options.store._internal.db;
    const storedChunks = () => countChunksForDocument(db, state.capsule.id, documentId);
    const storedVectors = () => countVectorsForDocument(db, state.capsule.id, documentId);
    const cancelled = cause instanceof BoundedIndexingCancelledError || cancellationRequested(state);
    if (cancelled) {
        updateDocumentStatusRow(db, state.capsule.id, documentId, "pending");
        writeBoundedCheckpoint(state, checkpoint, fingerprint, "cancelled", storedChunks(), storedVectors(), null);
        return { events: [] };
    }
    let error;
    if (cause instanceof BoundedIndexingPolicyError) {
        error = cause.toIndexingError();
    }
    else {
        error = { code: "CHUNKING_FAILED", message: "document chunking failed" };
    }
    // The checkpoint is written before appendDocumentFailure clears the artifacts.
    writeBoundedCheckpoint(state, checkpoint, fingerprint, "failed", storedChunks(), storedVectors(), null, [
        { severity: "error", code: error.code, message: error.message, documentId },
    ]);
    return appendDocumentFailure(state, [], result.sourceId, documentId, result.relativePath, error, {
        deleteChunks: true,
    });
}
|
|
645
|
+
// Runs prepareBoundedChunks and returns { chunkCount } on success. Any thrown value is
// routed to boundedChunkPreparationFailure, whose handling is returned instead.
function prepareBoundedChunksSafely(state, result, documentId, checkpoint, fingerprint) {
    let chunkCount;
    try {
        chunkCount = prepareBoundedChunks(state, result, documentId, checkpoint, fingerprint);
    }
    catch (cause) {
        return boundedChunkPreparationFailure(state, result, documentId, checkpoint, fingerprint, cause);
    }
    return { chunkCount };
}
|
|
653
|
+
// Writes the checkpoint that closes a bounded embed pass: phase "complete" when the
// result has no errors, "failed" otherwise. Every embed error is stored as an
// error-severity terminal diagnostic for the document.
function persistBoundedEmbedCheckpoint(state, checkpoint, fingerprint, documentId, chunkCount, embedResult) {
    const { errors, embeddedCursor, lastChunkId } = embedResult;
    const phase = errors.length === 0 ? "complete" : "failed";
    const diagnostics = errors.map(({ code, message }) => ({
        severity: "error",
        code,
        message,
        documentId,
    }));
    writeBoundedCheckpoint(state, checkpoint, fingerprint, phase, chunkCount, embeddedCursor, lastChunkId, diagnostics);
}
|
|
661
|
+
// Persists the state of a bounded embed pass that was cancelled: the document status
// is set back to "pending" and a "cancelled" checkpoint is written with the cursor
// reached so far. Embed errors are stored as info-severity diagnostics.
function persistBoundedEmbedCancellation(state, checkpoint, fingerprint, documentId, chunkCount, embedResult) {
    const db = state.options.store._internal.db;
    updateDocumentStatusRow(db, state.capsule.id, documentId, "pending");
    const { embeddedCursor, lastChunkId, errors } = embedResult;
    const diagnostics = errors.map(({ code, message }) => ({
        severity: "info",
        code,
        message,
        documentId,
    }));
    writeBoundedCheckpoint(state, checkpoint, fingerprint, "cancelled", chunkCount, embeddedCursor, lastChunkId, diagnostics);
}
|
|
670
|
+
// Runs the bounded embedder for a document and persists the outcome.
// Returns undefined when the embed pass reports cancellation (after persisting the
// cancellation state); otherwise writes the final checkpoint and returns the handling
// produced by applyEmbedResult.
async function persistBoundedEmbeddingResult(state, result, documentId, checkpoint, fingerprint, chunkCount) {
    // The bounded embed self-resumes from chunks that have no vector yet.
    const deps = boundedEmbedDeps(state, documentId, fingerprint, checkpoint, chunkCount);
    const embedResult = await embedDocumentChunksBounded(deps);
    if (embedResult.cancelled) {
        persistBoundedEmbedCancellation(state, checkpoint, fingerprint, documentId, chunkCount, embedResult);
        return undefined;
    }
    persistBoundedEmbedCheckpoint(state, checkpoint, fingerprint, documentId, chunkCount, embedResult);
    const { vectorCount, errors, lastChunkId } = embedResult;
    const summary = { vectorCount, errors, lastChunkId };
    return applyEmbedResult(state, result.sourceId, documentId, result.relativePath, [], summary);
}
|
|
684
|
+
// Chunk-and-embed pass for a document that has an extraction checkpoint.
// Yields, in order: the failure events if chunk preparation did not produce a count
// (and then stops); otherwise the extracted/chunked events, followed - after job
// progress and an "embedding" checkpoint have been persisted - by whatever events the
// bounded embed step produced. A cancelled embed step contributes no events.
async function* handleBoundedDocument(state, result, documentId, checkpoint) {
    const db = state.options.store._internal.db;
    const { sourceId } = result;
    const fingerprint = boundedCurrentFingerprint(state, checkpoint);
    const prepared = prepareBoundedChunksSafely(state, result, documentId, checkpoint, fingerprint);
    const preparationSucceeded = "chunkCount" in prepared;
    if (!preparationSucceeded) {
        yield* persistedEvents(prepared);
        return;
    }
    const chunkCount = prepared.chunkCount;
    yield* chunkedDocumentEvents(state, sourceId, documentId, result.relativePath, chunkCount);
    persistJobProgress(state);
    // Vectors already stored for the document become the starting embedded cursor.
    const alreadyEmbedded = countVectorsForDocument(db, state.capsule.id, documentId);
    writeBoundedCheckpoint(state, checkpoint, fingerprint, "embedding", chunkCount, alreadyEmbedded, null);
    const embedded = await persistBoundedEmbeddingResult(state, result, documentId, checkpoint, fingerprint, chunkCount);
    if (embedded === undefined) {
        return;
    }
    yield* persistedEvents(embedded);
}
|
|
702
|
+
// handlePersistedDocument wraps the chunk-then-embed pipeline for a single persisted document. Extraction/chunking
|
|
703
|
+
// events are yielded before awaiting embeddings, so progress consumers see pre-model work
|
|
704
|
+
// immediately instead of only after all embedding batches finish.
|
|
705
|
+
// Handles a persisted document whose status is "unsupported": its artifacts (chunks
// included) are cleared, it is counted as skipped, and two events are yielded -
// "document-extracted" followed by "document-skipped" with reason "unsupported".
function* handleUnsupportedDocument(state, result, documentId) {
    clearDocumentArtifacts(state, documentId, { deleteChunks: true });
    state.skippedDocuments += 1;
    const { sourceId } = result;
    const extracted = {
        kind: "document-extracted",
        jobId: state.jobId,
        capsuleId: state.capsule.id,
        sourceId,
        documentId,
        relativePath: result.relativePath,
    };
    yield extracted;
    const skipped = {
        kind: "document-skipped",
        jobId: state.jobId,
        capsuleId: state.capsule.id,
        sourceId: result.sourceId,
        documentId,
        reason: "unsupported",
    };
    yield skipped;
}
|
|
725
|
+
// Chunk-then-embed pipeline for one persisted document, yielding progress events.
// Routing, in order:
//  1. outcome not "persisted" (or a null document id): nothing is yielded;
//  2. document status "unsupported": handleUnsupportedDocument;
//  3. an extraction checkpoint exists: handleBoundedDocument;
//  4. the incremental fast path returns a handling: its events are yielded;
//  5. otherwise chunk, yield the chunking events, persist job progress, then embed
//     and yield the events derived from the embed result.
async function* handlePersistedDocument(state, result) {
    const { outcome } = result;
    if (outcome.kind !== "persisted") {
        return;
    }
    const documentId = outcome.document.id;
    if (documentId === null) {
        return;
    }
    if (outcome.document.status === "unsupported") {
        yield* handleUnsupportedDocument(state, result, documentId);
        return;
    }
    // A document with a durable extraction checkpoint took the progressive page-windowed
    // path; the bounded pass owns its own resume + fast-path logic.
    const checkpoint = selectExtractionCheckpoint(state.options.store._internal.db, state.capsule.id, documentId);
    if (checkpoint !== undefined) {
        yield* handleBoundedDocument(state, result, documentId, checkpoint);
        return;
    }
    const fastPath = applyIncrementalFastPath(state, result.sourceId, documentId);
    if (fastPath !== undefined) {
        yield* persistedEvents(fastPath);
        return;
    }
    const chunkStep = tryChunkDocument(state, result, documentId);
    if (!("chunked" in chunkStep)) {
        yield* persistedEvents(chunkStep);
        return;
    }
    // Chunking events go out before the embedding call is awaited.
    yield* chunkStep.chunked.events;
    persistJobProgress(state);
    const source = sourceForResult(state, result);
    const embedResult = await embedDocumentChunks(state, documentId, source, result.relativePath);
    const handling = applyEmbedResult(state, result.sourceId, documentId, result.relativePath, [], embedResult);
    yield* persistedEvents(handling);
}
|
|
755
|
+
// Generator adapter: yields the single event built by handleExtractionSkipped.
function* handleExtractionSkippedEvents(state, result) {
    const skipped = handleExtractionSkipped(state, result);
    yield skipped;
}
|
|
758
|
+
// Generator adapter: yields the single event built by handleExtractionFailed.
function* handleExtractionFailedEvents(state, result) {
    const outcomeEvent = handleExtractionFailed(state, result);
    yield outcomeEvent;
}
|
|
761
|
+
// Routes a file-extracted event: force-skipped docs are re-shaped to persisted so the
|
|
762
|
+
// standard chunk-and-embed pipeline runs on them.
|
|
763
|
+
// Routes one extraction result to the matching handler and yields its events.
//  - "failed": the failure handler.
//  - "skipped": normally reported as a skip. It is instead re-shaped into a "persisted"
//    result and sent through the standard pipeline when the run is forced, the stored
//    chunks are stale for the current chunking strategy, or an "extracted" document
//    lacks complete vector coverage.
//  - anything else: the persisted-document pipeline.
async function* handleFileExtracted(state, result) {
    const { outcome } = result;
    if (outcome.kind === "failed") {
        yield* handleExtractionFailedEvents(state, result);
        return;
    }
    if (outcome.kind !== "skipped") {
        yield* handlePersistedDocument(state, result);
        return;
    }
    const { document } = outcome;
    const strategyKey = chunkingStrategyKey(state.options.chunkingOptions);
    const staleChunks = hasStaleChunksForDocument(state.options.store._internal.db, state.capsule.id, document.id, strategyKey);
    const missingVectors = document.status === "extracted" &&
        !hasCompleteVectorCoverage(state, document.id);
    const needsPipeline = state.options.force === true || staleChunks || missingVectors;
    if (!needsPipeline) {
        yield* handleExtractionSkippedEvents(state, result);
        return;
    }
    // The document row exists and is valid, so the skipped outcome can stand in as a
    // persisted one for the chunk-and-embed pipeline.
    const synthetic = {
        capsuleId: result.capsuleId,
        sourceId: result.sourceId,
        relativePath: result.relativePath,
        outcome: { kind: "persisted", document },
        diagnostics: result.diagnostics,
    };
    yield* handlePersistedDocument(state, synthetic);
}
|
|
792
|
+
// Translates one discovery-stream event into progress events.
//  - "file-discovered": bumps totalDocuments and yields "document-discovered".
//  - "scope-error": counts a failure, stores a DISCOVERY_FAILED:<code> error on
//    state.lastError and yields "document-failed" (relativePath only when the error has one).
//  - "cancelled" / "completed": yields nothing; terminal events are driven by the outer loop.
//  - any other kind is treated as "file-extracted" and delegated to handleFileExtracted.
async function* handleDiscoveryEvent(state, source, evt) {
    switch (evt.kind) {
        case "file-discovered": {
            state.totalDocuments += 1;
            yield {
                kind: "document-discovered",
                jobId: state.jobId,
                capsuleId: state.capsule.id,
                sourceId: source.id,
                relativePath: evt.relativePath,
                sizeBytes: evt.sizeBytes,
            };
            return;
        }
        case "scope-error": {
            state.failedDocuments += 1;
            const err = {
                code: `DISCOVERY_FAILED:${evt.error.code}`,
                message: evt.error.message,
            };
            state.lastError = err;
            const failure = {
                kind: "document-failed",
                jobId: state.jobId,
                capsuleId: state.capsule.id,
                sourceId: source.id,
            };
            if (evt.error.relativePath !== undefined) {
                failure.relativePath = evt.error.relativePath;
            }
            failure.error = err;
            yield failure;
            return;
        }
        case "cancelled":
        case "completed":
            return;
        default:
            // evt.kind === "file-extracted"
            yield* handleFileExtracted(state, evt.result);
    }
}
|
|
829
|
+
// True when the event is a "document-failed" event whose error code is
// INCOMPATIBLE_EMBEDDING_IDENTITY. Callers use this to stop processing further events.
function shouldStopAfterEvent(event) {
    if (event.kind !== "document-failed") {
        return false;
    }
    return event.error.code === "INCOMPATIBLE_EMBEDDING_IDENTITY";
}
|
|
832
|
+
// Streams the progress events produced for one discovery event. Job progress is
// persisted before each event is yielded. The generator's return value is true when it
// stopped early because shouldStopAfterEvent matched the event just yielded, false when
// all events were streamed.
async function* streamDiscoveryEvent(state, source, evt) {
    let stoppedEarly = false;
    for await (const progressEvent of handleDiscoveryEvent(state, source, evt)) {
        persistJobProgress(state);
        yield progressEvent;
        if (shouldStopAfterEvent(progressEvent)) {
            stoppedEarly = true;
            break;
        }
    }
    return stoppedEarly;
}
|
|
842
|
+
// ─── Per-source pipeline ──────────────────────────────────────────────────────
|
|
843
|
+
// Runs discovery + extraction for one source and yields the resulting progress events.
// Optional settings (largeDocumentPolicy, progressiveExtractors, extractionCapabilities)
// are forwarded to discoverAndExtract only when defined. Cancellation is checked before
// and after each discovery event is streamed. When an event asks for a stop (see
// streamDiscoveryEvent) the generator returns immediately and the source is not
// finalized; otherwise finalizeSourceRun is called once the loop ends.
async function* runOneSource(state, source) {
    const { options } = state;
    const discoveryDeps = {
        fs: options.workspaceFs,
        store: options.store,
        parserRegistry: options.parserRegistry,
    };
    for (const optionalKey of ["largeDocumentPolicy", "progressiveExtractors", "extractionCapabilities"]) {
        if (options[optionalKey] !== undefined) {
            discoveryDeps[optionalKey] = options[optionalKey];
        }
    }
    discoveryDeps.largeDocumentJobId = state.jobId;
    discoveryDeps.chunkingStrategyVersion = chunkingStrategyKey(options.chunkingOptions);
    const stream = discoverAndExtract(discoveryDeps, sourceDiscoveryParams(state, source));
    const progress = {
        cancelled: false,
        sawScopeError: false,
        completed: false,
        discoveredPaths: new Set(),
    };
    // Marks the run as cancelled and reports true when cancellation has been requested.
    const cancelledNow = () => {
        if (!cancellationRequested(state)) {
            return false;
        }
        progress.cancelled = true;
        return true;
    };
    for await (const evt of stream) {
        observeSourceEvent(progress, evt);
        if (cancelledNow()) {
            break;
        }
        const shouldStop = yield* streamDiscoveryEvent(state, source, evt);
        if (shouldStop) {
            return;
        }
        // After yielding a batch we re-check the signal — the consumer's awaiting iterator
        // may have aborted between events.
        if (cancelledNow()) {
            break;
        }
    }
    finalizeSourceRun(state, source, progress);
}
|
|
885
|
+
// Builds the per-source parameter object for discovery: the capsule id, the source
// itself and the resolved discovery options.
function sourceDiscoveryParams(state, source) {
    const capsuleId = state.capsule.id;
    const discovery = resolvedDiscoveryOptions(state);
    return { capsuleId, source, discovery };
}
|
|
892
|
+
// Folds one discovery event into the per-source progress record: remembers discovered
// paths and flags scope errors, cancellation and completion. Other kinds are ignored.
function observeSourceEvent(progress, evt) {
    switch (evt.kind) {
        case "file-discovered":
            progress.discoveredPaths.add(evt.relativePath);
            break;
        case "scope-error":
            progress.sawScopeError = true;
            break;
        case "cancelled":
            progress.cancelled = true;
            break;
        case "completed":
            progress.completed = true;
            break;
        default:
            break;
    }
}
|
|
909
|
+
// Deletes the stored document rows of a source whose document_path was not seen in
// this run's discovered paths. Rows whose path was discovered are left untouched.
function pruneDeletedSourceDocuments(state, source, discoveredPaths) {
    const persisted = listPersistedDocumentsForSource(state.options.store._internal.db, state.capsule.id, source.id);
    for (const document of persisted) {
        const stillPresent = discoveredPaths.has(document.document_path);
        if (!stillPresent) {
            deleteDocumentRow(state.options.store._internal.db, state.capsule.id, document.id);
        }
    }
}
|
|
917
|
+
// Prunes documents that disappeared from a source, but only after a trustworthy scan:
// the run was not cancelled, discovery completed without a scope error, and fewer paths
// were discovered than the resolved maxFiles limit. Otherwise it does nothing.
function finalizeSourceRun(state, source, progress) {
    const scanIsTrustworthy = !progress.cancelled && progress.completed && !progress.sawScopeError;
    if (!scanIsTrustworthy) {
        return;
    }
    const { maxFiles } = resolvedDiscoveryOptions(state);
    if (progress.discoveredPaths.size >= maxFiles) {
        return;
    }
    pruneDeletedSourceDocuments(state, source, progress.discoveredPaths);
}
|
|
926
|
+
// ─── Capsule resolution + job lifecycle ───────────────────────────────────────
|
|
927
|
+
// Loads the capsule named by options.capsuleId from options.store.
// Throws IndexingError("CAPSULE_NOT_FOUND") when the lookup returns undefined.
function resolveCapsule(options) {
    const found = getCapsule(options.store, options.capsuleId);
    if (found !== undefined) {
        return found;
    }
    throw new IndexingError("CAPSULE_NOT_FOUND", `capsule not found: ${String(options.capsuleId)}`);
}
|
|
934
|
+
// Creates the mutable run state for an indexing job.
// Sources are indexed once by stringified id (sourcesById). Batch size and concurrency
// go through their clamp helpers. The clock defaults to the store's internal clock and
// the id source to randomUUID. All counters start at zero and lastResumeToken at null.
function buildInitialState(options, capsule, sources, jobId, startedAt) {
    const sourcesById = new Map();
    for (const source of sources) {
        sourcesById.set(String(source.id), source);
    }
    const batchSize = clampBatchSize(options.batchSize);
    const concurrency = clampConcurrency(options.concurrency);
    const now = options.now ?? options.store._internal.now;
    const idSource = options.idSource ?? (() => randomUUID());
    const counters = {
        totalDocuments: 0,
        processedDocuments: 0,
        failedDocuments: 0,
        skippedDocuments: 0,
        vectorsPersisted: 0,
    };
    return {
        jobId,
        capsule,
        options,
        batchSize,
        concurrency,
        now,
        idSource,
        startedAt,
        sourcesById,
        ...counters,
        lastResumeToken: null,
    };
}
|
|
956
|
+
// Produces the job result summary from the run state: ids, the given status, the
// document/vector counters, start and finish times and the capsule's embedding
// identity. lastError is included only when the state has recorded one.
function buildResult(state, status, finishedAt) {
    const errorPart = state.lastError !== undefined ? { lastError: state.lastError } : {};
    const { totalDocuments, processedDocuments, failedDocuments, skippedDocuments, vectorsPersisted } = state;
    return {
        jobId: state.jobId,
        capsuleId: state.capsule.id,
        status,
        totalDocuments,
        processedDocuments,
        failedDocuments,
        skippedDocuments,
        vectorsPersisted,
        startedAt: state.startedAt,
        finishedAt,
        ...errorPart,
        embeddingIdentity: state.capsule.embeddingModelIdentity,
    };
}
|
|
972
|
+
// Checks, before indexing starts, that the embedding adapter can serve the capsule's
// embedding identity. Resolves to undefined when everything is compatible, otherwise to
// an error object { code, message }:
//  - INCOMPATIBLE_EMBEDDING_IDENTITY: the capability check reported "dimension-mismatch",
//    or the identity it returned is not compatible with the capsule's;
//  - EMBEDDING_ADAPTER_FAILED: any other capability failure, or an unexpected exception;
//  - CANCELLED: an exception while cancellation was requested, or an AbortError.
async function verifyEmbeddingPreflight(state) {
    try {
        const expected = state.capsule.embeddingModelIdentity;
        const request = {
            modelId: expected.modelId,
            provider: expected.provider,
            vectorMetric: expected.vectorMetric,
            expectedDimensions: expected.vectorDimensions,
        };
        if (state.options.signal !== undefined) {
            request.signal = state.options.signal;
        }
        const result = await verifyEmbeddingCapability(state.options.embeddingAdapter, request);
        if (!result.ok) {
            const code = result.reason === "dimension-mismatch"
                ? "INCOMPATIBLE_EMBEDDING_IDENTITY"
                : "EMBEDDING_ADAPTER_FAILED";
            return { code, message: result.safeMessage };
        }
        const compatibility = assertCompatibleEmbeddingIdentity(expected, result.identity);
        if (!compatibility.ok) {
            return {
                code: "INCOMPATIBLE_EMBEDDING_IDENTITY",
                message: compatibility.safeMessage,
            };
        }
        return undefined;
    }
    catch (cause) {
        const aborted = cancellationRequested(state) ||
            (cause instanceof DOMException && cause.name === "AbortError");
        if (aborted) {
            return { code: "CANCELLED", message: "indexing aborted via AbortSignal" };
        }
        return {
            code: "EMBEDDING_ADAPTER_FAILED",
            message: "embedding capability preflight failed before indexing started",
        };
    }
}
|
|
1008
|
+
/**
 * Records the start of a job: inserts the job row, then tries to mark the capsule as "indexing".
 *
 * @param {object} state - Run state; reads `jobId`, `capsule.id`, `startedAt` and `options.store`.
 * @param {Array<object>} sources - Sources whose `id`s are stored on the job row.
 * @returns {void}
 */
function persistStartedJob(state, sources) {
    const { store } = state.options;
    const jobRow = {
        id: state.jobId,
        capsuleId: state.capsule.id,
        sourceIds: sources.map((entry) => entry.id),
        startedAt: state.startedAt,
    };
    insertJobRow(store._internal.db, jobRow);
    try {
        updateCapsuleState(store, state.capsule.id, "indexing");
    }
    catch {
        // Deliberately best-effort: the capsule state column is informational, so a failure
        // to flip it must not abort the run. The events stream remains the source of truth.
    }
}
|
|
1023
|
+
/**
 * Lists the `id` of every source held in `state.sourcesById`, in the map's iteration order.
 *
 * @param {object} state - Run state containing the `sourcesById` map.
 * @returns {Array<*>} A new array of source ids.
 */
function sourceIdsForState(state) {
    return Array.from(state.sourcesById.values(), (entry) => entry.id);
}
|
|
1026
|
+
/**
 * Announces the start of a job on both channels: an "indexing-job-started" record on the
 * optional audit sink, and a "job-started" progress event routed through `emit`.
 *
 * @param {object} state - Run state; reads `jobId`, `capsule.id`, `startedAt` and `options.auditSink`.
 * @param {Array<object>} sources - Sources whose `id`s are reported.
 * @returns {object} The "job-started" event, as returned by `emit`.
 */
function emitJobStarted(state, sources) {
    const { jobId, startedAt } = state;
    const capsuleId = state.capsule.id;
    // Each consumer gets its own array so that no two payloads share a reference.
    const collectIds = () => sources.map((entry) => entry.id);
    const startedEvent = {
        kind: "job-started",
        jobId,
        capsuleId,
        sourceIds: collectIds(),
        startedAt,
    };
    const auditRecord = {
        kind: "indexing-job-started",
        capsuleId,
        sourceIds: collectIds(),
        jobId,
        occurredAt: startedAt,
    };
    state.options.auditSink?.emit(auditRecord);
    return emit(state, startedEvent);
}
|
|
1043
|
+
/**
 * Processes the sources one after another, re-yielding every event of each source and
 * persisting job progress after each source finishes.
 *
 * Stops before starting a source when cancellation has been requested or when the
 * previous source reported an identity failure.
 *
 * @param {object} state - Run state.
 * @param {Iterable<object>} sources - Sources to process, in order.
 * @returns {AsyncGenerator<object, object | undefined>} Yields progress events; the generator's
 *   return value is the identity-failure error, or `undefined` if none occurred.
 */
async function* runSourcesWithProgress(state, sources) {
    let failure;
    for (const current of sources) {
        const mayProceed = !cancellationRequested(state) && failure === undefined;
        if (!mayProceed) {
            break;
        }
        failure = yield* iterateSourceEvents(state, current);
        persistJobProgress(state);
    }
    return failure;
}
|
|
1054
|
+
// ─── Public entrypoint ────────────────────────────────────────────────────────
|
|
1055
|
+
/**
 * Runs one indexing job for a capsule and streams its progress events.
 *
 * Sequence: resolve capsule and sources, persist and announce the job start, run the
 * embedding preflight, process the sources, then finalize. Cancellation is checked right
 * after the start event and again after the preflight; in both cases the job is finalized
 * without a fatal failure and no source is processed.
 *
 * @param {object} options - Run options; `now` and `idSource` are optional overrides for the
 *   store clock and for `randomUUID`.
 * @returns {AsyncGenerator<object, void>} Progress events, ending with the terminal event
 *   produced by `finalize`.
 * @throws {IndexingError} Propagated from `resolveCapsule` when the capsule does not exist.
 */
export async function* runIndexingJob(options) {
    const capsule = resolveCapsule(options);
    const sources = resolveSources(options, capsule);
    const clock = options.now ?? options.store._internal.now;
    const startedAt = clock();
    const mintJobId = options.idSource ?? (() => randomUUID());
    const jobId = mintJobId();
    const state = buildInitialState(options, capsule, sources, jobId, startedAt);
    persistStartedJob(state, sources);
    yield emitJobStarted(state, sources);
    // Stays undefined on every cancellation path, so finalize sees no fatal failure.
    let fatalFailure;
    if (!cancellationRequested(state)) {
        const preflightFailure = await verifyEmbeddingPreflight(state);
        if (!cancellationRequested(state)) {
            if (preflightFailure === undefined) {
                fatalFailure = yield* runSourcesWithProgress(state, sources);
            }
            else {
                state.lastError = preflightFailure;
                fatalFailure = preflightFailure;
            }
        }
    }
    yield* finalize(state, fatalFailure);
}
|
|
1081
|
+
/**
 * Forwards every event produced by `runOneSource` for a single source to the outer generator.
 *
 * Iteration ends early, after the event has been yielded, when a "document-failed" event
 * carries the error code `INCOMPATIBLE_EMBEDDING_IDENTITY`.
 *
 * @param {object} state - Run state.
 * @param {object} source - The source to drain.
 * @returns {AsyncGenerator<object, object | undefined>} Yields each event; the generator's
 *   return value is the identity-failure error, or `undefined` if the source ran to the end.
 */
async function* iterateSourceEvents(state, source) {
    const isIdentityFailure = (event) => event.kind === "document-failed" &&
        event.error.code === "INCOMPATIBLE_EMBEDDING_IDENTITY";
    for await (const event of runOneSource(state, source)) {
        yield emit(state, event);
        if (isIdentityFailure(event)) {
            return event.error;
        }
    }
    return undefined;
}
|
|
1092
|
+
/**
 * Passes an event to `emitProgress` together with the run options, then hands it back
 * so callers can write `yield emit(state, event)`.
 *
 * @param {object} state - Run state; only `state.options` is read.
 * @param {object} event - The progress event.
 * @returns {object} The same `event` object.
 */
function emit(state, event) {
    const { options } = state;
    emitProgress(options, event);
    return event;
}
|
|
1096
|
+
/**
 * Decides the terminal status of a job.
 *
 * Precedence: a fatal failure yields "failed" (and is stored on `state.lastError`); otherwise a
 * requested cancellation yields "cancelled"; otherwise the job is "failed" only when at
 * least one document failed and none was processed; in every other case it is "succeeded".
 *
 * @param {object} state - Run state; `lastError` is written when `fatalFailure` is given.
 * @param {object | undefined} fatalFailure - Error that ended the run, if any.
 * @returns {"failed" | "cancelled" | "succeeded"} The terminal status.
 */
function resolveJobStatus(state, fatalFailure) {
    if (fatalFailure === undefined) {
        if (cancellationRequested(state)) {
            return "cancelled";
        }
        const everythingFailed = state.failedDocuments > 0 && state.processedDocuments === 0;
        return everythingFailed ? "failed" : "succeeded";
    }
    state.lastError = fatalFailure;
    return "failed";
}
|
|
1107
|
+
/**
 * Closes out a job: resolves its terminal status, finalizes the job row, tries to update
 * the capsule state ("ready" on success, "error" otherwise), and yields exactly one terminal
 * event ("job-cancelled", "job-failed" or "job-completed").
 *
 * An audit record is emitted for the failed and completed outcomes; the cancelled outcome
 * emits none.
 *
 * @param {object} state - Run state.
 * @param {object | undefined} fatalFailure - Error that ended the run, if any.
 * @returns {Generator<object, void>} Yields the single terminal event.
 */
function* finalize(state, fatalFailure) {
    const finishedAt = state.now();
    const status = resolveJobStatus(state, fatalFailure);
    const { jobId, options } = state;
    const capsuleId = state.capsule.id;
    finalizeJobRow(options.store._internal.db, {
        id: jobId,
        status,
        finishedAt,
        counters: buildCounters(state),
        ...(state.lastError !== undefined ? { lastError: state.lastError } : {}),
    });
    const capsuleState = status === "succeeded" ? "ready" : "error";
    try {
        updateCapsuleState(options.store, capsuleId, capsuleState);
    }
    catch {
        // Deliberately best-effort: the capsule state is treated as informational here,
        // so a failed update must not prevent the terminal event from being emitted.
    }
    const result = buildResult(state, status, finishedAt);
    switch (status) {
        case "cancelled": {
            yield emit(state, { kind: "job-cancelled", jobId, result });
            return;
        }
        case "failed": {
            // Fallback descriptor for a failed job that recorded no specific error.
            const failure = state.lastError ?? { code: "EMBEDDING_ADAPTER_FAILED", message: "indexing failed" };
            options.auditSink?.emit({
                kind: "indexing-job-failed",
                capsuleId,
                sourceIds: sourceIdsForState(state),
                jobId,
                errorCode: failure.code,
                occurredAt: finishedAt,
            });
            yield emit(state, { kind: "job-failed", jobId, error: failure, result });
            return;
        }
        default: {
            options.auditSink?.emit({
                kind: "indexing-job-completed",
                capsuleId,
                sourceIds: sourceIdsForState(state),
                jobId,
                processedDocuments: result.processedDocuments,
                failedDocuments: result.failedDocuments,
                occurredAt: finishedAt,
            });
            yield emit(state, { kind: "job-completed", jobId, result });
        }
    }
}
|