@oscharko-dev/keiko-local-knowledge 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/.tsbuildinfo +1 -0
- package/dist/bounded-document-extraction.d.ts +27 -0
- package/dist/bounded-document-extraction.d.ts.map +1 -0
- package/dist/bounded-document-extraction.js +214 -0
- package/dist/capsule-lifecycle.d.ts +33 -0
- package/dist/capsule-lifecycle.d.ts.map +1 -0
- package/dist/capsule-lifecycle.js +292 -0
- package/dist/capsule-set-lifecycle.d.ts +15 -0
- package/dist/capsule-set-lifecycle.d.ts.map +1 -0
- package/dist/capsule-set-lifecycle.js +158 -0
- package/dist/chunking/chunker-persist.d.ts +36 -0
- package/dist/chunking/chunker-persist.d.ts.map +1 -0
- package/dist/chunking/chunker-persist.js +74 -0
- package/dist/chunking/chunker-runner.d.ts +9 -0
- package/dist/chunking/chunker-runner.d.ts.map +1 -0
- package/dist/chunking/chunker-runner.js +218 -0
- package/dist/chunking/chunker.d.ts +7 -0
- package/dist/chunking/chunker.d.ts.map +1 -0
- package/dist/chunking/chunker.js +139 -0
- package/dist/chunking/citation-mapper.d.ts +4 -0
- package/dist/chunking/citation-mapper.d.ts.map +1 -0
- package/dist/chunking/citation-mapper.js +180 -0
- package/dist/chunking/index.d.ts +6 -0
- package/dist/chunking/index.d.ts.map +1 -0
- package/dist/chunking/index.js +8 -0
- package/dist/chunking/token-estimator.d.ts +3 -0
- package/dist/chunking/token-estimator.d.ts.map +1 -0
- package/dist/chunking/token-estimator.js +26 -0
- package/dist/chunking/types.d.ts +49 -0
- package/dist/chunking/types.d.ts.map +1 -0
- package/dist/chunking/types.js +26 -0
- package/dist/composition.d.ts +57 -0
- package/dist/composition.d.ts.map +1 -0
- package/dist/composition.js +310 -0
- package/dist/conversation/citation-attacher.d.ts +8 -0
- package/dist/conversation/citation-attacher.d.ts.map +1 -0
- package/dist/conversation/citation-attacher.js +55 -0
- package/dist/conversation/citation-excerpts.d.ts +4 -0
- package/dist/conversation/citation-excerpts.d.ts.map +1 -0
- package/dist/conversation/citation-excerpts.js +41 -0
- package/dist/conversation/grounded-answer-runner.d.ts +9 -0
- package/dist/conversation/grounded-answer-runner.d.ts.map +1 -0
- package/dist/conversation/grounded-answer-runner.js +61 -0
- package/dist/conversation/index.d.ts +5 -0
- package/dist/conversation/index.d.ts.map +1 -0
- package/dist/conversation/index.js +7 -0
- package/dist/conversation/model-gateway-answer-generator.d.ts +28 -0
- package/dist/conversation/model-gateway-answer-generator.d.ts.map +1 -0
- package/dist/conversation/model-gateway-answer-generator.js +105 -0
- package/dist/conversation/types.d.ts +35 -0
- package/dist/conversation/types.d.ts.map +1 -0
- package/dist/conversation/types.js +24 -0
- package/dist/discovery/discovery-runner.d.ts +23 -0
- package/dist/discovery/discovery-runner.d.ts.map +1 -0
- package/dist/discovery/discovery-runner.js +109 -0
- package/dist/discovery/extract-progressive.d.ts +17 -0
- package/dist/discovery/extract-progressive.d.ts.map +1 -0
- package/dist/discovery/extract-progressive.js +522 -0
- package/dist/discovery/extract.d.ts +26 -0
- package/dist/discovery/extract.d.ts.map +1 -0
- package/dist/discovery/extract.js +906 -0
- package/dist/discovery/glob.d.ts +10 -0
- package/dist/discovery/glob.d.ts.map +1 -0
- package/dist/discovery/glob.js +72 -0
- package/dist/discovery/index.d.ts +6 -0
- package/dist/discovery/index.d.ts.map +1 -0
- package/dist/discovery/index.js +8 -0
- package/dist/discovery/media-type.d.ts +4 -0
- package/dist/discovery/media-type.d.ts.map +1 -0
- package/dist/discovery/media-type.js +62 -0
- package/dist/discovery/persist.d.ts +63 -0
- package/dist/discovery/persist.d.ts.map +1 -0
- package/dist/discovery/persist.js +345 -0
- package/dist/discovery/test-support.d.ts +16 -0
- package/dist/discovery/test-support.d.ts.map +1 -0
- package/dist/discovery/test-support.js +127 -0
- package/dist/discovery/types.d.ts +63 -0
- package/dist/discovery/types.d.ts.map +1 -0
- package/dist/discovery/types.js +28 -0
- package/dist/discovery/walk.d.ts +12 -0
- package/dist/discovery/walk.d.ts.map +1 -0
- package/dist/discovery/walk.js +302 -0
- package/dist/errors.d.ts +13 -0
- package/dist/errors.d.ts.map +1 -0
- package/dist/errors.js +22 -0
- package/dist/evaluations/dimensions.d.ts +14 -0
- package/dist/evaluations/dimensions.d.ts.map +1 -0
- package/dist/evaluations/dimensions.js +191 -0
- package/dist/evaluations/fixtures.d.ts +18 -0
- package/dist/evaluations/fixtures.d.ts.map +1 -0
- package/dist/evaluations/fixtures.js +858 -0
- package/dist/evaluations/index.d.ts +7 -0
- package/dist/evaluations/index.d.ts.map +1 -0
- package/dist/evaluations/index.js +10 -0
- package/dist/evaluations/report.d.ts +3 -0
- package/dist/evaluations/report.d.ts.map +1 -0
- package/dist/evaluations/report.js +31 -0
- package/dist/evaluations/runner-seed.d.ts +12 -0
- package/dist/evaluations/runner-seed.d.ts.map +1 -0
- package/dist/evaluations/runner-seed.js +175 -0
- package/dist/evaluations/runner.d.ts +8 -0
- package/dist/evaluations/runner.d.ts.map +1 -0
- package/dist/evaluations/runner.js +205 -0
- package/dist/evaluations/scripted-embedding-adapter.d.ts +13 -0
- package/dist/evaluations/scripted-embedding-adapter.d.ts.map +1 -0
- package/dist/evaluations/scripted-embedding-adapter.js +163 -0
- package/dist/evaluations/types.d.ts +116 -0
- package/dist/evaluations/types.d.ts.map +1 -0
- package/dist/evaluations/types.js +27 -0
- package/dist/index.d.ts +23 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +41 -0
- package/dist/indexing/bounded-indexing.d.ts +41 -0
- package/dist/indexing/bounded-indexing.d.ts.map +1 -0
- package/dist/indexing/bounded-indexing.js +240 -0
- package/dist/indexing/checkpoint-persist.d.ts +8 -0
- package/dist/indexing/checkpoint-persist.d.ts.map +1 -0
- package/dist/indexing/checkpoint-persist.js +135 -0
- package/dist/indexing/checkpoint-resume.d.ts +20 -0
- package/dist/indexing/checkpoint-resume.d.ts.map +1 -0
- package/dist/indexing/checkpoint-resume.js +50 -0
- package/dist/indexing/embedding-batcher.d.ts +3 -0
- package/dist/indexing/embedding-batcher.d.ts.map +1 -0
- package/dist/indexing/embedding-batcher.js +390 -0
- package/dist/indexing/index.d.ts +7 -0
- package/dist/indexing/index.d.ts.map +1 -0
- package/dist/indexing/index.js +11 -0
- package/dist/indexing/job-persist.d.ts +46 -0
- package/dist/indexing/job-persist.d.ts.map +1 -0
- package/dist/indexing/job-persist.js +157 -0
- package/dist/indexing/job-resume.d.ts +4 -0
- package/dist/indexing/job-resume.d.ts.map +1 -0
- package/dist/indexing/job-resume.js +14 -0
- package/dist/indexing/orchestrator.d.ts +3 -0
- package/dist/indexing/orchestrator.d.ts.map +1 -0
- package/dist/indexing/orchestrator.js +1151 -0
- package/dist/indexing/types.d.ts +156 -0
- package/dist/indexing/types.d.ts.map +1 -0
- package/dist/indexing/types.js +30 -0
- package/dist/indexing/vector-persist.d.ts +32 -0
- package/dist/indexing/vector-persist.d.ts.map +1 -0
- package/dist/indexing/vector-persist.js +105 -0
- package/dist/parsers/_internal.d.ts +20 -0
- package/dist/parsers/_internal.d.ts.map +1 -0
- package/dist/parsers/_internal.js +122 -0
- package/dist/parsers/csv-parser.d.ts +3 -0
- package/dist/parsers/csv-parser.d.ts.map +1 -0
- package/dist/parsers/csv-parser.js +202 -0
- package/dist/parsers/docx-parser.d.ts +3 -0
- package/dist/parsers/docx-parser.d.ts.map +1 -0
- package/dist/parsers/docx-parser.js +390 -0
- package/dist/parsers/html-parser.d.ts +3 -0
- package/dist/parsers/html-parser.d.ts.map +1 -0
- package/dist/parsers/html-parser.js +310 -0
- package/dist/parsers/index.d.ts +15 -0
- package/dist/parsers/index.d.ts.map +1 -0
- package/dist/parsers/index.js +41 -0
- package/dist/parsers/json-parser.d.ts +3 -0
- package/dist/parsers/json-parser.d.ts.map +1 -0
- package/dist/parsers/json-parser.js +192 -0
- package/dist/parsers/large-document/capability-discovery.d.ts +27 -0
- package/dist/parsers/large-document/capability-discovery.d.ts.map +1 -0
- package/dist/parsers/large-document/capability-discovery.js +76 -0
- package/dist/parsers/large-document/diagnostics.d.ts +3 -0
- package/dist/parsers/large-document/diagnostics.d.ts.map +1 -0
- package/dist/parsers/large-document/diagnostics.js +11 -0
- package/dist/parsers/large-document/index.d.ts +15 -0
- package/dist/parsers/large-document/index.d.ts.map +1 -0
- package/dist/parsers/large-document/index.js +10 -0
- package/dist/parsers/large-document/legacy-format.d.ts +5 -0
- package/dist/parsers/large-document/legacy-format.d.ts.map +1 -0
- package/dist/parsers/large-document/legacy-format.js +25 -0
- package/dist/parsers/large-document/preflight.d.ts +9 -0
- package/dist/parsers/large-document/preflight.d.ts.map +1 -0
- package/dist/parsers/large-document/preflight.js +43 -0
- package/dist/parsers/large-document/progressive-extraction.d.ts +55 -0
- package/dist/parsers/large-document/progressive-extraction.d.ts.map +1 -0
- package/dist/parsers/large-document/progressive-extraction.js +123 -0
- package/dist/parsers/large-document/progressive-pdf.d.ts +20 -0
- package/dist/parsers/large-document/progressive-pdf.d.ts.map +1 -0
- package/dist/parsers/large-document/progressive-pdf.js +145 -0
- package/dist/parsers/large-document/synthetic-source.d.ts +9 -0
- package/dist/parsers/large-document/synthetic-source.d.ts.map +1 -0
- package/dist/parsers/large-document/synthetic-source.js +101 -0
- package/dist/parsers/large-document/window-builder.d.ts +24 -0
- package/dist/parsers/large-document/window-builder.d.ts.map +1 -0
- package/dist/parsers/large-document/window-builder.js +75 -0
- package/dist/parsers/ocr/index.d.ts +4 -0
- package/dist/parsers/ocr/index.d.ts.map +1 -0
- package/dist/parsers/ocr/index.js +4 -0
- package/dist/parsers/ocr/null-ocr-adapter.d.ts +3 -0
- package/dist/parsers/ocr/null-ocr-adapter.d.ts.map +1 -0
- package/dist/parsers/ocr/null-ocr-adapter.js +14 -0
- package/dist/parsers/ocr/ocr-pipeline-parser.d.ts +8 -0
- package/dist/parsers/ocr/ocr-pipeline-parser.d.ts.map +1 -0
- package/dist/parsers/ocr/ocr-pipeline-parser.js +147 -0
- package/dist/parsers/ocr/types.d.ts +16 -0
- package/dist/parsers/ocr/types.d.ts.map +1 -0
- package/dist/parsers/ocr/types.js +4 -0
- package/dist/parsers/parser-test-fixtures.d.ts +28 -0
- package/dist/parsers/parser-test-fixtures.d.ts.map +1 -0
- package/dist/parsers/parser-test-fixtures.js +139 -0
- package/dist/parsers/pdf-parser.d.ts +43 -0
- package/dist/parsers/pdf-parser.d.ts.map +1 -0
- package/dist/parsers/pdf-parser.js +388 -0
- package/dist/parsers/registry.d.ts +8 -0
- package/dist/parsers/registry.d.ts.map +1 -0
- package/dist/parsers/registry.js +57 -0
- package/dist/parsers/text-parser.d.ts +3 -0
- package/dist/parsers/text-parser.d.ts.map +1 -0
- package/dist/parsers/text-parser.js +214 -0
- package/dist/parsers/types.d.ts +53 -0
- package/dist/parsers/types.d.ts.map +1 -0
- package/dist/parsers/types.js +21 -0
- package/dist/parsers/unsupported-parser.d.ts +4 -0
- package/dist/parsers/unsupported-parser.d.ts.map +1 -0
- package/dist/parsers/unsupported-parser.js +97 -0
- package/dist/parsers/xlsx-parser.d.ts +3 -0
- package/dist/parsers/xlsx-parser.d.ts.map +1 -0
- package/dist/parsers/xlsx-parser.js +425 -0
- package/dist/privacy/audit-emitter.d.ts +5 -0
- package/dist/privacy/audit-emitter.d.ts.map +1 -0
- package/dist/privacy/audit-emitter.js +93 -0
- package/dist/privacy/diagnostic-redactor.d.ts +2 -0
- package/dist/privacy/diagnostic-redactor.d.ts.map +1 -0
- package/dist/privacy/diagnostic-redactor.js +153 -0
- package/dist/privacy/index.d.ts +5 -0
- package/dist/privacy/index.d.ts.map +1 -0
- package/dist/privacy/index.js +6 -0
- package/dist/privacy/retention-applier.d.ts +5 -0
- package/dist/privacy/retention-applier.d.ts.map +1 -0
- package/dist/privacy/retention-applier.js +88 -0
- package/dist/privacy/types.d.ts +98 -0
- package/dist/privacy/types.d.ts.map +1 -0
- package/dist/privacy/types.js +12 -0
- package/dist/qualityIntelligence/capsuleCorpus.d.ts +27 -0
- package/dist/qualityIntelligence/capsuleCorpus.d.ts.map +1 -0
- package/dist/qualityIntelligence/capsuleCorpus.js +58 -0
- package/dist/qualityIntelligence/index.d.ts +3 -0
- package/dist/qualityIntelligence/index.d.ts.map +1 -0
- package/dist/qualityIntelligence/index.js +5 -0
- package/dist/qualityIntelligence/qiHandoff.d.ts +36 -0
- package/dist/qualityIntelligence/qiHandoff.d.ts.map +1 -0
- package/dist/qualityIntelligence/qiHandoff.js +82 -0
- package/dist/retrieval/answer-grounding.d.ts +9 -0
- package/dist/retrieval/answer-grounding.d.ts.map +1 -0
- package/dist/retrieval/answer-grounding.js +31 -0
- package/dist/retrieval/context-pack-assembler.d.ts +24 -0
- package/dist/retrieval/context-pack-assembler.d.ts.map +1 -0
- package/dist/retrieval/context-pack-assembler.js +50 -0
- package/dist/retrieval/index.d.ts +6 -0
- package/dist/retrieval/index.d.ts.map +1 -0
- package/dist/retrieval/index.js +9 -0
- package/dist/retrieval/retrieval-runner.d.ts +10 -0
- package/dist/retrieval/retrieval-runner.d.ts.map +1 -0
- package/dist/retrieval/retrieval-runner.js +163 -0
- package/dist/retrieval/scoped-vector-search.d.ts +24 -0
- package/dist/retrieval/scoped-vector-search.d.ts.map +1 -0
- package/dist/retrieval/scoped-vector-search.js +864 -0
- package/dist/retrieval/types.d.ts +28 -0
- package/dist/retrieval/types.d.ts.map +1 -0
- package/dist/retrieval/types.js +33 -0
- package/dist/section-path-hash.d.ts +3 -0
- package/dist/section-path-hash.d.ts.map +1 -0
- package/dist/section-path-hash.js +9 -0
- package/dist/source-lifecycle.d.ts +14 -0
- package/dist/source-lifecycle.d.ts.map +1 -0
- package/dist/source-lifecycle.js +155 -0
- package/dist/source-routing-validation.d.ts +11 -0
- package/dist/source-routing-validation.d.ts.map +1 -0
- package/dist/source-routing-validation.js +140 -0
- package/dist/store-content-cipher.d.ts +11 -0
- package/dist/store-content-cipher.d.ts.map +1 -0
- package/dist/store-content-cipher.js +67 -0
- package/dist/store-content-encryption.d.ts +12 -0
- package/dist/store-content-encryption.d.ts.map +1 -0
- package/dist/store-content-encryption.js +275 -0
- package/dist/store-paths.d.ts +6 -0
- package/dist/store-paths.d.ts.map +1 -0
- package/dist/store-paths.js +61 -0
- package/dist/store.d.ts +30 -0
- package/dist/store.d.ts.map +1 -0
- package/dist/store.js +219 -0
- package/dist/testing.d.ts +47 -0
- package/dist/testing.d.ts.map +1 -0
- package/dist/testing.js +170 -0
- package/dist/version.d.ts +2 -0
- package/dist/version.d.ts.map +1 -0
- package/dist/version.js +4 -0
- package/package.json +43 -0
|
@@ -0,0 +1,522 @@
|
|
|
1
|
+
// Progressive large-document extraction wiring (Epic #1160, Issue #1286).
|
|
2
|
+
//
|
|
3
|
+
// Routes a preflight-classified large document through the page-windowed ProgressiveExtractor and
|
|
4
|
+
// persists each window in its own transaction so the JS working set never holds the whole document
|
|
5
|
+
// text and an interrupted job leaves durable, resumable progress. Extracted text is stored as
|
|
6
|
+
// bounded per-window rows (document_text_windows) instead of a single document_texts column; the
|
|
7
|
+
// orchestrator's bounded chunk/embed pass reads it back through readDocumentTextSpan.
|
|
8
|
+
//
|
|
9
|
+
// Extraction resumes from the durable page/text-window checkpoint when the source, parser, policy,
|
|
10
|
+
// chunking, and embedding fingerprint remain compatible. Chunking/embedding resume still happens
|
|
11
|
+
// downstream through chunk/vector coverage reconciliation.
|
|
12
|
+
import { checkpointCompatibility, LARGE_DOCUMENT_DIAGNOSTIC_CODES, largeDocumentPolicyFingerprint, } from "@oscharko-dev/keiko-contracts";
|
|
13
|
+
import { getCapsule } from "../capsule-lifecycle.js";
|
|
14
|
+
import { runProgressiveExtraction, } from "../parsers/index.js";
|
|
15
|
+
import { selectExtractionCheckpoint, upsertExtractionCheckpoint, } from "../indexing/checkpoint-persist.js";
|
|
16
|
+
import { basenameOf, extensionOf, mediaTypeFor } from "./media-type.js";
|
|
17
|
+
import { deleteDependentRows, insertDiagnosticRow, insertDocumentRow, insertDocumentTextWindowRow, insertPageRow, insertParsedUnitRow, updateDocumentStatusRow, } from "./persist.js";
|
|
18
|
+
import { documentIdFor } from "./types.js";
|
|
19
|
+
import { redactDiagnosticMessage } from "../privacy/diagnostic-redactor.js";
|
|
20
|
+
export function selectProgressiveExtractor(context, extension, mediaType) {
|
|
21
|
+
return context.extractors.find((e) => e.matches({ extension, mediaType }));
|
|
22
|
+
}
|
|
23
|
+
// Upserts the extraction checkpoint for the current document/job. Page, object, and byte cursors
// come from `state`; the section, chunk, and embedded-chunk cursors are always written as 0 here.
// `createdAt` is taken from deps, while `updatedAt` is a fresh deps.now() reading.
function checkpointFor(deps, state, phase, coverage, diagnostics) {
    const updatedAt = deps.now();
    const { capsuleId, documentId, jobId, strategy, retryCount, fingerprint, createdAt } = deps;
    const { pageCursor, objectCursor, extractedTextBytes } = state;
    const record = {
        capsuleId,
        documentId,
        jobId,
        strategy,
        phase,
        pageCursor,
        sectionCursor: 0,
        objectCursor,
        extractedTextBytes,
        chunkCursor: 0,
        embeddedChunkCursor: 0,
        retryCount,
        coverage,
        fingerprint,
        terminalDiagnostics: diagnostics,
        createdAt,
        updatedAt,
    };
    upsertExtractionCheckpoint(deps.store._internal.db, record);
}
|
|
45
|
+
// Persists one window in its own transaction: pages + parsed units + the bounded text window, then
// advances the checkpoint. Never retains more than this window's text.
//
// The advanced cursors are staged on a copy of `state` and copied back only after COMMIT succeeds,
// so a rolled-back window never leaves the in-memory sink state ahead of what the database holds.
// On any failure the transaction is rolled back and the original error is rethrown.
function persistWindow(deps, state, window) {
    const db = deps.store._internal.db;
    const cipher = deps.store._internal.contentCipher;
    // `state` must stay untouched until the transaction is durable, so mutate a copy instead.
    const staged = { ...state };
    db.exec("BEGIN");
    try {
        for (const page of window.pages)
            insertPageRow(db, deps.capsuleId, page);
        for (const unit of window.units) {
            // Parsed-unit ids continue the running unit counter across windows.
            insertParsedUnitRow(db, cipher, deps.capsuleId, `${String(deps.documentId)}#u${String(staged.unitIndex)}`, unit);
            staged.unitIndex += 1;
        }
        const characterEnd = window.characterStart + window.text.length;
        insertDocumentTextWindowRow(db, cipher, {
            capsuleId: deps.capsuleId,
            documentId: deps.documentId,
            windowIndex: staged.windowIndex,
            characterStart: window.characterStart,
            characterEnd,
            normalizedText: window.text,
        });
        staged.windowIndex += 1;
        staged.pageCursor = window.lastPageNumber;
        staged.objectCursor = window.objectCursor;
        staged.characterCursor = characterEnd;
        staged.extractedTextBytes += Buffer.byteLength(window.text, "utf8");
        // The checkpoint is written from the staged cursors inside the same transaction.
        checkpointFor(deps, staged, "extracting", "partial", []);
        db.exec("COMMIT");
    }
    catch (cause) {
        db.exec("ROLLBACK");
        throw cause;
    }
    // Only now is the window durable: publish the advanced cursors to the caller's state object.
    Object.assign(state, staged);
}
|
|
78
|
+
// Writes one diagnostic row per entry, keyed `<documentId>#d<position>`. Each stored diagnostic is
// stamped with deps.documentId, and deps.now() is read separately for every row.
function persistDiagnostics(deps, diagnostics) {
    const db = deps.store._internal.db;
    for (const [position, entry] of diagnostics.entries()) {
        const row = {
            id: `${String(deps.documentId)}#d${String(position)}`,
            capsuleId: deps.capsuleId,
            diagnostic: { ...entry, documentId: deps.documentId },
            createdAt: deps.now(),
        };
        insertDiagnosticRow(db, row);
    }
}
|
|
89
|
+
// Maps an extraction summary to the document status: a cancelled run stays "pending"; otherwise
// the document is "extracted" when at least one page was produced and "unsupported" when none was.
function documentStatusFor(summary) {
    const { stopReason, pageCount } = summary;
    if (stopReason === "cancelled") {
        return "pending";
    }
    if (pageCount > 0) {
        return "extracted";
    }
    return "unsupported";
}
|
|
94
|
+
// Terminal checkpoint phase for a finished run: "cancelled" when the summary says the run was
// cancelled, "extracted" for every other stop reason.
function finalizePhaseFor(summary) {
    return summary.stopReason === "cancelled" ? "cancelled" : "extracted";
}
|
|
99
|
+
// Closes out a progressive extraction in a single transaction: writes the document status derived
// from the summary, stores the diagnostics it is given (this function does not redact them
// itself), and records the terminal checkpoint with summary.coverage. On failure the transaction
// is rolled back and the error rethrown. Returns the status that was written.
function finalize(deps, state, summary, diagnostics) {
    const { db } = deps.store._internal;
    const terminalStatus = documentStatusFor(summary);
    db.exec("BEGIN");
    try {
        updateDocumentStatusRow(db, deps.capsuleId, deps.documentId, terminalStatus);
        persistDiagnostics(deps, diagnostics);
        const terminalPhase = finalizePhaseFor(summary);
        checkpointFor(deps, state, terminalPhase, summary.coverage, diagnostics);
        db.exec("COMMIT");
    }
    catch (failure) {
        db.exec("ROLLBACK");
        throw failure;
    }
    return terminalStatus;
}
|
|
117
|
+
// Assembles the checkpoint fingerprint from the source content hash, the extractor identity
// (`<strategyId>@<parserVersion>`), the policy fingerprint, the chunking strategy version, and the
// embedding identity supplied by the caller.
function buildFingerprint(context, extractor, contentHash, embeddingIdentity) {
    const parserVersion = `${extractor.strategyId}@${extractor.parserVersion}`;
    const policyFingerprint = largeDocumentPolicyFingerprint(context.policy);
    return {
        sourceContentHash: contentHash,
        parserVersion,
        policyFingerprint,
        chunkingStrategyVersion: context.chunkingStrategyVersion,
        embeddingIdentity,
    };
}
|
|
126
|
+
// Builds a diagnostic record from its parts; severity falls back to "warning" when omitted.
function parserDiagnostic(code, message, documentId, severity = "warning") {
    const diagnostic = { code, message, severity, documentId };
    return diagnostic;
}
|
|
129
|
+
// Chooses the path prefix handed to the diagnostic redactor: repository scopes use
// scope.repositoryRoot, every other scope kind (including "folder") uses scope.rootPath.
function redactionPrefixFor(params) {
    const scope = params.source.scope;
    return scope.kind === "repository" ? scope.repositoryRoot : scope.rootPath;
}
|
|
137
|
+
// Returns a copy of the diagnostic whose message has been passed through redactDiagnosticMessage
// with the redaction prefix chosen for params.source.scope. The input diagnostic is not modified.
function redactDiagnostic(diagnostic, params) {
    const rawMessage = diagnostic.message;
    const prefix = redactionPrefixFor(params);
    const message = redactDiagnosticMessage(rawMessage, prefix);
    return { ...diagnostic, message };
}
|
|
143
|
+
// Applies redactDiagnostic to every entry, preserving order, and returns the results as a new array.
function redactDiagnostics(diagnostics, params) {
    const redacted = [];
    for (const entry of diagnostics) {
        redacted.push(redactDiagnostic(entry, params));
    }
    return redacted;
}
|
|
146
|
+
// Warning-level diagnostic carrying the CHECKPOINT_INCOMPATIBLE code; its message lists the given
// reasons joined with ", ".
function checkpointIncompatibleDiagnostic(documentId, reasons) {
    const code = LARGE_DOCUMENT_DIAGNOSTIC_CODES.CHECKPOINT_INCOMPATIBLE;
    const message = `resume refused and restarted; changed: ${reasons.join(", ")}`;
    return parserDiagnostic(code, message, documentId, "warning");
}
|
|
149
|
+
// Error-level diagnostic carrying the RESOURCE_POLICY_EXCEEDED code with a fixed retry-limit message.
function retryLimitDiagnostic(documentId) {
    const code = LARGE_DOCUMENT_DIAGNOSTIC_CODES.RESOURCE_POLICY_EXCEEDED;
    const message = "large-document retry count exceeded the configured resource policy";
    return parserDiagnostic(code, message, documentId, "error");
}
|
|
152
|
+
// Error-level "MALFORMED_INPUT" diagnostic with a fixed, content-free message.
function parserFailureDiagnostic(documentId) {
    const message = "progressive parser rejected malformed or unsupported document";
    return parserDiagnostic("MALFORMED_INPUT", message, documentId, "error");
}
|
|
155
|
+
// Warning-level diagnostic carrying the MULTIMODAL_CAPABILITY_UNAVAILABLE code with a fixed
// message stating that indexing fell back to text-only coverage.
function multimodalCapabilityDiagnostic(documentId) {
    const code = LARGE_DOCUMENT_DIAGNOSTIC_CODES.MULTIMODAL_CAPABILITY_UNAVAILABLE;
    const message = "page image and diagram semantics were not extracted because no multimodal capability is configured; indexed with text-only coverage";
    return parserDiagnostic(code, message, documentId, "warning");
}
|
|
158
|
+
// Returns the multimodal-unavailable diagnostic (as a one-element array) only when all three hold:
// at least one page was extracted, the file's extension maps to "application/pdf", and
// context.capabilities.multimodal is not "available". Otherwise returns an empty array.
function capabilityDiagnostics(context, params, documentId, summary) {
    const nothingToReport = summary.pageCount === 0 ||
        mediaTypeFor(extensionOf(params.file.relativePath)) !== "application/pdf" ||
        context.capabilities.multimodal === "available";
    if (nothingToReport) {
        return [];
    }
    return [multimodalCapabilityDiagnostic(documentId)];
}
|
|
167
|
+
// Adjusts a base coverage value according to the diagnostics present:
//   - a PARTIAL_COVERAGE or OCR_CAPABILITY_UNAVAILABLE diagnostic turns any coverage other than
//     "none" into "partial" ("none" stays "none");
//   - otherwise a MULTIMODAL_CAPABILITY_UNAVAILABLE diagnostic turns "complete" into "text-only";
//   - otherwise the base value is returned unchanged.
function coverageWithDiagnostics(base, diagnostics) {
    const hasCode = (code) => diagnostics.some((entry) => entry.code === code);
    const degraded = hasCode(LARGE_DOCUMENT_DIAGNOSTIC_CODES.PARTIAL_COVERAGE) ||
        hasCode(LARGE_DOCUMENT_DIAGNOSTIC_CODES.OCR_CAPABILITY_UNAVAILABLE);
    if (degraded) {
        return base === "none" ? "none" : "partial";
    }
    const textOnly = base === "complete" &&
        hasCode(LARGE_DOCUMENT_DIAGNOSTIC_CODES.MULTIMODAL_CAPABILITY_UNAVAILABLE);
    return textOnly ? "text-only" : base;
}
|
|
178
|
+
function countRows(store, table, capsuleId, documentId) {
|
|
179
|
+
const row = store._internal.db
|
|
180
|
+
.prepare(`SELECT COUNT(*) AS n FROM ${table} WHERE capsule_id = :c AND document_id = :d`)
|
|
181
|
+
.get({ c: String(capsuleId), d: String(documentId) });
|
|
182
|
+
return row.n;
|
|
183
|
+
}
|
|
184
|
+
function maxWindowCharacterEnd(store, capsuleId, documentId) {
|
|
185
|
+
const row = store._internal.db
|
|
186
|
+
.prepare("SELECT COALESCE(MAX(character_end), 0) AS n FROM document_text_windows WHERE capsule_id = :c AND document_id = :d")
|
|
187
|
+
.get({ c: String(capsuleId), d: String(documentId) });
|
|
188
|
+
return row.n;
|
|
189
|
+
}
|
|
190
|
+
// Builds the in-memory document record returned to callers. The media type and display name are
// derived from the file's relative path; the parser descriptor includes `dependencyVersions` only
// when the extractor defines them.
function progressiveDocumentRecord(params, documentId, extractor, contentHash, status, now) {
    const { relativePath, sizeBytes } = params.file;
    const parser = {
        parserId: extractor.strategyId,
        parserVersion: extractor.parserVersion,
        ...(extractor.dependencyVersions === undefined
            ? {}
            : { dependencyVersions: extractor.dependencyVersions }),
    };
    return {
        id: documentId,
        capsuleId: params.capsuleId,
        sourceId: params.source.id,
        documentPath: relativePath,
        sizeBytes,
        mediaType: mediaTypeFor(extensionOf(relativePath)),
        contentHash,
        parser,
        lastExtractedAt: now,
        status,
        safeDisplayName: basenameOf(relativePath),
    };
}
|
|
211
|
+
function progressiveResult(params, document, summary) {
|
|
212
|
+
if (summary.stopReason === "cancelled") {
|
|
213
|
+
return {
|
|
214
|
+
capsuleId: params.capsuleId,
|
|
215
|
+
sourceId: params.source.id,
|
|
216
|
+
relativePath: params.file.relativePath,
|
|
217
|
+
outcome: {
|
|
218
|
+
kind: "failed",
|
|
219
|
+
document,
|
|
220
|
+
error: {
|
|
221
|
+
code: "CANCELLED",
|
|
222
|
+
message: "progressive extraction cancelled with persisted partial progress",
|
|
223
|
+
relativePath: params.file.relativePath,
|
|
224
|
+
},
|
|
225
|
+
},
|
|
226
|
+
diagnostics: summary.diagnostics,
|
|
227
|
+
};
|
|
228
|
+
}
|
|
229
|
+
return {
|
|
230
|
+
capsuleId: params.capsuleId,
|
|
231
|
+
sourceId: params.source.id,
|
|
232
|
+
relativePath: params.file.relativePath,
|
|
233
|
+
outcome: { kind: "persisted", document },
|
|
234
|
+
diagnostics: summary.diagnostics,
|
|
235
|
+
};
|
|
236
|
+
}
|
|
237
|
+
// In one transaction: writes the document row with status "pending", deletes the rows that depend
// on this document, and records an "extracting" checkpoint with coverage "none". Doing this before
// the first window lets page/parsed-unit foreign keys resolve and keeps a re-extract idempotent.
// Rolls back and rethrows on failure.
function insertPendingDocument(sinkDeps, params, extractor, contentHash, state) {
    const { db } = sinkDeps.store._internal;
    db.exec("BEGIN");
    try {
        const { relativePath, sizeBytes } = params.file;
        const pendingRow = {
            id: sinkDeps.documentId,
            capsuleId: params.capsuleId,
            sourceId: sinkDeps.sourceId,
            documentPath: relativePath,
            sizeBytes,
            mediaType: mediaTypeFor(extensionOf(relativePath)),
            contentHash,
            parserId: extractor.strategyId,
            parserVersion: extractor.parserVersion,
            lastExtractedAt: sinkDeps.now(),
            status: "pending",
            safeDisplayName: basenameOf(relativePath),
        };
        insertDocumentRow(db, pendingRow);
        deleteDependentRows(db, params.capsuleId, sinkDeps.documentId);
        checkpointFor(sinkDeps, state, "extracting", "none", []);
        db.exec("COMMIT");
    }
    catch (failure) {
        db.exec("ROLLBACK");
        throw failure;
    }
}
|
|
266
|
+
// Writes a single diagnostic row keyed `<documentId>#<suffix>`, outside any transaction of its
// own. The diagnostic is stored exactly as given.
function persistStandaloneDiagnostic(deps, diagnostic, suffix) {
    const row = {
        id: `${String(deps.documentId)}#${suffix}`,
        capsuleId: deps.capsuleId,
        diagnostic,
        createdAt: deps.now(),
    };
    insertDiagnosticRow(deps.store._internal.db, row);
}
|
|
274
|
+
// Records a failed progressive extraction in one transaction: the document row with status
// "failed", a `<documentId>#progressive-failure` diagnostic row, and a "failed" checkpoint whose
// coverage is "partial" when the page cursor has advanced and "none" otherwise. The diagnostic is
// redacted before anything is written, and the redacted copy is returned. Rolls back and rethrows
// on failure.
function insertProgressiveFailureRows(deps, params, extractor, contentHash, state, diagnostic) {
    const { db } = deps.store._internal;
    const failedAt = deps.now();
    const safeDiagnostic = redactDiagnostic(diagnostic, params);
    db.exec("BEGIN");
    try {
        const { relativePath, sizeBytes } = params.file;
        const failedRow = {
            id: deps.documentId,
            capsuleId: params.capsuleId,
            sourceId: deps.sourceId,
            documentPath: relativePath,
            sizeBytes,
            mediaType: mediaTypeFor(extensionOf(relativePath)),
            contentHash,
            parserId: extractor.strategyId,
            parserVersion: extractor.parserVersion,
            lastExtractedAt: failedAt,
            status: "failed",
            safeDisplayName: basenameOf(relativePath),
        };
        insertDocumentRow(db, failedRow);
        insertDiagnosticRow(db, {
            id: `${String(deps.documentId)}#progressive-failure`,
            capsuleId: params.capsuleId,
            diagnostic: safeDiagnostic,
            createdAt: failedAt,
        });
        const coverage = state.pageCursor > 0 ? "partial" : "none";
        checkpointFor(deps, state, "failed", coverage, [safeDiagnostic]);
        db.exec("COMMIT");
    }
    catch (failure) {
        db.exec("ROLLBACK");
        throw failure;
    }
    return safeDiagnostic;
}
|
|
311
|
+
// Shapes the result for a failed progressive extraction: a "failed" outcome with a "PARSER_FAILED"
// error whose message is the diagnostic's message, plus the diagnostic itself as the only entry
// in `diagnostics`. The attached document record is built with status "failed".
function progressiveFailureResult(deps, params, extractor, contentHash, diagnostic) {
    const failedDocument = progressiveDocumentRecord(params, deps.documentId, extractor, contentHash, "failed", deps.now());
    const relativePath = params.file.relativePath;
    const error = {
        code: "PARSER_FAILED",
        message: diagnostic.message,
        relativePath,
    };
    return {
        capsuleId: params.capsuleId,
        sourceId: params.source.id,
        relativePath,
        outcome: { kind: "failed", document: failedDocument, error },
        diagnostics: [diagnostic],
    };
}
|
|
329
|
+
// Persists the failure rows first, then builds the failure result from the redacted diagnostic
// that the persistence step returns.
function persistProgressiveFailure(deps, params, extractor, contentHash, state, diagnostic) {
    const safeDiagnostic = insertProgressiveFailureRows(deps, params, extractor, contentHash, state, diagnostic);
    const result = progressiveFailureResult(deps, params, extractor, contentHash, safeDiagnostic);
    return result;
}
|
|
333
|
+
// Builds the options object for a progressive extraction run. `signal` is included only when the
// context defines one, and the resume* cursors are included only when `resume` is truthy, in which
// case they are copied from the sink state.
function extractionOptionsFor(params, context, documentId, now, state, resume) {
    const extension = extensionOf(params.file.relativePath);
    const options = {
        documentId,
        extension,
        mediaType: mediaTypeFor(extension),
        policy: context.policy,
        now,
    };
    if (context.signal !== undefined) {
        Object.assign(options, { signal: context.signal });
    }
    if (resume) {
        Object.assign(options, {
            resumeFromPage: state.pageCursor,
            resumeCharacterStart: state.characterCursor,
            resumeWindowIndex: state.windowIndex,
            resumeObjectCursor: state.objectCursor,
            resumeExtractedTextBytes: state.extractedTextBytes,
        });
    }
    return options;
}
|
|
353
|
+
// Assembles the dependency bundle used by the persistence helpers in this file. Looks the capsule
// up first and throws when it does not exist; the capsule's embeddingModelIdentity feeds the
// checkpoint fingerprint. The clock (`now`) is taken from the store internals.
function buildSinkDeps(deps, params, context, extractor, documentId, contentHash, retryCount, createdAt) {
    const { store } = deps;
    const capsule = getCapsule(store, params.capsuleId);
    if (capsule === undefined) {
        throw new Error(`progressive extraction requires an existing capsule: ${String(params.capsuleId)}`);
    }
    const fingerprint = buildFingerprint(context, extractor, contentHash, capsule.embeddingModelIdentity);
    return {
        store,
        capsuleId: params.capsuleId,
        sourceId: String(params.source.id),
        documentId,
        jobId: context.jobId,
        strategy: extractor.strategyId,
        fingerprint,
        retryCount,
        createdAt,
        now: store._internal.now,
    };
}
|
|
371
|
+
// Returns a brand-new sink state with every index, cursor and byte counter set
// to zero. A fresh object is created on each call, so callers never share state.
function emptySinkState() {
    const START = 0;
    const state = {
        unitIndex: START,
        windowIndex: START,
        pageCursor: START,
        objectCursor: START,
        characterCursor: START,
        extractedTextBytes: START,
    };
    return state;
}
|
|
381
|
+
// Rebuilds the sink state used when resuming from a stored checkpoint.
// unitIndex and windowIndex come from countRows over the "parsed_units" and
// "document_text_windows" tables, and characterCursor from maxWindowCharacterEnd,
// i.e. they are derived from the store rather than from the checkpoint. The page
// cursor, object cursor and extracted byte count are taken from the checkpoint
// and fall back to 0 when the checkpoint or the individual field is null/undefined.
function checkpointSinkState(store, params, documentId, checkpoint) {
    const { capsuleId } = params;
    const unitIndex = countRows(store, "parsed_units", capsuleId, documentId);
    const windowIndex = countRows(store, "document_text_windows", capsuleId, documentId);
    const characterCursor = maxWindowCharacterEnd(store, capsuleId, documentId);
    return {
        unitIndex,
        windowIndex,
        pageCursor: checkpoint?.pageCursor ?? 0,
        objectCursor: checkpoint?.objectCursor ?? 0,
        characterCursor,
        extractedTextBytes: checkpoint?.extractedTextBytes ?? 0,
    };
}
|
|
391
|
+
// Plan for a run that starts from scratch: zeroed sink state, retry count 0,
// no resume and no startup diagnostics. `createdAt` is carried through unchanged.
function planForMissingCheckpoint(createdAt) {
    const state = emptySinkState();
    const startupDiagnostics = [];
    return { state, retryCount: 0, createdAt, resume: false, startupDiagnostics };
}
|
|
400
|
+
// Plan returned when resuming would exceed the retry limit. The failure is
// persisted immediately (using sink deps built with the existing checkpoint's
// createdAt and the new retry count) and attached to the plan as `failure`, so
// the caller can return it without running the extractor. `resume` is false.
function retryExceededPlan(deps, params, context, extractor, contentHash, provisional, existing, state, retryCount) {
    const { createdAt } = existing;
    const { documentId } = provisional;
    const sinkDeps = buildSinkDeps(deps, params, context, extractor, documentId, contentHash, retryCount, createdAt);
    const limitDiagnostic = retryLimitDiagnostic(documentId);
    const failure = persistProgressiveFailure(sinkDeps, params, extractor, contentHash, state, limitDiagnostic);
    return {
        state,
        retryCount,
        createdAt,
        resume: false,
        startupDiagnostics: [],
        failure,
    };
}
|
|
411
|
+
// Decides how to proceed when a checkpoint already exists for the document:
//  1. Fingerprints incompatible -> start over, carrying one
//     checkpoint-incompatible startup diagnostic.
//  2. Checkpoint phase is neither "extracting" nor "cancelled", or its page
//     cursor is <= 0 -> start over with no diagnostics.
//  3. Otherwise the retry count is incremented and the sink state is rebuilt from
//     the store/checkpoint; if the new count exceeds policy.maxRetryCount the
//     failure is persisted via retryExceededPlan, else a resume plan is returned.
function planForExistingCheckpoint(deps, params, context, extractor, contentHash, provisional, existing) {
    const compat = checkpointCompatibility(existing.fingerprint, provisional.fingerprint);
    if (!compat.compatible) {
        const freshPlan = planForMissingCheckpoint(provisional.createdAt);
        const incompatible = checkpointIncompatibleDiagnostic(provisional.documentId, compat.reasons);
        return { ...freshPlan, startupDiagnostics: [incompatible] };
    }
    const resumablePhase = existing.phase === "extracting" || existing.phase === "cancelled";
    // Written as a negated `<=` on purpose: it keeps the exact comparison of the
    // original condition for non-numeric cursor values.
    const hasProgress = !(existing.pageCursor <= 0);
    if (!(resumablePhase && hasProgress)) {
        return planForMissingCheckpoint(provisional.createdAt);
    }
    const retryCount = existing.retryCount + 1;
    const state = checkpointSinkState(deps.store, params, provisional.documentId, existing);
    const withinRetryLimit = !(retryCount > context.policy.maxRetryCount);
    if (withinRetryLimit) {
        return { state, retryCount, createdAt: existing.createdAt, resume: true, startupDiagnostics: [] };
    }
    return retryExceededPlan(deps, params, context, extractor, contentHash, provisional, existing, state, retryCount);
}
|
|
432
|
+
// Persists the initial rows for a planned run before the extractor starts.
// Resume: only a checkpoint is written (phase "extracting", second marker
// "partial") with the plan's startup diagnostics passed as-is.
// Fresh start: a pending document is inserted, each startup diagnostic is
// redacted and stored individually under the key `startup-d<index>`, and, if
// there was at least one, a checkpoint (phase "extracting", marker "none") is
// written with the redacted diagnostics.
function persistProgressiveStartup(sinkDeps, params, extractor, contentHash, plan) {
    const { state, startupDiagnostics } = plan;
    if (plan.resume) {
        checkpointFor(sinkDeps, state, "extracting", "partial", startupDiagnostics);
        return;
    }
    insertPendingDocument(sinkDeps, params, extractor, contentHash, state);
    for (const [index, diagnostic] of startupDiagnostics.entries()) {
        const redacted = redactDiagnostic(diagnostic, params);
        persistStandaloneDiagnostic(sinkDeps, redacted, `startup-d${String(index)}`);
    }
    if (startupDiagnostics.length === 0) {
        return;
    }
    checkpointFor(sinkDeps, state, "extracting", "none", redactDiagnostics(startupDiagnostics, params));
}
|
|
445
|
+
// Runs the progressive extractor, persisting every emitted window through
// persistWindow. Any error that surfaces from the awaited run is swallowed on
// purpose and reported as `undefined`; the caller in this file
// (runPlannedProgressiveExtraction) turns that into a persisted parser failure.
async function tryRunProgressiveExtraction(extractor, source, options, sinkDeps, state) {
    const sink = {
        onWindow: (extractedWindow) => {
            persistWindow(sinkDeps, state, extractedWindow);
        },
    };
    let summary;
    try {
        summary = await runProgressiveExtraction(extractor, source, options, sink);
    }
    catch {
        return undefined;
    }
    return summary;
}
|
|
457
|
+
// Returns a copy of the extraction summary whose diagnostics are the redacted
// concatenation of (in this order) the startup diagnostics, the summary's own
// diagnostics and the capability diagnostics, and whose coverage has been
// recomputed by coverageWithDiagnostics from that combined list.
function summaryWithDiagnostics(context, params, documentId, summary, startupDiagnostics) {
    const combined = [
        ...startupDiagnostics,
        ...summary.diagnostics,
        ...capabilityDiagnostics(context, params, documentId, summary),
    ];
    const diagnostics = redactDiagnostics(combined, params);
    const coverage = coverageWithDiagnostics(summary.coverage, diagnostics);
    return { ...summary, diagnostics, coverage };
}
|
|
469
|
+
// Derives the document id from the capsule id, the source id and the relative
// path. Note that the path is read from `context.relativePath`, not from
// `params.file.relativePath`.
function progressiveDocumentId(params, context) {
    const identity = {
        capsuleId: params.capsuleId,
        sourceId: params.source.id,
        relativePath: context.relativePath,
    };
    return documentIdFor(identity);
}
|
|
476
|
+
// Looks up the stored extraction checkpoint for the document and picks the plan:
// a from-scratch plan when none exists, otherwise whatever
// planForExistingCheckpoint decides.
function planForCheckpoint(deps, params, context, extractor, contentHash, provisional) {
    const { db } = deps.store._internal;
    const existing = selectExtractionCheckpoint(db, params.capsuleId, provisional.documentId);
    return existing === undefined
        ? planForMissingCheckpoint(provisional.createdAt)
        : planForExistingCheckpoint(deps, params, context, extractor, contentHash, provisional, existing);
}
|
|
482
|
+
// Prepares everything needed to start (or resume) a progressive extraction:
// the document id, provisional sink deps (retry count 0, createdAt taken from
// the store's clock right now) and the plan computed against any existing
// checkpoint.
function buildProgressiveStart(deps, params, context, contentHash, extractor) {
    const documentId = progressiveDocumentId(params, context);
    const provisionalCreatedAt = deps.store._internal.now();
    const provisionalSinkDeps = buildSinkDeps(deps, params, context, extractor, documentId, contentHash, 0, provisionalCreatedAt);
    const plan = planForCheckpoint(deps, params, context, extractor, contentHash, provisionalSinkDeps);
    return { documentId, provisionalSinkDeps, plan };
}
|
|
491
|
+
// Returns the sink deps that match the chosen plan. The provisional deps are
// reused when both the retry count and createdAt are strictly equal to the
// plan's; otherwise new deps are built with the plan's values.
function sinkDepsForPlan(deps, params, context, extractor, contentHash, start) {
    const { plan, provisionalSinkDeps } = start;
    const sameRetryCount = plan.retryCount === provisionalSinkDeps.retryCount;
    const provisionalStillValid = sameRetryCount && plan.createdAt === provisionalSinkDeps.createdAt;
    return provisionalStillValid
        ? provisionalSinkDeps
        : buildSinkDeps(deps, params, context, extractor, start.documentId, contentHash, plan.retryCount, plan.createdAt);
}
|
|
499
|
+
// Completes a run whose extractor returned a summary: merges and redacts the
// diagnostics, calls finalize (whose return value becomes the document status),
// builds the document record timestamped with sinkDeps.now(), and wraps
// everything with progressiveResult.
function finalizeProgressiveSuccess(sinkDeps, params, context, contentHash, extractor, plan, summary) {
    const { documentId } = sinkDeps;
    const finalSummary = summaryWithDiagnostics(context, params, documentId, summary, plan.startupDiagnostics);
    const status = finalize(sinkDeps, plan.state, finalSummary, finalSummary.diagnostics);
    const finishedAt = sinkDeps.now();
    const record = progressiveDocumentRecord(params, documentId, extractor, contentHash, status, finishedAt);
    return progressiveResult(params, record, finalSummary);
}
|
|
505
|
+
// Executes a plan: persists the startup rows, runs the extractor, then either
// finalizes the successful summary or, when the run yielded `undefined` (the
// extractor threw), persists and returns a parser-failure result.
async function runPlannedProgressiveExtraction(sinkDeps, params, context, source, contentHash, extractor, plan) {
    persistProgressiveStartup(sinkDeps, params, extractor, contentHash, plan);
    const options = extractionOptionsFor(params, context, sinkDeps.documentId, sinkDeps.now, plan.state, plan.resume);
    const summary = await tryRunProgressiveExtraction(extractor, source, options, sinkDeps, plan.state);
    if (summary !== undefined) {
        return finalizeProgressiveSuccess(sinkDeps, params, context, contentHash, extractor, plan, summary);
    }
    const diagnostic = parserFailureDiagnostic(sinkDeps.documentId);
    return persistProgressiveFailure(sinkDeps, params, extractor, contentHash, plan.state, diagnostic);
}
|
|
513
|
+
// Entry point for page-windowed progressive extraction of a single large
// document with bounded persistence. The caller passes a bounded byte source
// plus a content hash that was computed through bounded reads; extracted text,
// pages and checkpoints are flushed window by window so that peak memory does
// not grow with the raw document size.
// If planning already produced a failure (retry limit exceeded), that failure is
// returned without running the extractor.
export async function extractDocumentProgressive(deps, params, context, source, contentHash, extractor) {
    const start = buildProgressiveStart(deps, params, context, contentHash, extractor);
    const { failure } = start.plan;
    if (failure !== undefined) {
        return failure;
    }
    const sinkDeps = sinkDepsForPlan(deps, params, context, extractor, contentHash, start);
    return await runPlannedProgressiveExtraction(sinkDeps, params, context, source, contentHash, extractor, start.plan);
}
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
import type { ExtractionCapabilityAvailability, KnowledgeCapsuleId, KnowledgeSource, LargeDocumentResourcePolicy } from "@oscharko-dev/keiko-contracts";
|
|
2
|
+
import type { WorkspaceFs } from "@oscharko-dev/keiko-workspace";
|
|
3
|
+
import type { ParserOptions, ParserRegistry, ProgressiveExtractor } from "../parsers/index.js";
|
|
4
|
+
import type { KnowledgeStore } from "../store.js";
|
|
5
|
+
import { type DiscoveredFile, type DiscoveryError, type ExtractionResult } from "./types.js";
|
|
6
|
+
/**
 * Per-call arguments accepted by `extractDocument` and (extended with an
 * `error`) by `recordExtractionFailure`. All members are read-only.
 */
export interface ExtractDocumentParams {
    /** Identifier of the capsule involved in the call. */
    readonly capsuleId: KnowledgeCapsuleId;
    /** The knowledge source involved in the call. */
    readonly source: KnowledgeSource;
    /** The discovered file involved in the call. */
    readonly file: DiscoveredFile;
    /** Optional parser options; may be omitted. */
    readonly parserOptions?: ParserOptions;
}
|
|
12
|
+
/**
 * Collaborators injected into `extractDocument` / `recordExtractionFailure`.
 * `fs`, `store` and `parserRegistry` are required; every other member is
 * optional. All members are read-only.
 */
export interface ExtractDocumentDeps {
    /** Workspace file-system abstraction. */
    readonly fs: WorkspaceFs;
    /** Knowledge store instance. */
    readonly store: KnowledgeStore;
    /** Registry of available parsers. */
    readonly parserRegistry: ParserRegistry;
    /** Optional resource policy for large documents. */
    readonly largeDocumentPolicy?: LargeDocumentResourcePolicy;
    /** Optional description of which extraction capabilities are available. */
    readonly extractionCapabilities?: ExtractionCapabilityAvailability;
    /** Optional list of progressive extractors. */
    readonly progressiveExtractors?: readonly ProgressiveExtractor[];
    /** Optional job id for large-document work. */
    readonly largeDocumentJobId?: string;
    /** Optional chunking strategy version string. */
    readonly chunkingStrategyVersion?: string;
}
|
|
22
|
+
/**
 * Asynchronous extraction entry point: takes the injected dependencies and the
 * per-call parameters and resolves to an `ExtractionResult`.
 */
export declare function extractDocument(deps: ExtractDocumentDeps, params: ExtractDocumentParams): Promise<ExtractionResult>;
|
|
23
|
+
/**
 * Synchronous counterpart that takes the same parameters as `extractDocument`
 * plus a required `error` (a `DiscoveryError`) and returns an
 * `ExtractionResult` directly rather than a promise.
 */
export declare function recordExtractionFailure(
    deps: ExtractDocumentDeps,
    params: ExtractDocumentParams & { readonly error: DiscoveryError },
): ExtractionResult;
|
|
26
|
+
//# sourceMappingURL=extract.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"extract.d.ts","sourceRoot":"","sources":["../../src/discovery/extract.ts"],"names":[],"mappings":"AAkBA,OAAO,KAAK,EAGV,gCAAgC,EAChC,kBAAkB,EAClB,eAAe,EAEf,2BAA2B,EAG5B,MAAM,+BAA+B,CAAC;AAMvC,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,+BAA+B,CAAC;AAGjE,OAAO,KAAK,EAGV,aAAa,EACb,cAAc,EAEd,oBAAoB,EACrB,MAAM,qBAAqB,CAAC;AAmB7B,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAalD,OAAO,EAEL,KAAK,cAAc,EACnB,KAAK,cAAc,EAGnB,KAAK,gBAAgB,EACtB,MAAM,YAAY,CAAC;AAEpB,MAAM,WAAW,qBAAqB;IACpC,QAAQ,CAAC,SAAS,EAAE,kBAAkB,CAAC;IACvC,QAAQ,CAAC,MAAM,EAAE,eAAe,CAAC;IACjC,QAAQ,CAAC,IAAI,EAAE,cAAc,CAAC;IAC9B,QAAQ,CAAC,aAAa,CAAC,EAAE,aAAa,CAAC;CACxC;AAED,MAAM,WAAW,mBAAmB;IAClC,QAAQ,CAAC,EAAE,EAAE,WAAW,CAAC;IACzB,QAAQ,CAAC,KAAK,EAAE,cAAc,CAAC;IAC/B,QAAQ,CAAC,cAAc,EAAE,cAAc,CAAC;IAIxC,QAAQ,CAAC,mBAAmB,CAAC,EAAE,2BAA2B,CAAC;IAC3D,QAAQ,CAAC,sBAAsB,CAAC,EAAE,gCAAgC,CAAC;IAEnE,QAAQ,CAAC,qBAAqB,CAAC,EAAE,SAAS,oBAAoB,EAAE,CAAC;IACjE,QAAQ,CAAC,kBAAkB,CAAC,EAAE,MAAM,CAAC;IAGrC,QAAQ,CAAC,uBAAuB,CAAC,EAAE,MAAM,CAAC;CAC3C;AA8hCD,wBAAsB,eAAe,CACnC,IAAI,EAAE,mBAAmB,EACzB,MAAM,EAAE,qBAAqB,GAC5B,OAAO,CAAC,gBAAgB,CAAC,CAqB3B;AAED,wBAAgB,uBAAuB,CACrC,IAAI,EAAE,mBAAmB,EACzB,MAAM,EAAE,qBAAqB,GAAG;IAAE,QAAQ,CAAC,KAAK,EAAE,cAAc,CAAA;CAAE,GACjE,gBAAgB,CAOlB"}
|