@oscharko-dev/keiko-local-knowledge 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/.tsbuildinfo +1 -0
- package/dist/bounded-document-extraction.d.ts +27 -0
- package/dist/bounded-document-extraction.d.ts.map +1 -0
- package/dist/bounded-document-extraction.js +214 -0
- package/dist/capsule-lifecycle.d.ts +33 -0
- package/dist/capsule-lifecycle.d.ts.map +1 -0
- package/dist/capsule-lifecycle.js +292 -0
- package/dist/capsule-set-lifecycle.d.ts +15 -0
- package/dist/capsule-set-lifecycle.d.ts.map +1 -0
- package/dist/capsule-set-lifecycle.js +158 -0
- package/dist/chunking/chunker-persist.d.ts +36 -0
- package/dist/chunking/chunker-persist.d.ts.map +1 -0
- package/dist/chunking/chunker-persist.js +74 -0
- package/dist/chunking/chunker-runner.d.ts +9 -0
- package/dist/chunking/chunker-runner.d.ts.map +1 -0
- package/dist/chunking/chunker-runner.js +218 -0
- package/dist/chunking/chunker.d.ts +7 -0
- package/dist/chunking/chunker.d.ts.map +1 -0
- package/dist/chunking/chunker.js +139 -0
- package/dist/chunking/citation-mapper.d.ts +4 -0
- package/dist/chunking/citation-mapper.d.ts.map +1 -0
- package/dist/chunking/citation-mapper.js +180 -0
- package/dist/chunking/index.d.ts +6 -0
- package/dist/chunking/index.d.ts.map +1 -0
- package/dist/chunking/index.js +8 -0
- package/dist/chunking/token-estimator.d.ts +3 -0
- package/dist/chunking/token-estimator.d.ts.map +1 -0
- package/dist/chunking/token-estimator.js +26 -0
- package/dist/chunking/types.d.ts +49 -0
- package/dist/chunking/types.d.ts.map +1 -0
- package/dist/chunking/types.js +26 -0
- package/dist/composition.d.ts +57 -0
- package/dist/composition.d.ts.map +1 -0
- package/dist/composition.js +310 -0
- package/dist/conversation/citation-attacher.d.ts +8 -0
- package/dist/conversation/citation-attacher.d.ts.map +1 -0
- package/dist/conversation/citation-attacher.js +55 -0
- package/dist/conversation/citation-excerpts.d.ts +4 -0
- package/dist/conversation/citation-excerpts.d.ts.map +1 -0
- package/dist/conversation/citation-excerpts.js +41 -0
- package/dist/conversation/grounded-answer-runner.d.ts +9 -0
- package/dist/conversation/grounded-answer-runner.d.ts.map +1 -0
- package/dist/conversation/grounded-answer-runner.js +61 -0
- package/dist/conversation/index.d.ts +5 -0
- package/dist/conversation/index.d.ts.map +1 -0
- package/dist/conversation/index.js +7 -0
- package/dist/conversation/model-gateway-answer-generator.d.ts +28 -0
- package/dist/conversation/model-gateway-answer-generator.d.ts.map +1 -0
- package/dist/conversation/model-gateway-answer-generator.js +105 -0
- package/dist/conversation/types.d.ts +35 -0
- package/dist/conversation/types.d.ts.map +1 -0
- package/dist/conversation/types.js +24 -0
- package/dist/discovery/discovery-runner.d.ts +23 -0
- package/dist/discovery/discovery-runner.d.ts.map +1 -0
- package/dist/discovery/discovery-runner.js +109 -0
- package/dist/discovery/extract-progressive.d.ts +17 -0
- package/dist/discovery/extract-progressive.d.ts.map +1 -0
- package/dist/discovery/extract-progressive.js +522 -0
- package/dist/discovery/extract.d.ts +26 -0
- package/dist/discovery/extract.d.ts.map +1 -0
- package/dist/discovery/extract.js +906 -0
- package/dist/discovery/glob.d.ts +10 -0
- package/dist/discovery/glob.d.ts.map +1 -0
- package/dist/discovery/glob.js +72 -0
- package/dist/discovery/index.d.ts +6 -0
- package/dist/discovery/index.d.ts.map +1 -0
- package/dist/discovery/index.js +8 -0
- package/dist/discovery/media-type.d.ts +4 -0
- package/dist/discovery/media-type.d.ts.map +1 -0
- package/dist/discovery/media-type.js +62 -0
- package/dist/discovery/persist.d.ts +63 -0
- package/dist/discovery/persist.d.ts.map +1 -0
- package/dist/discovery/persist.js +345 -0
- package/dist/discovery/test-support.d.ts +16 -0
- package/dist/discovery/test-support.d.ts.map +1 -0
- package/dist/discovery/test-support.js +127 -0
- package/dist/discovery/types.d.ts +63 -0
- package/dist/discovery/types.d.ts.map +1 -0
- package/dist/discovery/types.js +28 -0
- package/dist/discovery/walk.d.ts +12 -0
- package/dist/discovery/walk.d.ts.map +1 -0
- package/dist/discovery/walk.js +302 -0
- package/dist/errors.d.ts +13 -0
- package/dist/errors.d.ts.map +1 -0
- package/dist/errors.js +22 -0
- package/dist/evaluations/dimensions.d.ts +14 -0
- package/dist/evaluations/dimensions.d.ts.map +1 -0
- package/dist/evaluations/dimensions.js +191 -0
- package/dist/evaluations/fixtures.d.ts +18 -0
- package/dist/evaluations/fixtures.d.ts.map +1 -0
- package/dist/evaluations/fixtures.js +858 -0
- package/dist/evaluations/index.d.ts +7 -0
- package/dist/evaluations/index.d.ts.map +1 -0
- package/dist/evaluations/index.js +10 -0
- package/dist/evaluations/report.d.ts +3 -0
- package/dist/evaluations/report.d.ts.map +1 -0
- package/dist/evaluations/report.js +31 -0
- package/dist/evaluations/runner-seed.d.ts +12 -0
- package/dist/evaluations/runner-seed.d.ts.map +1 -0
- package/dist/evaluations/runner-seed.js +175 -0
- package/dist/evaluations/runner.d.ts +8 -0
- package/dist/evaluations/runner.d.ts.map +1 -0
- package/dist/evaluations/runner.js +205 -0
- package/dist/evaluations/scripted-embedding-adapter.d.ts +13 -0
- package/dist/evaluations/scripted-embedding-adapter.d.ts.map +1 -0
- package/dist/evaluations/scripted-embedding-adapter.js +163 -0
- package/dist/evaluations/types.d.ts +116 -0
- package/dist/evaluations/types.d.ts.map +1 -0
- package/dist/evaluations/types.js +27 -0
- package/dist/index.d.ts +23 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +41 -0
- package/dist/indexing/bounded-indexing.d.ts +41 -0
- package/dist/indexing/bounded-indexing.d.ts.map +1 -0
- package/dist/indexing/bounded-indexing.js +240 -0
- package/dist/indexing/checkpoint-persist.d.ts +8 -0
- package/dist/indexing/checkpoint-persist.d.ts.map +1 -0
- package/dist/indexing/checkpoint-persist.js +135 -0
- package/dist/indexing/checkpoint-resume.d.ts +20 -0
- package/dist/indexing/checkpoint-resume.d.ts.map +1 -0
- package/dist/indexing/checkpoint-resume.js +50 -0
- package/dist/indexing/embedding-batcher.d.ts +3 -0
- package/dist/indexing/embedding-batcher.d.ts.map +1 -0
- package/dist/indexing/embedding-batcher.js +390 -0
- package/dist/indexing/index.d.ts +7 -0
- package/dist/indexing/index.d.ts.map +1 -0
- package/dist/indexing/index.js +11 -0
- package/dist/indexing/job-persist.d.ts +46 -0
- package/dist/indexing/job-persist.d.ts.map +1 -0
- package/dist/indexing/job-persist.js +157 -0
- package/dist/indexing/job-resume.d.ts +4 -0
- package/dist/indexing/job-resume.d.ts.map +1 -0
- package/dist/indexing/job-resume.js +14 -0
- package/dist/indexing/orchestrator.d.ts +3 -0
- package/dist/indexing/orchestrator.d.ts.map +1 -0
- package/dist/indexing/orchestrator.js +1151 -0
- package/dist/indexing/types.d.ts +156 -0
- package/dist/indexing/types.d.ts.map +1 -0
- package/dist/indexing/types.js +30 -0
- package/dist/indexing/vector-persist.d.ts +32 -0
- package/dist/indexing/vector-persist.d.ts.map +1 -0
- package/dist/indexing/vector-persist.js +105 -0
- package/dist/parsers/_internal.d.ts +20 -0
- package/dist/parsers/_internal.d.ts.map +1 -0
- package/dist/parsers/_internal.js +122 -0
- package/dist/parsers/csv-parser.d.ts +3 -0
- package/dist/parsers/csv-parser.d.ts.map +1 -0
- package/dist/parsers/csv-parser.js +202 -0
- package/dist/parsers/docx-parser.d.ts +3 -0
- package/dist/parsers/docx-parser.d.ts.map +1 -0
- package/dist/parsers/docx-parser.js +390 -0
- package/dist/parsers/html-parser.d.ts +3 -0
- package/dist/parsers/html-parser.d.ts.map +1 -0
- package/dist/parsers/html-parser.js +310 -0
- package/dist/parsers/index.d.ts +15 -0
- package/dist/parsers/index.d.ts.map +1 -0
- package/dist/parsers/index.js +41 -0
- package/dist/parsers/json-parser.d.ts +3 -0
- package/dist/parsers/json-parser.d.ts.map +1 -0
- package/dist/parsers/json-parser.js +192 -0
- package/dist/parsers/large-document/capability-discovery.d.ts +27 -0
- package/dist/parsers/large-document/capability-discovery.d.ts.map +1 -0
- package/dist/parsers/large-document/capability-discovery.js +76 -0
- package/dist/parsers/large-document/diagnostics.d.ts +3 -0
- package/dist/parsers/large-document/diagnostics.d.ts.map +1 -0
- package/dist/parsers/large-document/diagnostics.js +11 -0
- package/dist/parsers/large-document/index.d.ts +15 -0
- package/dist/parsers/large-document/index.d.ts.map +1 -0
- package/dist/parsers/large-document/index.js +10 -0
- package/dist/parsers/large-document/legacy-format.d.ts +5 -0
- package/dist/parsers/large-document/legacy-format.d.ts.map +1 -0
- package/dist/parsers/large-document/legacy-format.js +25 -0
- package/dist/parsers/large-document/preflight.d.ts +9 -0
- package/dist/parsers/large-document/preflight.d.ts.map +1 -0
- package/dist/parsers/large-document/preflight.js +43 -0
- package/dist/parsers/large-document/progressive-extraction.d.ts +55 -0
- package/dist/parsers/large-document/progressive-extraction.d.ts.map +1 -0
- package/dist/parsers/large-document/progressive-extraction.js +123 -0
- package/dist/parsers/large-document/progressive-pdf.d.ts +20 -0
- package/dist/parsers/large-document/progressive-pdf.d.ts.map +1 -0
- package/dist/parsers/large-document/progressive-pdf.js +145 -0
- package/dist/parsers/large-document/synthetic-source.d.ts +9 -0
- package/dist/parsers/large-document/synthetic-source.d.ts.map +1 -0
- package/dist/parsers/large-document/synthetic-source.js +101 -0
- package/dist/parsers/large-document/window-builder.d.ts +24 -0
- package/dist/parsers/large-document/window-builder.d.ts.map +1 -0
- package/dist/parsers/large-document/window-builder.js +75 -0
- package/dist/parsers/ocr/index.d.ts +4 -0
- package/dist/parsers/ocr/index.d.ts.map +1 -0
- package/dist/parsers/ocr/index.js +4 -0
- package/dist/parsers/ocr/null-ocr-adapter.d.ts +3 -0
- package/dist/parsers/ocr/null-ocr-adapter.d.ts.map +1 -0
- package/dist/parsers/ocr/null-ocr-adapter.js +14 -0
- package/dist/parsers/ocr/ocr-pipeline-parser.d.ts +8 -0
- package/dist/parsers/ocr/ocr-pipeline-parser.d.ts.map +1 -0
- package/dist/parsers/ocr/ocr-pipeline-parser.js +147 -0
- package/dist/parsers/ocr/types.d.ts +16 -0
- package/dist/parsers/ocr/types.d.ts.map +1 -0
- package/dist/parsers/ocr/types.js +4 -0
- package/dist/parsers/parser-test-fixtures.d.ts +28 -0
- package/dist/parsers/parser-test-fixtures.d.ts.map +1 -0
- package/dist/parsers/parser-test-fixtures.js +139 -0
- package/dist/parsers/pdf-parser.d.ts +43 -0
- package/dist/parsers/pdf-parser.d.ts.map +1 -0
- package/dist/parsers/pdf-parser.js +388 -0
- package/dist/parsers/registry.d.ts +8 -0
- package/dist/parsers/registry.d.ts.map +1 -0
- package/dist/parsers/registry.js +57 -0
- package/dist/parsers/text-parser.d.ts +3 -0
- package/dist/parsers/text-parser.d.ts.map +1 -0
- package/dist/parsers/text-parser.js +214 -0
- package/dist/parsers/types.d.ts +53 -0
- package/dist/parsers/types.d.ts.map +1 -0
- package/dist/parsers/types.js +21 -0
- package/dist/parsers/unsupported-parser.d.ts +4 -0
- package/dist/parsers/unsupported-parser.d.ts.map +1 -0
- package/dist/parsers/unsupported-parser.js +97 -0
- package/dist/parsers/xlsx-parser.d.ts +3 -0
- package/dist/parsers/xlsx-parser.d.ts.map +1 -0
- package/dist/parsers/xlsx-parser.js +425 -0
- package/dist/privacy/audit-emitter.d.ts +5 -0
- package/dist/privacy/audit-emitter.d.ts.map +1 -0
- package/dist/privacy/audit-emitter.js +93 -0
- package/dist/privacy/diagnostic-redactor.d.ts +2 -0
- package/dist/privacy/diagnostic-redactor.d.ts.map +1 -0
- package/dist/privacy/diagnostic-redactor.js +153 -0
- package/dist/privacy/index.d.ts +5 -0
- package/dist/privacy/index.d.ts.map +1 -0
- package/dist/privacy/index.js +6 -0
- package/dist/privacy/retention-applier.d.ts +5 -0
- package/dist/privacy/retention-applier.d.ts.map +1 -0
- package/dist/privacy/retention-applier.js +88 -0
- package/dist/privacy/types.d.ts +98 -0
- package/dist/privacy/types.d.ts.map +1 -0
- package/dist/privacy/types.js +12 -0
- package/dist/qualityIntelligence/capsuleCorpus.d.ts +27 -0
- package/dist/qualityIntelligence/capsuleCorpus.d.ts.map +1 -0
- package/dist/qualityIntelligence/capsuleCorpus.js +58 -0
- package/dist/qualityIntelligence/index.d.ts +3 -0
- package/dist/qualityIntelligence/index.d.ts.map +1 -0
- package/dist/qualityIntelligence/index.js +5 -0
- package/dist/qualityIntelligence/qiHandoff.d.ts +36 -0
- package/dist/qualityIntelligence/qiHandoff.d.ts.map +1 -0
- package/dist/qualityIntelligence/qiHandoff.js +82 -0
- package/dist/retrieval/answer-grounding.d.ts +9 -0
- package/dist/retrieval/answer-grounding.d.ts.map +1 -0
- package/dist/retrieval/answer-grounding.js +31 -0
- package/dist/retrieval/context-pack-assembler.d.ts +24 -0
- package/dist/retrieval/context-pack-assembler.d.ts.map +1 -0
- package/dist/retrieval/context-pack-assembler.js +50 -0
- package/dist/retrieval/index.d.ts +6 -0
- package/dist/retrieval/index.d.ts.map +1 -0
- package/dist/retrieval/index.js +9 -0
- package/dist/retrieval/retrieval-runner.d.ts +10 -0
- package/dist/retrieval/retrieval-runner.d.ts.map +1 -0
- package/dist/retrieval/retrieval-runner.js +163 -0
- package/dist/retrieval/scoped-vector-search.d.ts +24 -0
- package/dist/retrieval/scoped-vector-search.d.ts.map +1 -0
- package/dist/retrieval/scoped-vector-search.js +864 -0
- package/dist/retrieval/types.d.ts +28 -0
- package/dist/retrieval/types.d.ts.map +1 -0
- package/dist/retrieval/types.js +33 -0
- package/dist/section-path-hash.d.ts +3 -0
- package/dist/section-path-hash.d.ts.map +1 -0
- package/dist/section-path-hash.js +9 -0
- package/dist/source-lifecycle.d.ts +14 -0
- package/dist/source-lifecycle.d.ts.map +1 -0
- package/dist/source-lifecycle.js +155 -0
- package/dist/source-routing-validation.d.ts +11 -0
- package/dist/source-routing-validation.d.ts.map +1 -0
- package/dist/source-routing-validation.js +140 -0
- package/dist/store-content-cipher.d.ts +11 -0
- package/dist/store-content-cipher.d.ts.map +1 -0
- package/dist/store-content-cipher.js +67 -0
- package/dist/store-content-encryption.d.ts +12 -0
- package/dist/store-content-encryption.d.ts.map +1 -0
- package/dist/store-content-encryption.js +275 -0
- package/dist/store-paths.d.ts +6 -0
- package/dist/store-paths.d.ts.map +1 -0
- package/dist/store-paths.js +61 -0
- package/dist/store.d.ts +30 -0
- package/dist/store.d.ts.map +1 -0
- package/dist/store.js +219 -0
- package/dist/testing.d.ts +47 -0
- package/dist/testing.d.ts.map +1 -0
- package/dist/testing.js +170 -0
- package/dist/version.d.ts +2 -0
- package/dist/version.d.ts.map +1 -0
- package/dist/version.js +4 -0
- package/package.json +43 -0
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
// Capability discovery for optional extraction capabilities (Epic #1160, Issue #1286).
|
|
2
|
+
//
|
|
3
|
+
// OCR and multimodal extraction are optional. They are discovered by probing an injected adapter
|
|
4
|
+
// and classified as available | unavailable | degraded | failing. Missing or failing capabilities
|
|
5
|
+
// degrade retrieval coverage but never compromise pipeline stability: the indexing job still
|
|
6
|
+
// completes deterministically with partial-coverage quality warnings. No third-party OCR/vision
|
|
7
|
+
// service is bundled or required by default; the null adapters report "unavailable".
|
|
8
|
+
// Null multimodal adapter: `describeImage` always resolves to a "not-configured" failure,
// which the multimodal probe in this file classifies as "unavailable".
export const nullMultimodalAdapter = Object.freeze({
    kind: "multimodal",
    describeImage() {
        return Promise.resolve({ ok: false, reason: "not-configured" });
    },
});
|
|
12
|
+
// Probe budget in milliseconds used when `deps.probeTimeoutMs` is not supplied.
const DEFAULT_PROBE_TIMEOUT_MS = 5_000;
|
|
13
|
+
// Races `work` against a timer of `timeoutMs` milliseconds.
//   work      - promise whose outcome is wanted.
//   timeoutMs - delay before the fallback is used.
//   onTimeout - invoked if the timer fires first; its return value becomes the result.
// Resolves/rejects with whichever settles first. The timer is always cleared afterwards.
async function withTimeout(work, timeoutMs, onTimeout) {
    let timer;
    const timeout = new Promise((resolve, reject) => {
        timer = setTimeout(() => {
            // A throwing fallback must reject the race. Thrown from a bare timer callback it
            // would surface as an uncaught exception and leave the caller waiting on `work`.
            try {
                resolve(onTimeout());
            }
            catch (error) {
                reject(error);
            }
        }, timeoutMs);
    });
    try {
        return await Promise.race([work, timeout]);
    }
    finally {
        // Release the timer so nothing stays scheduled once the race has settled.
        if (timer !== undefined)
            clearTimeout(timer);
    }
}
|
|
28
|
+
// Probes an optional OCR adapter by calling `ocrPage` with an empty page and classifies the result.
//   adapter   - object exposing `ocrPage`; `undefined` means nothing was injected.
//   timeoutMs - probe budget in milliseconds; falls back to DEFAULT_PROBE_TIMEOUT_MS when omitted
//               (an undefined delay would otherwise make the timer fire almost immediately).
// Always resolves to one of "available" | "unavailable" | "degraded" | "failing"; never rejects.
export async function probeOcrCapability(adapter, timeoutMs = DEFAULT_PROBE_TIMEOUT_MS) {
    if (adapter === undefined)
        return "unavailable";
    try {
        const result = await withTimeout(adapter.ocrPage({ bytes: new Uint8Array(0), pageNumber: 1 }), timeoutMs, () => ({ ok: false, reason: "timeout" }));
        if (result.ok)
            return "available";
        switch (result.reason) {
            case "ocr-not-configured":
                return "unavailable";
            case "timeout":
                return "degraded";
            case "unsupported-input":
                // The engine is installed but rejected the empty probe input — still configured.
                return "available";
            default:
                // Unrecognised failure reason: report a status instead of falling out of the
                // switch and resolving to undefined.
                return "failing";
        }
    }
    catch {
        // The adapter threw, rejected, or returned something without the expected shape.
        return "failing";
    }
}
|
|
49
|
+
// Probes an optional multimodal adapter by calling `describeImage` with an empty image and
// classifies the result.
//   adapter   - object exposing `describeImage`; `undefined` means nothing was injected.
//   timeoutMs - probe budget in milliseconds; falls back to DEFAULT_PROBE_TIMEOUT_MS when omitted
//               (an undefined delay would otherwise make the timer fire almost immediately).
// Always resolves to one of "available" | "unavailable" | "degraded" | "failing"; never rejects.
export async function probeMultimodalCapability(adapter, timeoutMs = DEFAULT_PROBE_TIMEOUT_MS) {
    if (adapter === undefined)
        return "unavailable";
    try {
        const result = await withTimeout(adapter.describeImage({ bytes: new Uint8Array(0), pageNumber: 1 }), timeoutMs, () => ({ ok: false, reason: "timeout" }));
        if (result.ok)
            return "available";
        switch (result.reason) {
            case "not-configured":
                return "unavailable";
            case "timeout":
                return "degraded";
            case "unsupported-input":
                // The adapter answered but rejected the empty probe input, so it is configured.
                return "available";
            default:
                // Unrecognised failure reason: report a status instead of falling out of the
                // switch and resolving to undefined.
                return "failing";
        }
    }
    catch {
        // The adapter threw, rejected, or returned something without the expected shape.
        return "failing";
    }
}
|
|
69
|
+
// Runs the OCR and multimodal probes concurrently under one shared time budget and returns
// both classifications as `{ ocr, multimodal }`.
//   deps.ocr / deps.multimodal - optional adapters handed to the respective probe.
//   deps.probeTimeoutMs        - optional budget; DEFAULT_PROBE_TIMEOUT_MS when null/undefined.
export async function discoverExtractionCapabilities(deps = {}) {
    const budgetMs = deps.probeTimeoutMs ?? DEFAULT_PROBE_TIMEOUT_MS;
    const ocrProbe = probeOcrCapability(deps.ocr, budgetMs);
    const multimodalProbe = probeMultimodalCapability(deps.multimodal, budgetMs);
    const [ocr, multimodal] = await Promise.all([ocrProbe, multimodalProbe]);
    return { ocr, multimodal };
}
|
|
@@ -0,0 +1,3 @@
|
|
|
1
|
+
import type { DocumentId, LargeDocumentDiagnosticCode, ParserDiagnostic, ParserDiagnosticSeverity } from "@oscharko-dev/keiko-contracts";
|
|
2
|
+
/**
 * Builds a `ParserDiagnostic` from the given code, message and document id.
 *
 * @param severity - Optional; the implementation defaults it to `"warning"`.
 * @param pageNumber - Optional; only present on the result when supplied.
 */
export declare function largeDocumentDiagnostic(code: LargeDocumentDiagnosticCode, message: string, documentId: DocumentId, severity?: ParserDiagnosticSeverity, pageNumber?: number): ParserDiagnostic;
|
|
3
|
+
//# sourceMappingURL=diagnostics.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"diagnostics.d.ts","sourceRoot":"","sources":["../../../src/parsers/large-document/diagnostics.ts"],"names":[],"mappings":"AAOA,OAAO,KAAK,EACV,UAAU,EACV,2BAA2B,EAC3B,gBAAgB,EAChB,wBAAwB,EACzB,MAAM,+BAA+B,CAAC;AAEvC,wBAAgB,uBAAuB,CACrC,IAAI,EAAE,2BAA2B,EACjC,OAAO,EAAE,MAAM,EACf,UAAU,EAAE,UAAU,EACtB,QAAQ,GAAE,wBAAoC,EAC9C,UAAU,CAAC,EAAE,MAAM,GAClB,gBAAgB,CAIlB"}
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
// Content-free diagnostic builders for the bounded large-document ingestion path
|
|
2
|
+
// (Epic #1160, Issue #1286). The contract `ParserDiagnostic.code` is a free `string`, so the
|
|
3
|
+
// large-document codes (PARTIAL_COVERAGE, OCR_CAPABILITY_UNAVAILABLE, ...) coexist with the
|
|
4
|
+
// closed `ParserErrorCode` union the existing adapters use. These diagnostics are never
|
|
5
|
+
// error-severity for capability/coverage limits, so downstream layers do not treat a missing
|
|
6
|
+
// optional capability as a hard document failure.
|
|
7
|
+
// Assembles a diagnostic record. `severity` defaults to "warning"; the `pageNumber` key is
// only added when a page number was actually passed.
export function largeDocumentDiagnostic(code, message, documentId, severity = "warning", pageNumber) {
    const base = { severity, code, message, documentId };
    if (pageNumber === undefined) {
        return base;
    }
    return { ...base, pageNumber };
}
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
export { largeDocumentDiagnostic } from "./diagnostics.js";
|
|
2
|
+
export type { ProgressiveExtractionSource, ProgressiveExtractionWindow, ProgressiveExtractionOptions, ProgressiveExtractor, ProgressiveExtractionSink, ProgressiveExtractionSummary, ProgressiveStopReason, } from "./progressive-extraction.js";
|
|
3
|
+
export { runProgressiveExtraction } from "./progressive-extraction.js";
|
|
4
|
+
export { WindowTextBuilder } from "./window-builder.js";
|
|
5
|
+
export type { AddedPage } from "./window-builder.js";
|
|
6
|
+
export { createProgressivePdfExtractor, PROGRESSIVE_PDF_PARSER_ID, PROGRESSIVE_PDF_PARSER_VERSION, PROGRESSIVE_PDF_DEPENDENCY_VERSIONS, } from "./progressive-pdf.js";
|
|
7
|
+
export type { OcrPageFn, ProgressivePdfExtractorDeps } from "./progressive-pdf.js";
|
|
8
|
+
export { syntheticStreamingSource, syntheticProgressiveExtractor } from "./synthetic-source.js";
|
|
9
|
+
export type { SyntheticStreamingConfig } from "./synthetic-source.js";
|
|
10
|
+
export { discoverExtractionCapabilities, probeOcrCapability, probeMultimodalCapability, nullMultimodalAdapter, } from "./capability-discovery.js";
|
|
11
|
+
export type { CapabilityProbeDeps, MultimodalAdapter, MultimodalResult, } from "./capability-discovery.js";
|
|
12
|
+
export { isLegacyBinaryOfficeFormat, legacyFormatGuidance, legacyFormatDiagnostic, } from "./legacy-format.js";
|
|
13
|
+
export { classifyLargeDocument, usesProgressivePath } from "./preflight.js";
|
|
14
|
+
export type { PreflightInput } from "./preflight.js";
|
|
15
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/parsers/large-document/index.ts"],"names":[],"mappings":"AAGA,OAAO,EAAE,uBAAuB,EAAE,MAAM,kBAAkB,CAAC;AAC3D,YAAY,EACV,2BAA2B,EAC3B,2BAA2B,EAC3B,4BAA4B,EAC5B,oBAAoB,EACpB,yBAAyB,EACzB,4BAA4B,EAC5B,qBAAqB,GACtB,MAAM,6BAA6B,CAAC;AACrC,OAAO,EAAE,wBAAwB,EAAE,MAAM,6BAA6B,CAAC;AACvE,OAAO,EAAE,iBAAiB,EAAE,MAAM,qBAAqB,CAAC;AACxD,YAAY,EAAE,SAAS,EAAE,MAAM,qBAAqB,CAAC;AACrD,OAAO,EACL,6BAA6B,EAC7B,yBAAyB,EACzB,8BAA8B,EAC9B,mCAAmC,GACpC,MAAM,sBAAsB,CAAC;AAC9B,YAAY,EAAE,SAAS,EAAE,2BAA2B,EAAE,MAAM,sBAAsB,CAAC;AACnF,OAAO,EAAE,wBAAwB,EAAE,6BAA6B,EAAE,MAAM,uBAAuB,CAAC;AAChG,YAAY,EAAE,wBAAwB,EAAE,MAAM,uBAAuB,CAAC;AACtE,OAAO,EACL,8BAA8B,EAC9B,kBAAkB,EAClB,yBAAyB,EACzB,qBAAqB,GACtB,MAAM,2BAA2B,CAAC;AACnC,YAAY,EACV,mBAAmB,EACnB,iBAAiB,EACjB,gBAAgB,GACjB,MAAM,2BAA2B,CAAC;AACnC,OAAO,EACL,0BAA0B,EAC1B,oBAAoB,EACpB,sBAAsB,GACvB,MAAM,oBAAoB,CAAC;AAC5B,OAAO,EAAE,qBAAqB,EAAE,mBAAmB,EAAE,MAAM,gBAAgB,CAAC;AAC5E,YAAY,EAAE,cAAc,EAAE,MAAM,gBAAgB,CAAC"}
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
// Public surface for the bounded large-document ingestion parser layer (Epic #1160,
|
|
2
|
+
// Issue #1286).
|
|
3
|
+
export { largeDocumentDiagnostic } from "./diagnostics.js";
|
|
4
|
+
export { runProgressiveExtraction } from "./progressive-extraction.js";
|
|
5
|
+
export { WindowTextBuilder } from "./window-builder.js";
|
|
6
|
+
export { createProgressivePdfExtractor, PROGRESSIVE_PDF_PARSER_ID, PROGRESSIVE_PDF_PARSER_VERSION, PROGRESSIVE_PDF_DEPENDENCY_VERSIONS, } from "./progressive-pdf.js";
|
|
7
|
+
export { syntheticStreamingSource, syntheticProgressiveExtractor } from "./synthetic-source.js";
|
|
8
|
+
export { discoverExtractionCapabilities, probeOcrCapability, probeMultimodalCapability, nullMultimodalAdapter, } from "./capability-discovery.js";
|
|
9
|
+
export { isLegacyBinaryOfficeFormat, legacyFormatGuidance, legacyFormatDiagnostic, } from "./legacy-format.js";
|
|
10
|
+
export { classifyLargeDocument, usesProgressivePath } from "./preflight.js";
|
|
@@ -0,0 +1,5 @@
|
|
|
1
|
+
import type { DocumentId, ParserDiagnostic } from "@oscharko-dev/keiko-contracts";
|
|
2
|
+
/**
 * Reports whether `extension`, lower-cased, is a key of the legacy-format guidance table
 * (`doc`, `ppt`, `xls`).
 */
export declare function isLegacyBinaryOfficeFormat(extension: string): boolean;
|
|
3
|
+
/**
 * Looks up the guidance text for `extension` (lower-cased) in the legacy-format table;
 * `undefined` when the table has no entry for it.
 */
export declare function legacyFormatGuidance(extension: string): string | undefined;
|
|
4
|
+
/**
 * Returns a warning-severity `CONVERTER_UNAVAILABLE` diagnostic whose message is the guidance
 * text for `extension`, or `undefined` when no guidance exists for that extension.
 */
export declare function legacyFormatDiagnostic(extension: string, documentId: DocumentId): ParserDiagnostic | undefined;
|
|
5
|
+
//# sourceMappingURL=legacy-format.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"legacy-format.d.ts","sourceRoot":"","sources":["../../../src/parsers/large-document/legacy-format.ts"],"names":[],"mappings":"AAOA,OAAO,KAAK,EAAE,UAAU,EAAE,gBAAgB,EAAE,MAAM,+BAA+B,CAAC;AAWlF,wBAAgB,0BAA0B,CAAC,SAAS,EAAE,MAAM,GAAG,OAAO,CAErE;AAED,wBAAgB,oBAAoB,CAAC,SAAS,EAAE,MAAM,GAAG,MAAM,GAAG,SAAS,CAE1E;AAED,wBAAgB,sBAAsB,CACpC,SAAS,EAAE,MAAM,EACjB,UAAU,EAAE,UAAU,GACrB,gBAAgB,GAAG,SAAS,CAS9B"}
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
// Controlled legacy-format handling (Epic #1160, Issue #1286).
|
|
2
|
+
//
|
|
3
|
+
// Legacy binary office formats (.doc, .ppt, .xls) predate the OOXML containers the docx/xlsx
|
|
4
|
+
// parsers handle. Keiko does not bundle a converter for them. Rather than crashing or emitting a
|
|
5
|
+
// generic "unknown format" signal, the large-document path reports a stable CONVERTER_UNAVAILABLE
|
|
6
|
+
// diagnostic with actionable, content-free guidance, leaving the indexing job stable.
|
|
7
|
+
import { LARGE_DOCUMENT_DIAGNOSTIC_CODES } from "@oscharko-dev/keiko-contracts";
|
|
8
|
+
import { largeDocumentDiagnostic } from "./diagnostics.js";
|
|
9
|
+
// Guidance text keyed by lower-case extension (no leading dot). Frozen so it cannot be mutated.
const LEGACY_FORMAT_GUIDANCE = Object.freeze({
    doc: "Legacy Microsoft Word (.doc) is not supported; re-save the file as .docx or PDF and reindex.",
    ppt: "Legacy Microsoft PowerPoint (.ppt) is not supported; re-save the file as .pptx or PDF and reindex.",
    xls: "Legacy Microsoft Excel (.xls) is not supported; re-save the file as .xlsx or CSV and reindex.",
});
// True when `extension` (case-insensitive) has an entry in the guidance table.
export function isLegacyBinaryOfficeFormat(extension) {
    return legacyFormatGuidance(extension) !== undefined;
}
// Returns the guidance string for `extension` (case-insensitive), or undefined when there is none.
export function legacyFormatGuidance(extension) {
    const key = extension.toLowerCase();
    // Own-property lookup only: `in` and a bare index also see inherited members, so inputs
    // such as "constructor" or "__proto__" would be treated as legacy formats and yield a
    // non-string "guidance" value.
    return Object.prototype.hasOwnProperty.call(LEGACY_FORMAT_GUIDANCE, key)
        ? LEGACY_FORMAT_GUIDANCE[key]
        : undefined;
}
|
|
20
|
+
// Wraps the guidance text for a legacy extension in a warning-severity CONVERTER_UNAVAILABLE
// diagnostic. Yields undefined when there is no guidance for the extension.
export function legacyFormatDiagnostic(extension, documentId) {
    const message = legacyFormatGuidance(extension);
    return message === undefined
        ? undefined
        : largeDocumentDiagnostic(LARGE_DOCUMENT_DIAGNOSTIC_CODES.CONVERTER_UNAVAILABLE, message, documentId, "warning");
}
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
import type { LargeDocumentPreflight, LargeDocumentResourcePolicy } from "@oscharko-dev/keiko-contracts";
|
|
2
|
+
/** File facts passed to `classifyLargeDocument`: extension, media type and size in bytes. */
export interface PreflightInput {
|
|
3
|
+
readonly extension: string;
|
|
4
|
+
readonly mediaType: string;
|
|
5
|
+
readonly sizeBytes: number;
|
|
6
|
+
}
|
|
7
|
+
/**
 * Picks an extraction strategy for a file before any parser runs.
 *
 * The implementation checks, in order: size above `policy.maxRawFileBytes` (rejected as
 * oversized), legacy binary office extension (unsupported), PDF at or above
 * `policy.largeFileThresholdBytes` (progressive), otherwise the standard buffered path.
 */
export declare function classifyLargeDocument(input: PreflightInput, policy: LargeDocumentResourcePolicy): LargeDocumentPreflight;
|
|
8
|
+
/** True when `preflight.decision` is `"accept-progressive"`. */
export declare function usesProgressivePath(preflight: LargeDocumentPreflight): boolean;
|
|
9
|
+
//# sourceMappingURL=preflight.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"preflight.d.ts","sourceRoot":"","sources":["../../../src/parsers/large-document/preflight.ts"],"names":[],"mappings":"AAOA,OAAO,KAAK,EACV,sBAAsB,EACtB,2BAA2B,EAC5B,MAAM,+BAA+B,CAAC;AAIvC,MAAM,WAAW,cAAc;IAC7B,QAAQ,CAAC,SAAS,EAAE,MAAM,CAAC;IAC3B,QAAQ,CAAC,SAAS,EAAE,MAAM,CAAC;IAC3B,QAAQ,CAAC,SAAS,EAAE,MAAM,CAAC;CAC5B;AAQD,wBAAgB,qBAAqB,CACnC,KAAK,EAAE,cAAc,EACrB,MAAM,EAAE,2BAA2B,GAClC,sBAAsB,CA8BxB;AAID,wBAAgB,mBAAmB,CAAC,SAAS,EAAE,sBAAsB,GAAG,OAAO,CAE9E"}
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
// Preflight large-file classification (Epic #1160, Issue #1286).
|
|
2
|
+
//
|
|
3
|
+
// Runs before parser execution. Decides the extraction strategy from the file's size, extension,
|
|
4
|
+
// and media type against the resource policy, so a file above the policy ceiling is rejected
|
|
5
|
+
// before any unsafe work begins, and a large supported PDF is routed to the progressive path.
|
|
6
|
+
// Pure function — no IO, no clock.
|
|
7
|
+
import { isLegacyBinaryOfficeFormat } from "./legacy-format.js";
|
|
8
|
+
// A file counts as PDF when either its extension or its media type says so (case-insensitive).
// The media type is only consulted when the extension does not already match.
function isPdf(input) {
    const extension = input.extension.toLowerCase();
    if (extension === "pdf") {
        return true;
    }
    const mediaType = input.mediaType.toLowerCase();
    return mediaType === "application/pdf";
}
|
|
11
|
+
// Decides how a file should be extracted, based only on its size, extension and media type.
//   input  - { extension, mediaType, sizeBytes } of the candidate file.
//   policy - resource policy; reads `maxRawFileBytes` and `largeFileThresholdBytes`.
// Checks run in this order: oversized -> legacy office format -> large PDF -> standard.
export function classifyLargeDocument(input, policy) {
    const { sizeBytes } = input;
    if (sizeBytes > policy.maxRawFileBytes) {
        return {
            strategy: "oversized",
            decision: "reject-oversized",
            sizeBytes,
            reason: `file size exceeds the configured ${String(policy.maxRawFileBytes)}-byte raw ceiling`,
        };
    }
    if (isLegacyBinaryOfficeFormat(input.extension)) {
        return {
            strategy: "unsupported",
            decision: "accept-standard",
            sizeBytes,
            reason: "legacy binary office format has no bundled converter",
        };
    }
    if (isPdf(input) && sizeBytes >= policy.largeFileThresholdBytes) {
        // The estimate assumes windows of a quarter of the large-file threshold. A threshold
        // of 0 turns the division into Infinity (or NaN for an empty file), so any non-finite
        // estimate falls back to the same floor of one window used for small quotients.
        const windowBytes = policy.largeFileThresholdBytes / 4;
        const rawEstimate = Math.ceil(sizeBytes / windowBytes);
        const estimatedWindows = Number.isFinite(rawEstimate) ? Math.max(1, rawEstimate) : 1;
        return {
            strategy: "progressive-pdf",
            decision: "accept-progressive",
            sizeBytes,
            estimatedWindows,
        };
    }
    return { strategy: "standard-buffer", decision: "accept-standard", sizeBytes };
}
|
|
39
|
+
// Tells the caller whether preflight selected the progressive (page-windowed) extraction
// path, i.e. whether the decision is "accept-progressive", rather than the full-buffer path.
export function usesProgressivePath(preflight) {
    const { decision } = preflight;
    return decision === "accept-progressive";
}
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
import type { CoverageQuality, DocumentId, LargeDocumentExtractionStrategy, LargeDocumentResourcePolicy, PageRecord, ParsedUnit, ParserDependencyVersion, ParserDiagnostic } from "@oscharko-dev/keiko-contracts";
|
|
2
|
+
/**
 * Byte source handed to a progressive extractor: the total size plus two optional readers,
 * one returning the whole buffer and one returning a byte range.
 * NOTE(review): which reader an extractor prefers is not visible in this declaration — confirm.
 */
export interface ProgressiveExtractionSource {
|
|
3
|
+
readonly totalBytes: number;
|
|
4
|
+
readonly loadFullBuffer?: () => Promise<Uint8Array>;
|
|
5
|
+
readonly readWindow?: (startByte: number, length: number) => Promise<Uint8Array>;
|
|
6
|
+
}
|
|
7
|
+
/**
 * One window produced by a progressive extractor: its index, the pages and parsed units it
 * carries, the window text, position fields and any diagnostics.
 * NOTE(review): the exact meaning of `characterStart` and `objectCursor` is defined by the
 * implementation, which is not visible here — confirm before relying on them.
 */
export interface ProgressiveExtractionWindow {
|
|
8
|
+
readonly windowIndex: number;
|
|
9
|
+
readonly pages: readonly PageRecord[];
|
|
10
|
+
readonly units: readonly ParsedUnit[];
|
|
11
|
+
readonly text: string;
|
|
12
|
+
readonly characterStart: number;
|
|
13
|
+
readonly objectCursor: number;
|
|
14
|
+
readonly lastPageNumber: number;
|
|
15
|
+
readonly diagnostics: readonly ParserDiagnostic[];
|
|
16
|
+
}
|
|
17
|
+
/**
 * Options for a progressive extraction run: document identity, the resource policy, an
 * optional abort signal, an injected clock (`now`) and optional `resume*` fields.
 * NOTE(review): the `resume*` fields presumably carry checkpoint state from an earlier
 * run — verify against the implementation, which is not visible here.
 */
export interface ProgressiveExtractionOptions {
|
|
18
|
+
readonly documentId: DocumentId;
|
|
19
|
+
readonly extension: string;
|
|
20
|
+
readonly mediaType: string;
|
|
21
|
+
readonly policy: LargeDocumentResourcePolicy;
|
|
22
|
+
readonly signal?: AbortSignal;
|
|
23
|
+
readonly now: () => number;
|
|
24
|
+
readonly resumeFromPage?: number;
|
|
25
|
+
readonly resumeCharacterStart?: number;
|
|
26
|
+
readonly resumeWindowIndex?: number;
|
|
27
|
+
readonly resumeObjectCursor?: number;
|
|
28
|
+
readonly resumeExtractedTextBytes?: number;
|
|
29
|
+
}
|
|
30
|
+
/**
 * A pluggable extractor: identifies its strategy and version, says via `matches` whether it
 * handles a given extension/media type, and yields windows asynchronously from `extractWindows`.
 */
export interface ProgressiveExtractor {
|
|
31
|
+
readonly strategyId: LargeDocumentExtractionStrategy;
|
|
32
|
+
readonly parserVersion: string;
|
|
33
|
+
readonly dependencyVersions?: readonly ParserDependencyVersion[];
|
|
34
|
+
readonly matches: (input: {
|
|
35
|
+
readonly extension: string;
|
|
36
|
+
readonly mediaType: string;
|
|
37
|
+
}) => boolean;
|
|
38
|
+
readonly extractWindows: (source: ProgressiveExtractionSource, options: ProgressiveExtractionOptions) => AsyncIterable<ProgressiveExtractionWindow>;
|
|
39
|
+
}
|
|
40
|
+
/**
 * Consumer of extracted windows; `onWindow` may be synchronous or return a promise.
 * NOTE(review): whether the runner awaits `onWindow` before requesting the next window is
 * not visible in this declaration — confirm.
 */
export interface ProgressiveExtractionSink {
|
|
41
|
+
readonly onWindow: (window: ProgressiveExtractionWindow) => Promise<void> | void;
|
|
42
|
+
}
|
|
43
|
+
/** Closed set of reasons a progressive extraction run can report for stopping. */
export type ProgressiveStopReason = "completed" | "cancelled" | "extracted-text-limit" | "object-limit" | "unit-limit" | "timeout";
|
|
44
|
+
/**
 * Content-free result of `runProgressiveExtraction`: counters, cursors and diagnostics only.
 * The counters start from the corresponding `resume*` options.
 */
export interface ProgressiveExtractionSummary {
    /** Pages accounted for: the resume page plus the page records of every accepted window. */
    readonly pageCount: number;
    /** Windows accounted for: the resume window index plus one per accepted window. */
    readonly windowCount: number;
    /** UTF-8 byte total of the accepted windows' text, plus any resumed byte total. */
    readonly extractedTextBytes: number;
    /** Parser-object cursor of the last accepted window. */
    readonly objectCursor: number;
    /** Last page number of the last accepted window. */
    readonly lastPageNumber: number;
    /** Diagnostics of all accepted windows, followed by the stop diagnostic when there is one. */
    readonly diagnostics: readonly ParserDiagnostic[];
    /** Why the run ended. */
    readonly stopReason: ProgressiveStopReason;
    /** Coverage quality derived from the stop reason, page count and diagnostics. */
    readonly coverage: CoverageQuality;
}
|
|
54
|
+
/**
 * Drives `extractor` over `source`, passing each accepted window to `sink`.
 *
 * @param extractor Strategy that produces the windows.
 * @param source Byte access to the document.
 * @param options Policy, clock, cancellation signal and resume state for the run.
 * @param sink Receiver invoked once per accepted window.
 * @returns A content-free summary of the run.
 */
export declare function runProgressiveExtraction(
    extractor: ProgressiveExtractor,
    source: ProgressiveExtractionSource,
    options: ProgressiveExtractionOptions,
    sink: ProgressiveExtractionSink,
): Promise<ProgressiveExtractionSummary>;
|
|
55
|
+
//# sourceMappingURL=progressive-extraction.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"progressive-extraction.d.ts","sourceRoot":"","sources":["../../../src/parsers/large-document/progressive-extraction.ts"],"names":[],"mappings":"AAcA,OAAO,KAAK,EACV,eAAe,EACf,UAAU,EACV,+BAA+B,EAC/B,2BAA2B,EAC3B,UAAU,EACV,UAAU,EACV,uBAAuB,EACvB,gBAAgB,EACjB,MAAM,+BAA+B,CAAC;AAMvC,MAAM,WAAW,2BAA2B;IAC1C,QAAQ,CAAC,UAAU,EAAE,MAAM,CAAC;IAE5B,QAAQ,CAAC,cAAc,CAAC,EAAE,MAAM,OAAO,CAAC,UAAU,CAAC,CAAC;IAGpD,QAAQ,CAAC,UAAU,CAAC,EAAE,CAAC,SAAS,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,KAAK,OAAO,CAAC,UAAU,CAAC,CAAC;CAClF;AAGD,MAAM,WAAW,2BAA2B;IAC1C,QAAQ,CAAC,WAAW,EAAE,MAAM,CAAC;IAE7B,QAAQ,CAAC,KAAK,EAAE,SAAS,UAAU,EAAE,CAAC;IACtC,QAAQ,CAAC,KAAK,EAAE,SAAS,UAAU,EAAE,CAAC;IAItC,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;IAEtB,QAAQ,CAAC,cAAc,EAAE,MAAM,CAAC;IAEhC,QAAQ,CAAC,YAAY,EAAE,MAAM,CAAC;IAE9B,QAAQ,CAAC,cAAc,EAAE,MAAM,CAAC;IAChC,QAAQ,CAAC,WAAW,EAAE,SAAS,gBAAgB,EAAE,CAAC;CACnD;AAGD,MAAM,WAAW,4BAA4B;IAC3C,QAAQ,CAAC,UAAU,EAAE,UAAU,CAAC;IAChC,QAAQ,CAAC,SAAS,EAAE,MAAM,CAAC;IAC3B,QAAQ,CAAC,SAAS,EAAE,MAAM,CAAC;IAC3B,QAAQ,CAAC,MAAM,EAAE,2BAA2B,CAAC;IAC7C,QAAQ,CAAC,MAAM,CAAC,EAAE,WAAW,CAAC;IAC9B,QAAQ,CAAC,GAAG,EAAE,MAAM,MAAM,CAAC;IAG3B,QAAQ,CAAC,cAAc,CAAC,EAAE,MAAM,CAAC;IACjC,QAAQ,CAAC,oBAAoB,CAAC,EAAE,MAAM,CAAC;IACvC,QAAQ,CAAC,iBAAiB,CAAC,EAAE,MAAM,CAAC;IACpC,QAAQ,CAAC,kBAAkB,CAAC,EAAE,MAAM,CAAC;IACrC,QAAQ,CAAC,wBAAwB,CAAC,EAAE,MAAM,CAAC;CAC5C;AAGD,MAAM,WAAW,oBAAoB;IACnC,QAAQ,CAAC,UAAU,EAAE,+BAA+B,CAAC;IACrD,QAAQ,CAAC,aAAa,EAAE,MAAM,CAAC;IAC/B,QAAQ,CAAC,kBAAkB,CAAC,EAAE,SAAS,uBAAuB,EAAE,CAAC;IACjE,QAAQ,CAAC,OAAO,EAAE,CAAC,KAAK,EAAE;QAAE,QAAQ,CAAC,SAAS,EAAE,MAAM,CAAC;QAAC,QAAQ,CAAC,SAAS,EAAE,MAAM,CAAA;KAAE,KAAK,OAAO,CAAC;IACjG,QAAQ,CAAC,cAAc,EAAE,CACvB,MAAM,EAAE,2BAA2B,EACnC,OAAO,EAAE,4BAA4B,KAClC,aAAa,CAAC,2BAA2B,CAAC,CAAC;CACjD;AAGD,MAAM,WAAW,yBAAyB;IAGxC,QAAQ,CAAC,QAAQ,EAAE,CAAC,MAAM,EAAE,2BAA2B,KAAK,OAAO,CAAC,IAAI,CAAC,GAAG,IAAI,CAAC;CAClF;AAED,MAAM,MAAM,qBAAqB,GAC7B,WAAW,GACX,WAAW,GACX,sBAAsB,GACtB,cAAc,GACd,YAAY,GACZ,SAAS,CAAC;AAEd,MAAM,WAAW,4BAA4B;IAC3C,QAAQ,CAAC,SAAS,EAAE,MAAM,CAAC;IAC3
B,QAAQ,CAAC,WAAW,EAAE,MAAM,CAAC;IAC7B,QAAQ,CAAC,kBAAkB,EAAE,MAAM,CAAC;IACpC,QAAQ,CAAC,YAAY,EAAE,MAAM,CAAC;IAC9B,QAAQ,CAAC,cAAc,EAAE,MAAM,CAAC;IAChC,QAAQ,CAAC,WAAW,EAAE,SAAS,gBAAgB,EAAE,CAAC;IAClD,QAAQ,CAAC,UAAU,EAAE,qBAAqB,CAAC;IAC3C,QAAQ,CAAC,QAAQ,EAAE,eAAe,CAAC;CACpC;AAuGD,wBAAsB,wBAAwB,CAC5C,SAAS,EAAE,oBAAoB,EAC/B,MAAM,EAAE,2BAA2B,EACnC,OAAO,EAAE,4BAA4B,EACrC,IAAI,EAAE,yBAAyB,GAC9B,OAAO,CAAC,4BAA4B,CAAC,CA+CvC"}
|
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
// Progressive (page-windowed) extraction contract + driver for bounded large-document
|
|
2
|
+
// ingestion (Epic #1160, Issue #1286).
|
|
3
|
+
//
|
|
4
|
+
// This contract coexists with the existing full-buffer `ParserAdapter`/`AsyncParserAdapter`
|
|
5
|
+
// contracts. A `ProgressiveExtractor` yields the document as an ordered sequence of bounded
|
|
6
|
+
// `ProgressiveExtractionWindow` values. The driver `runProgressiveExtraction` pulls one window
|
|
7
|
+
// at a time, hands it to a sink that persists it durably, and never retains more than one
|
|
8
|
+
// window's text — so the extraction working set does not scale with document size.
|
|
9
|
+
//
|
|
10
|
+
// The `ProgressiveExtractionSource` abstracts byte access: a `loadFullBuffer` source backs the
|
|
11
|
+
// real PDF strategy (pdfjs-dist requires the whole buffer to open a document — see the
|
|
12
|
+
// architecture doc), while a `readWindow` source backs truly-streaming inputs that never
|
|
13
|
+
// materialize the whole document (used by the bounded-RSS regression).
|
|
14
|
+
import { LARGE_DOCUMENT_DIAGNOSTIC_CODES } from "@oscharko-dev/keiko-contracts";
|
|
15
|
+
import { largeDocumentDiagnostic } from "./diagnostics.js";
|
|
16
|
+
/**
 * Measures how many bytes `text` occupies once encoded as UTF-8.
 *
 * @param text String to measure.
 * @returns The UTF-8 encoded size of `text` in bytes.
 */
function utf8ByteLength(text) {
    const encoding = "utf8";
    const byteCount = Buffer.byteLength(text, encoding);
    return byteCount;
}
|
|
19
|
+
/**
 * Tells whether cancellation has been requested.
 *
 * @param signal Optional abort signal; may be `undefined` or `null`.
 * @returns `true` only when a signal is present and its `aborted` flag is exactly `true`.
 */
function isAborted(signal) {
    if (signal === undefined || signal === null) {
        return false;
    }
    return signal.aborted === true;
}
|
|
22
|
+
/**
 * Judges `window` against the resource policy using the totals that accepting it would produce,
 * so a window that breaches a bound can be rejected before it is persisted.
 *
 * Checks run in a fixed order: cancellation, wall-clock deadline, extracted-text bytes (against
 * both `maxExtractedTextBytes` and `maxPersistedStorageGrowthBytes`), parser objects, page count.
 *
 * @param state Running totals of the extraction so far.
 * @param window Candidate window.
 * @param options Run options supplying the policy, the clock and the abort signal.
 * @param startedAt Clock reading taken when the run began.
 * @returns The stop reason of the first failed check, or `undefined` when the window is acceptable.
 */
function limitStop(state, window, options, startedAt) {
    const limits = options.policy;
    if (isAborted(options.signal)) {
        return "cancelled";
    }
    const elapsed = options.now() - startedAt;
    if (elapsed > limits.maxWallClockMs) {
        return "timeout";
    }
    // Both text bounds are evaluated on the same projected total.
    const projectedTextBytes = state.extractedTextBytes + utf8ByteLength(window.text);
    const overTextBound = projectedTextBytes > limits.maxExtractedTextBytes;
    if (overTextBound || projectedTextBytes > limits.maxPersistedStorageGrowthBytes) {
        return "extracted-text-limit";
    }
    if (window.objectCursor > limits.maxParserObjects) {
        return "object-limit";
    }
    const projectedPages = state.pageCount + window.pages.length;
    if (projectedPages > limits.maxParserUnits) {
        return "unit-limit";
    }
    return undefined;
}
|
|
43
|
+
/**
 * Builds the warning diagnostic that explains an early stop.
 *
 * @param reason Stop reason of the run.
 * @param documentId Document the diagnostic is attached to.
 * @returns A `RESOURCE_POLICY_EXCEEDED` warning for resource-bound and timeout stops;
 *          `undefined` for `"cancelled"`, `"completed"` and any other value.
 */
function stopDiagnostic(reason, documentId) {
    const hitResourceBound = reason === "extracted-text-limit" ||
        reason === "object-limit" ||
        reason === "unit-limit";
    if (hitResourceBound) {
        return largeDocumentDiagnostic(LARGE_DOCUMENT_DIAGNOSTIC_CODES.RESOURCE_POLICY_EXCEEDED, `extraction stopped at the ${reason} resource bound; document indexed with partial coverage`, documentId, "warning");
    }
    if (reason === "timeout") {
        return largeDocumentDiagnostic(LARGE_DOCUMENT_DIAGNOSTIC_CODES.RESOURCE_POLICY_EXCEEDED, "extraction stopped at the wall-clock deadline; document indexed with partial coverage", documentId, "warning");
    }
    // Cancellation and normal completion produce no diagnostic.
    return undefined;
}
|
|
56
|
+
/**
 * Looks for a diagnostic that signals incomplete coverage.
 *
 * @param diagnostics Diagnostics collected so far.
 * @returns `true` when at least one diagnostic whose severity is not `"info"` carries the
 *          `PARTIAL_COVERAGE`, `OCR_CAPABILITY_UNAVAILABLE` or
 *          `MULTIMODAL_CAPABILITY_UNAVAILABLE` code.
 */
function hasPartialCoverageWarning(diagnostics) {
    return diagnostics.some((entry) => {
        // Informational entries never count as coverage warnings.
        if (entry.severity === "info") {
            return false;
        }
        if (entry.code === LARGE_DOCUMENT_DIAGNOSTIC_CODES.PARTIAL_COVERAGE) {
            return true;
        }
        if (entry.code === LARGE_DOCUMENT_DIAGNOSTIC_CODES.OCR_CAPABILITY_UNAVAILABLE) {
            return true;
        }
        return entry.code === LARGE_DOCUMENT_DIAGNOSTIC_CODES.MULTIMODAL_CAPABILITY_UNAVAILABLE;
    });
}
|
|
62
|
+
/**
 * Derives the coverage quality reported in the extraction summary.
 *
 * @param stopReason Why the run ended.
 * @param pageCount Number of pages accounted for.
 * @param diagnostics Diagnostics collected during the run.
 * @returns `"none"` when no page was counted; `"complete"` when the run completed and no
 *          partial-coverage warning was recorded; `"partial"` in every other case (warning
 *          present, cancellation, resource bound or timeout).
 */
function coverageFor(stopReason, pageCount, diagnostics) {
    if (pageCount === 0) {
        return "none";
    }
    const degraded = hasPartialCoverageWarning(diagnostics);
    const ranToCompletion = stopReason === "completed";
    return !degraded && ranToCompletion ? "complete" : "partial";
}
|
|
74
|
+
/**
 * Appends every entry of `diagnostics` to the end of `target`, in order.
 *
 * @param target Array that is extended in place.
 * @param diagnostics Entries to append.
 */
function appendDiagnostics(target, diagnostics) {
    // Snapshot first so the append behaves the same even if both arguments are the same array.
    const incoming = [...diagnostics];
    for (const entry of incoming) {
        target.push(entry);
    }
}
|
|
77
|
+
/**
 * Pulls windows from `extractor` one at a time, checks each against the resource policy before
 * accepting it, and passes every accepted window to `sink`. Only counters, cursors and
 * diagnostics are accumulated; window text is never retained by the driver.
 *
 * A window that would breach a bound is neither counted nor passed to the sink, and its
 * diagnostics are not recorded.
 *
 * @param extractor Strategy whose `extractWindows` yields the windows.
 * @param source Byte access to the document, forwarded to the extractor.
 * @param options Policy, clock, abort signal and optional resume state.
 * @param sink Receiver whose `onWindow` is awaited once per accepted window.
 * @returns Content-free summary: counters, cursors, diagnostics, stop reason and coverage.
 */
export async function runProgressiveExtraction(extractor, source, options, sink) {
    const startedAt = options.now();
    const diagnostics = [];
    // Counters continue from the resume state when one is supplied.
    const state = {
        pageCount: options.resumeFromPage ?? 0,
        windowCount: options.resumeWindowIndex ?? 0,
        extractedTextBytes: options.resumeExtractedTextBytes ?? 0,
        objectCursor: options.resumeObjectCursor ?? 0,
        lastPageNumber: options.resumeFromPage ?? 0,
    };
    let stopReason = "completed";
    for await (const window of extractor.extractWindows(source, options)) {
        const stop = limitStop(state, window, options, startedAt);
        if (stop !== undefined) {
            stopReason = stop;
            break;
        }
        appendDiagnostics(diagnostics, window.diagnostics);
        state.pageCount += window.pages.length;
        state.windowCount += 1;
        state.extractedTextBytes += utf8ByteLength(window.text);
        state.objectCursor = window.objectCursor;
        state.lastPageNumber = window.lastPageNumber;
        await sink.onWindow(window);
        // Re-check cancellation between sink flushes so cancellation lands within one window.
        if (isAborted(options.signal)) {
            stopReason = "cancelled";
            break;
        }
    }
    // An extractor may end its stream early because it saw the abort signal itself (for example
    // while the document was still loading). The loop then finishes normally, so without this
    // check a cancelled run would be reported as "completed" — and a resumed run as fully covered.
    if (stopReason === "completed" && isAborted(options.signal)) {
        stopReason = "cancelled";
    }
    const stopDiag = stopDiagnostic(stopReason, options.documentId);
    if (stopDiag !== undefined) {
        diagnostics.push(stopDiag);
    }
    return {
        pageCount: state.pageCount,
        windowCount: state.windowCount,
        extractedTextBytes: state.extractedTextBytes,
        objectCursor: state.objectCursor,
        lastPageNumber: state.lastPageNumber,
        diagnostics,
        stopReason,
        coverage: coverageFor(stopReason, state.pageCount, diagnostics),
    };
}
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
import type { ExtractionCapabilityStatus, ParserDependencyVersion } from "@oscharko-dev/keiko-contracts";
|
|
2
|
+
import { type PdfDocumentLike } from "../pdf-parser.js";
|
|
3
|
+
import type { OcrPageResult } from "../ocr/types.js";
|
|
4
|
+
import type { ProgressiveExtractionSource, ProgressiveExtractor } from "./progressive-extraction.js";
|
|
5
|
+
/**
 * Injected OCR callback invoked for a page without an extractable text layer.
 *
 * `pageNumber` identifies the page; `bytes` carries the page image. The progressive PDF
 * extractor currently passes an empty byte view as a placeholder.
 */
export type OcrPageFn = (input: {
    readonly pageNumber: number;
    readonly bytes: Uint8Array;
}) => Promise<OcrPageResult>;
|
|
9
|
+
/** Optional collaborators accepted by `createProgressivePdfExtractor`. */
export interface ProgressivePdfExtractorDeps {
    /** Opens the PDF for a source; when omitted the extractor uses its built-in loader. */
    readonly loadDocument?: (source: ProgressiveExtractionSource) => Promise<PdfDocumentLike>;
    /**
     * OCR capability for pages without a text layer. `ocrPage` is only called when `status`
     * is `"available"`.
     */
    readonly ocr?: {
        readonly status: ExtractionCapabilityStatus;
        readonly ocrPage: OcrPageFn;
    };
}
|
|
16
|
+
/**
 * Creates the progressive PDF extractor.
 *
 * @param deps Optional document loader and OCR capability.
 * @returns An extractor that matches PDF inputs and yields page windows.
 */
export declare function createProgressivePdfExtractor(
    deps?: ProgressivePdfExtractorDeps,
): ProgressiveExtractor;
/** Parser identifier of the progressive PDF extractor. */
export declare const PROGRESSIVE_PDF_PARSER_ID = "progressive-pdf";
/** Version string of the progressive PDF extractor. */
export declare const PROGRESSIVE_PDF_PARSER_VERSION = "1";
/** Third-party package versions the progressive PDF extractor declares. */
export declare const PROGRESSIVE_PDF_DEPENDENCY_VERSIONS: readonly ParserDependencyVersion[];
|
|
20
|
+
//# sourceMappingURL=progressive-pdf.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"progressive-pdf.d.ts","sourceRoot":"","sources":["../../../src/parsers/large-document/progressive-pdf.ts"],"names":[],"mappings":"AAaA,OAAO,KAAK,EAEV,0BAA0B,EAC1B,uBAAuB,EAExB,MAAM,+BAA+B,CAAC;AAIvC,OAAO,EAA2C,KAAK,eAAe,EAAE,MAAM,kBAAkB,CAAC;AACjG,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,iBAAiB,CAAC;AAIrD,OAAO,KAAK,EAEV,2BAA2B,EAE3B,oBAAoB,EACrB,MAAM,6BAA6B,CAAC;AASrC,MAAM,MAAM,SAAS,GAAG,CAAC,KAAK,EAAE;IAC9B,QAAQ,CAAC,UAAU,EAAE,MAAM,CAAC;IAC5B,QAAQ,CAAC,KAAK,EAAE,UAAU,CAAC;CAC5B,KAAK,OAAO,CAAC,aAAa,CAAC,CAAC;AAE7B,MAAM,WAAW,2BAA2B;IAE1C,QAAQ,CAAC,YAAY,CAAC,EAAE,CAAC,MAAM,EAAE,2BAA2B,KAAK,OAAO,CAAC,eAAe,CAAC,CAAC;IAG1F,QAAQ,CAAC,GAAG,CAAC,EAAE;QAAE,QAAQ,CAAC,MAAM,EAAE,0BAA0B,CAAC;QAAC,QAAQ,CAAC,OAAO,EAAE,SAAS,CAAA;KAAE,CAAC;CAC7F;AA4LD,wBAAgB,6BAA6B,CAC3C,IAAI,GAAE,2BAAgC,GACrC,oBAAoB,CAYtB;AAED,eAAO,MAAM,yBAAyB,oBAAY,CAAC;AACnD,eAAO,MAAM,8BAA8B,MAAiB,CAAC;AAC7D,eAAO,MAAM,mCAAmC,oCAAsB,CAAC"}
|
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
// Progressive PDF text-layer extractor (Epic #1160, Issue #1286).
|
|
2
|
+
//
|
|
3
|
+
// Reuses the existing pdfjs loader and per-page streaming text reader from `pdf-parser.ts`; the
|
|
4
|
+
// only behavior change is that pages are emitted in bounded windows instead of accumulated into
|
|
5
|
+
// one result. pdfjs-dist requires the whole byte buffer to open a document (documented upstream
|
|
6
|
+
// limitation), so the source's `loadFullBuffer` is used — but the extracted text, page records,
|
|
7
|
+
// and (downstream) chunks are flushed per window, so the extraction working set does not grow
|
|
8
|
+
// with document size and useful work begins after the first window.
|
|
9
|
+
//
|
|
10
|
+
// No-text-layer pages degrade gracefully: when an injected OCR capability is available, the
|
|
11
|
+
// page's OCR text is indexed with page provenance; otherwise a content-free partial-coverage
|
|
12
|
+
// quality warning is recorded and the page is skipped. Missing OCR never fails the job.
|
|
13
|
+
import { LARGE_DOCUMENT_DIAGNOSTIC_CODES } from "@oscharko-dev/keiko-contracts";
|
|
14
|
+
import { buildParserOptions } from "../registry.js";
|
|
15
|
+
import { loadPdfDocumentFromSource, readPageText } from "../pdf-parser.js";
|
|
16
|
+
import { largeDocumentDiagnostic } from "./diagnostics.js";
|
|
17
|
+
import { WindowTextBuilder } from "./window-builder.js";
|
|
18
|
+
// Identifier and version reported by the progressive PDF extractor.
const PARSER_ID = "progressive-pdf";
const PARSER_VERSION = "1";
// Third-party package versions declared by the extractor; the list and its entry are frozen.
const PDFJS_DEPENDENCY = Object.freeze({ packageName: "pdfjs-dist", version: "6.0.227" });
const DEPENDENCY_VERSIONS = Object.freeze([PDFJS_DEPENDENCY]);
|
|
23
|
+
/**
 * Builds the parser-input record passed to the per-page text reader.
 *
 * @param documentId Document the page belongs to.
 * @returns A record with the document id, an empty byte view, and the fixed PDF extension and
 *          media type.
 */
function selectionInput(documentId) {
    const placeholderBytes = new Uint8Array(0);
    return {
        documentId,
        bytes: placeholderBytes,
        extension: "pdf",
        mediaType: "application/pdf",
    };
}
|
|
31
|
+
/**
 * Translates the large-document resource policy into options for the page reader.
 *
 * @param options Run options supplying the policy, the clock and (optionally) the abort signal.
 * @returns The result of `buildParserOptions` for the mapped limits. A `signal` entry is only
 *          included when the run options carry one.
 */
function parserOptionsFromPolicy(options) {
    const { policy, signal } = options;
    const settings = {
        maxBytes: policy.maxRawFileBytes,
        maxUnitsPerDocument: policy.maxParserUnits,
        maxObjectsPerDocument: policy.maxParserObjects,
        timeoutMs: policy.maxWallClockMs,
        now: options.now,
    };
    if (signal !== undefined) {
        settings.signal = signal;
    }
    return buildParserOptions(settings);
}
|
|
41
|
+
/**
 * Attempts to recover text for a page without a text layer through the injected OCR capability.
 *
 * @param deps Extractor dependencies; `deps.ocr` is used only when its status is `"available"`.
 * @param documentId Document the page belongs to.
 * @param pageNumber Number of the page.
 * @returns Always a `diagnostic`, plus `text` when OCR succeeded with non-blank text:
 *          - no available OCR: `OCR_CAPABILITY_UNAVAILABLE` warning, no text;
 *          - OCR failed or returned blank text: `PARTIAL_COVERAGE` warning, no text;
 *          - OCR returned text: that text with an informational `PARTIAL_COVERAGE` diagnostic.
 */
async function ocrTextForPage(deps, documentId, pageNumber) {
    const capability = deps.ocr;
    if (capability?.status !== "available") {
        return {
            diagnostic: largeDocumentDiagnostic(LARGE_DOCUMENT_DIAGNOSTIC_CODES.OCR_CAPABILITY_UNAVAILABLE, "page has no extractable text layer and no OCR capability is configured; indexed with partial coverage", documentId, "warning", pageNumber),
        };
    }
    // OCR needs a rasterized page image, and Keiko does not bundle a rasterizer. The empty byte
    // view stands in for that image; the injected adapter is given the page number.
    const pageImagePlaceholder = new Uint8Array(0);
    const result = await capability.ocrPage({ pageNumber, bytes: pageImagePlaceholder });
    const recovered = result.ok && result.text.trim().length > 0;
    if (!recovered) {
        return {
            diagnostic: largeDocumentDiagnostic(LARGE_DOCUMENT_DIAGNOSTIC_CODES.PARTIAL_COVERAGE, "OCR returned no text for a no-text-layer page; indexed with partial coverage", documentId, "warning", pageNumber),
        };
    }
    return { text: result.text, diagnostic: ocrIndexedDiagnostic(documentId, pageNumber) };
}
|
|
59
|
+
/**
 * Builds the informational diagnostic recorded when a page's text came from OCR.
 *
 * @param documentId Document the page belongs to.
 * @param pageNumber Number of the page.
 * @returns A `PARTIAL_COVERAGE` diagnostic with severity `"info"`.
 */
function ocrIndexedDiagnostic(documentId, pageNumber) {
    const code = LARGE_DOCUMENT_DIAGNOSTIC_CODES.PARTIAL_COVERAGE;
    const message = "page text recovered through the injected OCR capability with page provenance";
    return largeDocumentDiagnostic(code, message, documentId, "info", pageNumber);
}
|
|
62
|
+
/**
 * Reads one page's text layer and, when the page yields no text, falls back to the injected OCR
 * capability. A missing or unsuccessful OCR capability is reported as a diagnostic rather than
 * raised. Rejections from `doc.getPage`, the page reader or the OCR callback are not caught here.
 *
 * @param doc Opened PDF document.
 * @param pageNumber Number of the page to read.
 * @param ctx Extractor dependencies plus the `documentId`.
 * @param parserOptions Options forwarded to the page reader.
 * @param startedAt Clock reading taken when extraction began, forwarded to the page reader.
 * @param emittedUnits Count of units emitted so far, forwarded to the page reader.
 * @param scannedObjects Count of objects scanned so far, forwarded to the page reader.
 * @returns `scannedObjects` as reported by the page reader, the collected `diagnostics`, and
 *          `text` when either the text layer or OCR produced some.
 */
async function readOrOcrPage(doc, pageNumber, ctx, parserOptions, startedAt, emittedUnits, scannedObjects) {
    const page = await doc.getPage(pageNumber);
    const readContext = {
        input: selectionInput(ctx.documentId),
        options: parserOptions,
        startedAt,
        emittedUnits,
        scannedObjects,
    };
    const read = await readPageText(page, readContext);
    const diagnostics = read.diagnostic !== undefined ? [read.diagnostic] : [];
    if (read.text.length > 0) {
        return { text: read.text, scannedObjects: read.scannedObjects, diagnostics };
    }
    // Empty text layer: try OCR; its diagnostic is recorded whether or not text came back.
    const ocr = await ocrTextForPage(ctx, ctx.documentId, pageNumber);
    diagnostics.push(ocr.diagnostic);
    if (ocr.text === undefined) {
        return { scannedObjects: read.scannedObjects, diagnostics };
    }
    return { text: ocr.text, scannedObjects: read.scannedObjects, diagnostics };
}
|
|
85
|
+
/**
 * Extracts pages `firstPage..lastPage` (inclusive) into a single window and advances the shared
 * extraction state.
 *
 * Mutates `state`: `scannedObjects` after every page, `emittedUnits` for every page that
 * contributed text, and `cursor`, `anyPageEmitted` and `windowIndex` once the window is built.
 *
 * @param doc Opened PDF document.
 * @param firstPage First page number of the window.
 * @param lastPage Last page number of the window.
 * @param state Mutable extraction state shared across windows.
 * @param ctx Extractor dependencies plus the `documentId`.
 * @param parserOptions Options forwarded to the page reader.
 * @param startedAt Clock reading taken when extraction began.
 * @returns The window for this page range.
 */
async function extractPdfWindow(doc, firstPage, lastPage, state, ctx, parserOptions, startedAt) {
    const builder = new WindowTextBuilder(ctx.documentId, state.cursor, state.anyPageEmitted);
    const windowDiagnostics = [];
    let pageNumber = firstPage;
    while (pageNumber <= lastPage) {
        const outcome = await readOrOcrPage(doc, pageNumber, ctx, parserOptions, startedAt, state.emittedUnits, state.scannedObjects);
        state.scannedObjects = outcome.scannedObjects;
        windowDiagnostics.push(...outcome.diagnostics);
        const pageText = outcome.text;
        // Pages without any text are left out of the window but still count as scanned.
        if (pageText !== undefined && pageText.length > 0) {
            builder.addPage(pageNumber, pageText);
            state.emittedUnits += 1;
        }
        pageNumber += 1;
    }
    state.cursor = builder.nextCursor;
    state.anyPageEmitted = builder.hasEmittedAnyPage;
    const pages = builder.snapshotPages();
    const units = builder.snapshotUnits();
    const text = builder.text();
    const characterStart = builder.characterStart;
    const window = {
        windowIndex: state.windowIndex,
        pages,
        units,
        text,
        characterStart,
        objectCursor: state.scannedObjects,
        lastPageNumber: lastPage,
        diagnostics: windowDiagnostics,
    };
    // The index is consumed by this window; the next window gets the following one.
    state.windowIndex += 1;
    return window;
}
|
|
112
|
+
/**
 * Async generator that opens the PDF once and yields it as consecutive page windows.
 *
 * Extraction starts on the page after `options.resumeFromPage` (default 0) and proceeds in
 * steps of `options.policy.extractionWindowPages` pages. The generator ends without yielding
 * further windows once the abort signal is set.
 *
 * @param loadDocument Function that opens the document for `source`.
 * @param ctx Extractor dependencies plus the `documentId`.
 * @param source Byte access to the document.
 * @param options Run options: policy, clock, abort signal and resume state.
 * @yields One window per page range, in page order.
 */
async function* pdfExtractWindows(loadDocument, ctx, source, options) {
    const parserOptions = parserOptionsFromPolicy(options);
    const doc = await loadDocument(source);
    const resumeFromPage = options.resumeFromPage ?? 0;
    // `Math.max(1, x)` alone yields NaN for a missing/NaN size (one empty window, then the loop
    // ends) and keeps fractional sizes (fractional page numbers, skipped pages). Flooring and
    // falling back to 1 keeps every step a whole number of pages; integer sizes are unaffected.
    const requestedWindowPages = Math.floor(options.policy.extractionWindowPages);
    const windowPages = Number.isNaN(requestedWindowPages) ? 1 : Math.max(1, requestedWindowPages);
    const startedAt = options.now();
    const state = {
        cursor: options.resumeCharacterStart ?? 0,
        anyPageEmitted: resumeFromPage > 0,
        scannedObjects: options.resumeObjectCursor ?? 0,
        emittedUnits: 0,
        windowIndex: options.resumeWindowIndex ?? 0,
    };
    for (let firstPage = resumeFromPage + 1; firstPage <= doc.numPages; firstPage += windowPages) {
        if (options.signal?.aborted === true) {
            return;
        }
        const lastPage = Math.min(firstPage + windowPages - 1, doc.numPages);
        yield extractPdfWindow(doc, firstPage, lastPage, state, ctx, parserOptions, startedAt);
    }
}
|
|
132
|
+
/**
 * Creates the progressive PDF extractor.
 *
 * @param deps Optional collaborators: `loadDocument` replaces the default PDF loader, and `ocr`
 *             supplies the OCR capability used for pages without a text layer.
 * @returns An extractor with strategy id `"progressive-pdf"` that matches inputs whose
 *          lower-cased extension is `"pdf"` or whose lower-cased media type is
 *          `"application/pdf"`.
 */
export function createProgressivePdfExtractor(deps = {}) {
    const loadDocument = deps.loadDocument ?? loadPdfDocumentFromSource;
    const isPdf = (input) => {
        if (input.extension.toLowerCase() === "pdf") {
            return true;
        }
        return input.mediaType.toLowerCase() === "application/pdf";
    };
    const extractWindows = (source, options) => {
        // The per-run context is the injected dependencies plus the document id of this run.
        const ctx = { ...deps, documentId: options.documentId };
        return pdfExtractWindows(loadDocument, ctx, source, options);
    };
    return {
        strategyId: "progressive-pdf",
        parserVersion: PARSER_VERSION,
        dependencyVersions: DEPENDENCY_VERSIONS,
        matches: isPdf,
        extractWindows,
    };
}
|
|
143
|
+
export const PROGRESSIVE_PDF_PARSER_ID = PARSER_ID;
|
|
144
|
+
export const PROGRESSIVE_PDF_PARSER_VERSION = PARSER_VERSION;
|
|
145
|
+
export const PROGRESSIVE_PDF_DEPENDENCY_VERSIONS = DEPENDENCY_VERSIONS;
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
import type { ProgressiveExtractionSource, ProgressiveExtractor } from "./progressive-extraction.js";
|
|
2
|
+
/**
 * Sizing parameters for the synthetic streaming source and extractor.
 * NOTE(review): field meanings below are read from the names only — confirm against the
 * implementation.
 */
export interface SyntheticStreamingConfig {
    /** Presumably the number of pages the synthetic document has. */
    readonly totalPages: number;
    /** Presumably the number of characters generated per page. */
    readonly pageChars: number;
    /** Presumably the number of pages grouped into each window. */
    readonly pagesPerWindow: number;
}
|
|
7
|
+
/**
 * Creates a `ProgressiveExtractionSource` from a synthetic sizing configuration.
 *
 * @param config Sizing parameters of the synthetic document.
 */
export declare function syntheticStreamingSource(
    config: SyntheticStreamingConfig,
): ProgressiveExtractionSource;
/**
 * Creates a `ProgressiveExtractor` from a synthetic sizing configuration.
 *
 * @param config Sizing parameters of the synthetic document.
 * @param parserVersion Optional version string for the returned extractor.
 */
export declare function syntheticProgressiveExtractor(
    config: SyntheticStreamingConfig,
    parserVersion?: string,
): ProgressiveExtractor;
|
|
9
|
+
//# sourceMappingURL=synthetic-source.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"synthetic-source.d.ts","sourceRoot":"","sources":["../../../src/parsers/large-document/synthetic-source.ts"],"names":[],"mappings":"AAeA,OAAO,KAAK,EAEV,2BAA2B,EAE3B,oBAAoB,EACrB,MAAM,6BAA6B,CAAC;AAGrC,MAAM,WAAW,wBAAwB;IACvC,QAAQ,CAAC,UAAU,EAAE,MAAM,CAAC;IAC5B,QAAQ,CAAC,SAAS,EAAE,MAAM,CAAC;IAC3B,QAAQ,CAAC,cAAc,EAAE,MAAM,CAAC;CACjC;AAyBD,wBAAgB,wBAAwB,CACtC,MAAM,EAAE,wBAAwB,GAC/B,2BAA2B,CAc7B;AAgFD,wBAAgB,6BAA6B,CAC3C,MAAM,EAAE,wBAAwB,EAChC,aAAa,SAAgB,GAC5B,oBAAoB,CAOtB"}
|