@oscharko-dev/keiko-local-knowledge 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/.tsbuildinfo +1 -0
- package/dist/bounded-document-extraction.d.ts +27 -0
- package/dist/bounded-document-extraction.d.ts.map +1 -0
- package/dist/bounded-document-extraction.js +214 -0
- package/dist/capsule-lifecycle.d.ts +33 -0
- package/dist/capsule-lifecycle.d.ts.map +1 -0
- package/dist/capsule-lifecycle.js +292 -0
- package/dist/capsule-set-lifecycle.d.ts +15 -0
- package/dist/capsule-set-lifecycle.d.ts.map +1 -0
- package/dist/capsule-set-lifecycle.js +158 -0
- package/dist/chunking/chunker-persist.d.ts +36 -0
- package/dist/chunking/chunker-persist.d.ts.map +1 -0
- package/dist/chunking/chunker-persist.js +74 -0
- package/dist/chunking/chunker-runner.d.ts +9 -0
- package/dist/chunking/chunker-runner.d.ts.map +1 -0
- package/dist/chunking/chunker-runner.js +218 -0
- package/dist/chunking/chunker.d.ts +7 -0
- package/dist/chunking/chunker.d.ts.map +1 -0
- package/dist/chunking/chunker.js +139 -0
- package/dist/chunking/citation-mapper.d.ts +4 -0
- package/dist/chunking/citation-mapper.d.ts.map +1 -0
- package/dist/chunking/citation-mapper.js +180 -0
- package/dist/chunking/index.d.ts +6 -0
- package/dist/chunking/index.d.ts.map +1 -0
- package/dist/chunking/index.js +8 -0
- package/dist/chunking/token-estimator.d.ts +3 -0
- package/dist/chunking/token-estimator.d.ts.map +1 -0
- package/dist/chunking/token-estimator.js +26 -0
- package/dist/chunking/types.d.ts +49 -0
- package/dist/chunking/types.d.ts.map +1 -0
- package/dist/chunking/types.js +26 -0
- package/dist/composition.d.ts +57 -0
- package/dist/composition.d.ts.map +1 -0
- package/dist/composition.js +310 -0
- package/dist/conversation/citation-attacher.d.ts +8 -0
- package/dist/conversation/citation-attacher.d.ts.map +1 -0
- package/dist/conversation/citation-attacher.js +55 -0
- package/dist/conversation/citation-excerpts.d.ts +4 -0
- package/dist/conversation/citation-excerpts.d.ts.map +1 -0
- package/dist/conversation/citation-excerpts.js +41 -0
- package/dist/conversation/grounded-answer-runner.d.ts +9 -0
- package/dist/conversation/grounded-answer-runner.d.ts.map +1 -0
- package/dist/conversation/grounded-answer-runner.js +61 -0
- package/dist/conversation/index.d.ts +5 -0
- package/dist/conversation/index.d.ts.map +1 -0
- package/dist/conversation/index.js +7 -0
- package/dist/conversation/model-gateway-answer-generator.d.ts +28 -0
- package/dist/conversation/model-gateway-answer-generator.d.ts.map +1 -0
- package/dist/conversation/model-gateway-answer-generator.js +105 -0
- package/dist/conversation/types.d.ts +35 -0
- package/dist/conversation/types.d.ts.map +1 -0
- package/dist/conversation/types.js +24 -0
- package/dist/discovery/discovery-runner.d.ts +23 -0
- package/dist/discovery/discovery-runner.d.ts.map +1 -0
- package/dist/discovery/discovery-runner.js +109 -0
- package/dist/discovery/extract-progressive.d.ts +17 -0
- package/dist/discovery/extract-progressive.d.ts.map +1 -0
- package/dist/discovery/extract-progressive.js +522 -0
- package/dist/discovery/extract.d.ts +26 -0
- package/dist/discovery/extract.d.ts.map +1 -0
- package/dist/discovery/extract.js +906 -0
- package/dist/discovery/glob.d.ts +10 -0
- package/dist/discovery/glob.d.ts.map +1 -0
- package/dist/discovery/glob.js +72 -0
- package/dist/discovery/index.d.ts +6 -0
- package/dist/discovery/index.d.ts.map +1 -0
- package/dist/discovery/index.js +8 -0
- package/dist/discovery/media-type.d.ts +4 -0
- package/dist/discovery/media-type.d.ts.map +1 -0
- package/dist/discovery/media-type.js +62 -0
- package/dist/discovery/persist.d.ts +63 -0
- package/dist/discovery/persist.d.ts.map +1 -0
- package/dist/discovery/persist.js +345 -0
- package/dist/discovery/test-support.d.ts +16 -0
- package/dist/discovery/test-support.d.ts.map +1 -0
- package/dist/discovery/test-support.js +127 -0
- package/dist/discovery/types.d.ts +63 -0
- package/dist/discovery/types.d.ts.map +1 -0
- package/dist/discovery/types.js +28 -0
- package/dist/discovery/walk.d.ts +12 -0
- package/dist/discovery/walk.d.ts.map +1 -0
- package/dist/discovery/walk.js +302 -0
- package/dist/errors.d.ts +13 -0
- package/dist/errors.d.ts.map +1 -0
- package/dist/errors.js +22 -0
- package/dist/evaluations/dimensions.d.ts +14 -0
- package/dist/evaluations/dimensions.d.ts.map +1 -0
- package/dist/evaluations/dimensions.js +191 -0
- package/dist/evaluations/fixtures.d.ts +18 -0
- package/dist/evaluations/fixtures.d.ts.map +1 -0
- package/dist/evaluations/fixtures.js +858 -0
- package/dist/evaluations/index.d.ts +7 -0
- package/dist/evaluations/index.d.ts.map +1 -0
- package/dist/evaluations/index.js +10 -0
- package/dist/evaluations/report.d.ts +3 -0
- package/dist/evaluations/report.d.ts.map +1 -0
- package/dist/evaluations/report.js +31 -0
- package/dist/evaluations/runner-seed.d.ts +12 -0
- package/dist/evaluations/runner-seed.d.ts.map +1 -0
- package/dist/evaluations/runner-seed.js +175 -0
- package/dist/evaluations/runner.d.ts +8 -0
- package/dist/evaluations/runner.d.ts.map +1 -0
- package/dist/evaluations/runner.js +205 -0
- package/dist/evaluations/scripted-embedding-adapter.d.ts +13 -0
- package/dist/evaluations/scripted-embedding-adapter.d.ts.map +1 -0
- package/dist/evaluations/scripted-embedding-adapter.js +163 -0
- package/dist/evaluations/types.d.ts +116 -0
- package/dist/evaluations/types.d.ts.map +1 -0
- package/dist/evaluations/types.js +27 -0
- package/dist/index.d.ts +23 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +41 -0
- package/dist/indexing/bounded-indexing.d.ts +41 -0
- package/dist/indexing/bounded-indexing.d.ts.map +1 -0
- package/dist/indexing/bounded-indexing.js +240 -0
- package/dist/indexing/checkpoint-persist.d.ts +8 -0
- package/dist/indexing/checkpoint-persist.d.ts.map +1 -0
- package/dist/indexing/checkpoint-persist.js +135 -0
- package/dist/indexing/checkpoint-resume.d.ts +20 -0
- package/dist/indexing/checkpoint-resume.d.ts.map +1 -0
- package/dist/indexing/checkpoint-resume.js +50 -0
- package/dist/indexing/embedding-batcher.d.ts +3 -0
- package/dist/indexing/embedding-batcher.d.ts.map +1 -0
- package/dist/indexing/embedding-batcher.js +390 -0
- package/dist/indexing/index.d.ts +7 -0
- package/dist/indexing/index.d.ts.map +1 -0
- package/dist/indexing/index.js +11 -0
- package/dist/indexing/job-persist.d.ts +46 -0
- package/dist/indexing/job-persist.d.ts.map +1 -0
- package/dist/indexing/job-persist.js +157 -0
- package/dist/indexing/job-resume.d.ts +4 -0
- package/dist/indexing/job-resume.d.ts.map +1 -0
- package/dist/indexing/job-resume.js +14 -0
- package/dist/indexing/orchestrator.d.ts +3 -0
- package/dist/indexing/orchestrator.d.ts.map +1 -0
- package/dist/indexing/orchestrator.js +1151 -0
- package/dist/indexing/types.d.ts +156 -0
- package/dist/indexing/types.d.ts.map +1 -0
- package/dist/indexing/types.js +30 -0
- package/dist/indexing/vector-persist.d.ts +32 -0
- package/dist/indexing/vector-persist.d.ts.map +1 -0
- package/dist/indexing/vector-persist.js +105 -0
- package/dist/parsers/_internal.d.ts +20 -0
- package/dist/parsers/_internal.d.ts.map +1 -0
- package/dist/parsers/_internal.js +122 -0
- package/dist/parsers/csv-parser.d.ts +3 -0
- package/dist/parsers/csv-parser.d.ts.map +1 -0
- package/dist/parsers/csv-parser.js +202 -0
- package/dist/parsers/docx-parser.d.ts +3 -0
- package/dist/parsers/docx-parser.d.ts.map +1 -0
- package/dist/parsers/docx-parser.js +390 -0
- package/dist/parsers/html-parser.d.ts +3 -0
- package/dist/parsers/html-parser.d.ts.map +1 -0
- package/dist/parsers/html-parser.js +310 -0
- package/dist/parsers/index.d.ts +15 -0
- package/dist/parsers/index.d.ts.map +1 -0
- package/dist/parsers/index.js +41 -0
- package/dist/parsers/json-parser.d.ts +3 -0
- package/dist/parsers/json-parser.d.ts.map +1 -0
- package/dist/parsers/json-parser.js +192 -0
- package/dist/parsers/large-document/capability-discovery.d.ts +27 -0
- package/dist/parsers/large-document/capability-discovery.d.ts.map +1 -0
- package/dist/parsers/large-document/capability-discovery.js +76 -0
- package/dist/parsers/large-document/diagnostics.d.ts +3 -0
- package/dist/parsers/large-document/diagnostics.d.ts.map +1 -0
- package/dist/parsers/large-document/diagnostics.js +11 -0
- package/dist/parsers/large-document/index.d.ts +15 -0
- package/dist/parsers/large-document/index.d.ts.map +1 -0
- package/dist/parsers/large-document/index.js +10 -0
- package/dist/parsers/large-document/legacy-format.d.ts +5 -0
- package/dist/parsers/large-document/legacy-format.d.ts.map +1 -0
- package/dist/parsers/large-document/legacy-format.js +25 -0
- package/dist/parsers/large-document/preflight.d.ts +9 -0
- package/dist/parsers/large-document/preflight.d.ts.map +1 -0
- package/dist/parsers/large-document/preflight.js +43 -0
- package/dist/parsers/large-document/progressive-extraction.d.ts +55 -0
- package/dist/parsers/large-document/progressive-extraction.d.ts.map +1 -0
- package/dist/parsers/large-document/progressive-extraction.js +123 -0
- package/dist/parsers/large-document/progressive-pdf.d.ts +20 -0
- package/dist/parsers/large-document/progressive-pdf.d.ts.map +1 -0
- package/dist/parsers/large-document/progressive-pdf.js +145 -0
- package/dist/parsers/large-document/synthetic-source.d.ts +9 -0
- package/dist/parsers/large-document/synthetic-source.d.ts.map +1 -0
- package/dist/parsers/large-document/synthetic-source.js +101 -0
- package/dist/parsers/large-document/window-builder.d.ts +24 -0
- package/dist/parsers/large-document/window-builder.d.ts.map +1 -0
- package/dist/parsers/large-document/window-builder.js +75 -0
- package/dist/parsers/ocr/index.d.ts +4 -0
- package/dist/parsers/ocr/index.d.ts.map +1 -0
- package/dist/parsers/ocr/index.js +4 -0
- package/dist/parsers/ocr/null-ocr-adapter.d.ts +3 -0
- package/dist/parsers/ocr/null-ocr-adapter.d.ts.map +1 -0
- package/dist/parsers/ocr/null-ocr-adapter.js +14 -0
- package/dist/parsers/ocr/ocr-pipeline-parser.d.ts +8 -0
- package/dist/parsers/ocr/ocr-pipeline-parser.d.ts.map +1 -0
- package/dist/parsers/ocr/ocr-pipeline-parser.js +147 -0
- package/dist/parsers/ocr/types.d.ts +16 -0
- package/dist/parsers/ocr/types.d.ts.map +1 -0
- package/dist/parsers/ocr/types.js +4 -0
- package/dist/parsers/parser-test-fixtures.d.ts +28 -0
- package/dist/parsers/parser-test-fixtures.d.ts.map +1 -0
- package/dist/parsers/parser-test-fixtures.js +139 -0
- package/dist/parsers/pdf-parser.d.ts +43 -0
- package/dist/parsers/pdf-parser.d.ts.map +1 -0
- package/dist/parsers/pdf-parser.js +388 -0
- package/dist/parsers/registry.d.ts +8 -0
- package/dist/parsers/registry.d.ts.map +1 -0
- package/dist/parsers/registry.js +57 -0
- package/dist/parsers/text-parser.d.ts +3 -0
- package/dist/parsers/text-parser.d.ts.map +1 -0
- package/dist/parsers/text-parser.js +214 -0
- package/dist/parsers/types.d.ts +53 -0
- package/dist/parsers/types.d.ts.map +1 -0
- package/dist/parsers/types.js +21 -0
- package/dist/parsers/unsupported-parser.d.ts +4 -0
- package/dist/parsers/unsupported-parser.d.ts.map +1 -0
- package/dist/parsers/unsupported-parser.js +97 -0
- package/dist/parsers/xlsx-parser.d.ts +3 -0
- package/dist/parsers/xlsx-parser.d.ts.map +1 -0
- package/dist/parsers/xlsx-parser.js +425 -0
- package/dist/privacy/audit-emitter.d.ts +5 -0
- package/dist/privacy/audit-emitter.d.ts.map +1 -0
- package/dist/privacy/audit-emitter.js +93 -0
- package/dist/privacy/diagnostic-redactor.d.ts +2 -0
- package/dist/privacy/diagnostic-redactor.d.ts.map +1 -0
- package/dist/privacy/diagnostic-redactor.js +153 -0
- package/dist/privacy/index.d.ts +5 -0
- package/dist/privacy/index.d.ts.map +1 -0
- package/dist/privacy/index.js +6 -0
- package/dist/privacy/retention-applier.d.ts +5 -0
- package/dist/privacy/retention-applier.d.ts.map +1 -0
- package/dist/privacy/retention-applier.js +88 -0
- package/dist/privacy/types.d.ts +98 -0
- package/dist/privacy/types.d.ts.map +1 -0
- package/dist/privacy/types.js +12 -0
- package/dist/qualityIntelligence/capsuleCorpus.d.ts +27 -0
- package/dist/qualityIntelligence/capsuleCorpus.d.ts.map +1 -0
- package/dist/qualityIntelligence/capsuleCorpus.js +58 -0
- package/dist/qualityIntelligence/index.d.ts +3 -0
- package/dist/qualityIntelligence/index.d.ts.map +1 -0
- package/dist/qualityIntelligence/index.js +5 -0
- package/dist/qualityIntelligence/qiHandoff.d.ts +36 -0
- package/dist/qualityIntelligence/qiHandoff.d.ts.map +1 -0
- package/dist/qualityIntelligence/qiHandoff.js +82 -0
- package/dist/retrieval/answer-grounding.d.ts +9 -0
- package/dist/retrieval/answer-grounding.d.ts.map +1 -0
- package/dist/retrieval/answer-grounding.js +31 -0
- package/dist/retrieval/context-pack-assembler.d.ts +24 -0
- package/dist/retrieval/context-pack-assembler.d.ts.map +1 -0
- package/dist/retrieval/context-pack-assembler.js +50 -0
- package/dist/retrieval/index.d.ts +6 -0
- package/dist/retrieval/index.d.ts.map +1 -0
- package/dist/retrieval/index.js +9 -0
- package/dist/retrieval/retrieval-runner.d.ts +10 -0
- package/dist/retrieval/retrieval-runner.d.ts.map +1 -0
- package/dist/retrieval/retrieval-runner.js +163 -0
- package/dist/retrieval/scoped-vector-search.d.ts +24 -0
- package/dist/retrieval/scoped-vector-search.d.ts.map +1 -0
- package/dist/retrieval/scoped-vector-search.js +864 -0
- package/dist/retrieval/types.d.ts +28 -0
- package/dist/retrieval/types.d.ts.map +1 -0
- package/dist/retrieval/types.js +33 -0
- package/dist/section-path-hash.d.ts +3 -0
- package/dist/section-path-hash.d.ts.map +1 -0
- package/dist/section-path-hash.js +9 -0
- package/dist/source-lifecycle.d.ts +14 -0
- package/dist/source-lifecycle.d.ts.map +1 -0
- package/dist/source-lifecycle.js +155 -0
- package/dist/source-routing-validation.d.ts +11 -0
- package/dist/source-routing-validation.d.ts.map +1 -0
- package/dist/source-routing-validation.js +140 -0
- package/dist/store-content-cipher.d.ts +11 -0
- package/dist/store-content-cipher.d.ts.map +1 -0
- package/dist/store-content-cipher.js +67 -0
- package/dist/store-content-encryption.d.ts +12 -0
- package/dist/store-content-encryption.d.ts.map +1 -0
- package/dist/store-content-encryption.js +275 -0
- package/dist/store-paths.d.ts +6 -0
- package/dist/store-paths.d.ts.map +1 -0
- package/dist/store-paths.js +61 -0
- package/dist/store.d.ts +30 -0
- package/dist/store.d.ts.map +1 -0
- package/dist/store.js +219 -0
- package/dist/testing.d.ts +47 -0
- package/dist/testing.d.ts.map +1 -0
- package/dist/testing.js +170 -0
- package/dist/version.d.ts +2 -0
- package/dist/version.d.ts.map +1 -0
- package/dist/version.js +4 -0
- package/package.json +43 -0
|
@@ -0,0 +1,906 @@
|
|
|
1
|
+
// Per-file extraction (Epic #189, Issue #194). Given a discovered file, a parser registry,
|
|
2
|
+
// and an open KnowledgeStore, this module:
|
|
3
|
+
//
|
|
4
|
+
// 1. Resolves the file's realPath through the WorkspaceFs port and re-asserts the
|
|
5
|
+
// realpath-containment gate (defence in depth — the walker already filtered, but a
|
|
6
|
+
// consumer calling extractDocument() directly must not bypass the boundary).
|
|
7
|
+
// 2. Reads bytes via WorkspaceFs.readFileBytes (the boundary-checked byte-read path).
|
|
8
|
+
// 3. Computes the content hash (SHA-256 hex) over the raw bytes.
|
|
9
|
+
// 4. Detects the incremental fast-path: if a documents row with the same id already has
|
|
10
|
+
// this content_hash AND status="extracted"/"unsupported", we skip the parse entirely
|
|
11
|
+
// and leave last_extracted_at untouched.
|
|
12
|
+
// 5. Resolves a parser through the registry; rejects an oversized file BEFORE we hand it
|
|
13
|
+
// to the parser (the OVERSIZED_FILE diagnostic is the same code parsers emit).
|
|
14
|
+
// 6. Inside a single transaction: REPLACEs the documents row, deletes prior dependent
|
|
15
|
+
// rows, then inserts the new pages/sections/parsed_units/diagnostics.
|
|
16
|
+
import { createHash } from "node:crypto";
|
|
17
|
+
import { DEFAULT_EXTRACTION_CAPABILITY_AVAILABILITY, DEFAULT_LARGE_DOCUMENT_RESOURCE_POLICY, isSafeScopePath, } from "@oscharko-dev/keiko-contracts";
|
|
18
|
+
import { isDenied } from "@oscharko-dev/keiko-workspace";
|
|
19
|
+
import { buildParserOptions, createProgressivePdfExtractor, classifyLargeDocument, isLegacyBinaryOfficeFormat, legacyFormatDiagnostic, unsupportedParser, usesProgressivePath, } from "../parsers/index.js";
|
|
20
|
+
import { DEFAULT_CHUNKING_STRATEGY_KEY } from "../chunking/types.js";
|
|
21
|
+
import { extractDocumentProgressive, selectProgressiveExtractor, } from "./extract-progressive.js";
|
|
22
|
+
import { redactDiagnosticMessage } from "../privacy/diagnostic-redactor.js";
|
|
23
|
+
import { basenameOf, extensionOf, mediaTypeFor } from "./media-type.js";
|
|
24
|
+
import { compileGlobList, matchesAny } from "./glob.js";
|
|
25
|
+
import { deleteDependentRows, insertDiagnosticRow, insertDocumentRow, insertDocumentTextRow, insertPageRow, insertParsedUnitRow, insertSectionRow, readExistingDocumentRow, } from "./persist.js";
|
|
26
|
+
import { documentIdFor, } from "./types.js";
|
|
27
|
+
// ─── Path helpers (re-derived to keep extract.ts self-contained for the realpath gate) ──
|
|
28
|
+
// On Windows, WorkspaceFs.realPath() may return backslash-separated paths
|
|
29
|
+
// (e.g. C:\Users\workspace\file). Normalise both sides to forward slashes so
|
|
30
|
+
// containment checks work cross-platform.
|
|
31
|
+
// Returns `p` with every backslash replaced by a forward slash, so that paths
// using either separator can be compared as plain strings.
function normaliseSep(p) {
    return p.split("\\").join("/");
}
|
|
34
|
+
// Reports whether `absolutePath` is `absoluteRoot` itself or lies beneath it.
// Both arguments are separator-normalised first; the root is compared with a
// trailing "/" so that "/w" does not count as containing "/wx/file".
function isContained(absoluteRoot, absolutePath) {
    const root = normaliseSep(absoluteRoot);
    const candidate = normaliseSep(absolutePath);
    const rootWithSlash = root.endsWith("/") ? root : `${root}/`;
    return candidate === root || candidate.startsWith(rootWithSlash);
}
|
|
42
|
+
// Appends `rel` to `root`, inserting a single "/" unless `root` already ends
// with one.
function joinAbs(root, rel) {
    const separator = root.endsWith("/") ? "" : "/";
    return `${root}${separator}${rel}`;
}
|
|
47
|
+
// Expresses `absolutePath` relative to `absoluteRoot` using forward slashes.
// Returns "" when the two are equal, and the normalised path unchanged when it
// does not sit under the root.
function toPosixRelative(absoluteRoot, absolutePath) {
    const root = normaliseSep(absoluteRoot);
    const target = normaliseSep(absolutePath);
    if (target === root) {
        return "";
    }
    const rootWithSlash = root.endsWith("/") ? root : `${root}/`;
    if (!target.startsWith(rootWithSlash)) {
        return target;
    }
    return target.slice(rootWithSlash.length);
}
|
|
55
|
+
// Picks the root directory of a source's scope: `repositoryRoot` for a
// "repository" scope, `rootPath` for every other scope kind.
function scopeRoot(source) {
    const { scope } = source;
    return scope.kind === "repository" ? scope.repositoryRoot : scope.rootPath;
}
|
|
63
|
+
// Returns the SHA-256 digest of `bytes` as a hex string.
function hashBytes(bytes) {
    const hasher = createHash("sha256");
    hasher.update(bytes);
    return hasher.digest("hex");
}
|
|
66
|
+
// Display name for a file: whatever basenameOf() yields for the path, or the
// path itself when that result is empty.
function safeDisplay(relativePath) {
    const base = basenameOf(relativePath);
    if (base.length !== 0) {
        return base;
    }
    return relativePath;
}
|
|
70
|
+
// Validates a caller-supplied relative path.
// Returns the separator-normalised path when it is acceptable. Otherwise returns
// an INVALID_SCOPE error object that carries the path exactly as it was passed in.
// A path is rejected when, after normalisation, it starts with "/" or fails
// isSafeScopePath().
function safeRelativePath(relativePath) {
    const posixPath = normaliseSep(relativePath);
    // The leading-slash test runs first, so isSafeScopePath is not called for
    // paths that start with "/".
    const rejected = posixPath.startsWith("/") || !isSafeScopePath(posixPath);
    if (rejected) {
        return {
            code: "INVALID_SCOPE",
            message: "file path failed the selected-scope policy",
            relativePath,
        };
    }
    return posixPath;
}
|
|
88
|
+
// The path prefix used when redacting diagnostic messages for `source`:
// the source's scope root.
function redactionPrefixFor(source) {
    const prefix = scopeRoot(source);
    return prefix;
}
|
|
91
|
+
// Runs `message` through redactDiagnosticMessage(), using the prefix that
// redactionPrefixFor() supplies for `source`.
function redactMessage(message, source) {
    const prefix = redactionPrefixFor(source);
    return redactDiagnosticMessage(message, prefix);
}
|
|
94
|
+
// Returns a copy of `diagnostic` whose `message` has been redacted for `source`.
// The input object is not modified.
function redactDiagnostic(diagnostic, source) {
    const copy = { ...diagnostic };
    copy.message = redactMessage(diagnostic.message, source);
    return copy;
}
|
|
100
|
+
// Applies redactDiagnostic() to each entry of `diagnostics` and returns the
// results as a new array, in the same order.
function redactDiagnostics(diagnostics, source) {
    const redacted = [];
    for (const entry of diagnostics) {
        redacted.push(redactDiagnostic(entry, source));
    }
    return redacted;
}
|
|
103
|
+
// Returns a copy of `parserResult` in which the `diagnostics` array has been
// replaced by its redacted counterpart; all other fields are carried over as-is.
function redactParserResult(parserResult, source) {
    const copy = { ...parserResult };
    copy.diagnostics = redactDiagnostics(parserResult.diagnostics, source);
    return copy;
}
|
|
109
|
+
// ─── Failure / unsupported helpers ───────────────────────────────────────────
|
|
110
|
+
// Records a failed extraction in one transaction: writes the documents row,
// deletes the document's dependent rows, then stores `diagnostic` under the id
// "<documentId>#d0" with `now()` as its creation time.
//
// deps       - provides the database handle at deps.store._internal.db
// params     - capsuleId and source.id are read from it
// documentId - id of the document being recorded
// document   - record whose fields populate the documents row
// diagnostic - the failure diagnostic to store
// now        - clock callback
//
// Throws whatever error interrupted the transaction, after attempting a rollback.
function persistFailureRow(deps, params, documentId, document, diagnostic, now) {
    const db = deps.store._internal.db;
    db.exec("BEGIN");
    try {
        insertDocumentRow(db, {
            id: documentId,
            capsuleId: params.capsuleId,
            sourceId: String(params.source.id),
            documentPath: document.documentPath,
            sizeBytes: document.sizeBytes,
            mediaType: document.mediaType,
            contentHash: document.contentHash,
            parserId: document.parser.parserId,
            parserVersion: document.parser.parserVersion,
            lastExtractedAt: document.lastExtractedAt,
            status: document.status,
            safeDisplayName: document.safeDisplayName,
        });
        deleteDependentRows(db, params.capsuleId, documentId);
        insertDiagnosticRow(db, {
            id: `${String(documentId)}#d0`,
            capsuleId: params.capsuleId,
            diagnostic,
            createdAt: now(),
        });
        db.exec("COMMIT");
    }
    catch (cause) {
        // ROLLBACK can itself throw, e.g. when the engine has already ended the
        // transaction (SQLite does this for some errors). That secondary error
        // must not replace the failure that actually interrupted the write.
        try {
            db.exec("ROLLBACK");
        }
        catch {
            // Deliberately ignored: `cause` is the error worth reporting.
        }
        throw cause;
    }
}
|
|
142
|
+
// GRD-010: decide whether to persist a cascade-deleting failure row. A TRANSIENT IO failure
|
|
143
|
+
// (READ_FAILED / STAT_FAILED) on an incremental refresh must NOT destroy a previously-good index
|
|
144
|
+
// — persisting overwrites the document row and CASCADE-deletes its chunks+vectors. When a prior
|
|
145
|
+
// NON-failed row exists, skip persistence so a momentary lock / NFS hiccup / permission flap
|
|
146
|
+
// preserves retrievable content (the orchestrator then reports a non-destructive skip). Permanent
|
|
147
|
+
// failures (MALFORMED_INPUT, PARSER_FAILED, OVERSIZED_FILE, …) and first-time failures still persist.
|
|
148
|
+
// Decides whether a failure row should be written for this document.
// Returns false when `optionPersist` is falsy. Non-transient errors are always
// persisted. A READ_FAILED / STAT_FAILED error is persisted only when no
// document row exists yet, or when the existing row already has status "failed".
function shouldPersistFailureRow(deps, params, documentId, error, optionPersist) {
    if (!optionPersist) {
        return false;
    }
    const transientCodes = ["READ_FAILED", "STAT_FAILED"];
    if (!transientCodes.includes(error.code)) {
        return true;
    }
    const prior = readExistingDocumentRow(deps.store._internal.db, params.capsuleId, documentId);
    if (prior === undefined) {
        return true;
    }
    return prior.status === "failed";
}
|
|
157
|
+
// Builds the result object for a file whose extraction failed and, when
// shouldPersistFailureRow() allows it, writes the failure to the store.
//
// The error message is redacted once and reused for both the diagnostic and the
// returned error. The synthetic document record has status "failed", an empty
// content hash and the placeholder parser { parserId: "none", parserVersion: "0" }.
//
// options.persist (default true) is forwarded to shouldPersistFailureRow().
function buildFailureResult(deps, params, documentId, error, options = { persist: true }) {
    const { now } = deps.store._internal;
    const redactedMessage = redactMessage(error.message, params.source);
    const { capsuleId, source, file } = params;
    const diagnostic = {
        severity: "error",
        code: error.code,
        message: redactedMessage,
        documentId,
    };
    const document = {
        id: documentId,
        capsuleId,
        sourceId: source.id,
        documentPath: file.relativePath,
        sizeBytes: file.sizeBytes,
        mediaType: mediaTypeFor(extensionOf(file.relativePath)),
        contentHash: "",
        parser: { parserId: "none", parserVersion: "0" },
        lastExtractedAt: now(),
        status: "failed",
        safeDisplayName: safeDisplay(file.relativePath),
    };
    const mustPersist = shouldPersistFailureRow(deps, params, documentId, error, options.persist);
    if (mustPersist) {
        persistFailureRow(deps, params, documentId, document, diagnostic, now);
    }
    return {
        capsuleId,
        sourceId: source.id,
        relativePath: file.relativePath,
        outcome: {
            kind: "failed",
            document,
            error: { ...error, message: redactedMessage, relativePath: file.relativePath },
        },
        diagnostics: [diagnostic],
    };
}
|
|
195
|
+
// ─── Persist helpers (run inside the per-file transaction) ───────────────────
|
|
196
|
+
// Replaces everything stored beneath a document row. Existing dependent rows
// are deleted first; then, in order, the normalised text (only when the parser
// produced one), pages, sections, parsed units and diagnostics are inserted.
// Units get the id "<documentId>#u<index>" and diagnostics "<documentId>#d<index>".
// No transaction is opened here; that is left to the caller.
function persistDependentRows(deps, capsuleId, documentId, parserResult, now) {
    const { db } = deps.store._internal;
    deleteDependentRows(db, capsuleId, documentId);
    const { normalizedText } = parserResult;
    if (normalizedText !== undefined) {
        insertDocumentTextRow(db, deps.store._internal.contentCipher, capsuleId, documentId, normalizedText);
    }
    parserResult.pages.forEach((page) => {
        insertPageRow(db, capsuleId, page);
    });
    parserResult.sections.forEach((section) => {
        insertSectionRow(db, deps.store._internal.contentCipher, capsuleId, section);
    });
    for (const [index, unit] of parserResult.units.entries()) {
        const unitId = `${String(documentId)}#u${String(index)}`;
        insertParsedUnitRow(db, deps.store._internal.contentCipher, capsuleId, unitId, unit);
    }
    for (const [index, diagnostic] of parserResult.diagnostics.entries()) {
        insertDiagnosticRow(db, {
            id: `${String(documentId)}#d${String(index)}`,
            capsuleId,
            diagnostic,
            createdAt: now(),
        });
    }
}
|
|
219
|
+
// Assembles the document record for a parsed file from `input`
// ({ documentId, params, mediaType, contentHash, parserResult, status }).
// The parser identity and the extraction timestamp come from the parser result;
// path and size come from params.file.
function buildDocumentRecord(input) {
    const { documentId, params, mediaType, contentHash, parserResult, status } = input;
    const { capsuleId, source, file } = params;
    return {
        id: documentId,
        capsuleId,
        sourceId: source.id,
        documentPath: file.relativePath,
        sizeBytes: file.sizeBytes,
        mediaType,
        contentHash,
        parser: parserResult.parser,
        lastExtractedAt: parserResult.extractedAt,
        status,
        safeDisplayName: safeDisplay(file.relativePath),
    };
}
|
|
234
|
+
// Writes a document row and all of its dependent rows in one transaction.
//
// deps         - provides the database handle at deps.store._internal.db
// params       - capsuleId and source.id are read from it
// documentId   - id of the document being written
// document     - record whose fields populate the documents row
// parserResult - forwarded to persistDependentRows()
// now          - clock callback, forwarded to persistDependentRows()
//
// Throws whatever error interrupted the transaction, after attempting a rollback.
function persistDocumentAndDependents(deps, params, documentId, document, parserResult, now) {
    const db = deps.store._internal.db;
    db.exec("BEGIN");
    try {
        insertDocumentRow(db, {
            id: documentId,
            capsuleId: params.capsuleId,
            sourceId: String(params.source.id),
            documentPath: document.documentPath,
            sizeBytes: document.sizeBytes,
            mediaType: document.mediaType,
            contentHash: document.contentHash,
            parserId: document.parser.parserId,
            parserVersion: document.parser.parserVersion,
            lastExtractedAt: document.lastExtractedAt,
            status: document.status,
            safeDisplayName: document.safeDisplayName,
        });
        persistDependentRows(deps, params.capsuleId, documentId, parserResult, now);
        db.exec("COMMIT");
    }
    catch (cause) {
        // ROLLBACK can itself throw, e.g. when the engine has already ended the
        // transaction (SQLite does this for some errors). That secondary error
        // must not replace the failure that actually interrupted the write.
        try {
            db.exec("ROLLBACK");
        }
        catch {
            // Deliberately ignored: `cause` is the error worth reporting.
        }
        throw cause;
    }
}
|
|
260
|
+
// Directory names that hasHiddenOrGeneratedParent() treats as hidden or
// generated when they appear as a parent segment of a path.
const HIDDEN_OR_GENERATED_DIRS = new Set([
    // dot-prefixed names
    ".git", ".hg", ".svn", ".next", ".turbo",
    // plain names
    "node_modules", "dist", "build", "coverage", "out",
]);
|
|
272
|
+
// Turns a source scope into the policy object used for selection checks, or
// returns an INVALID_SCOPE error object when a path fails the safe-path gate.
//
//   "folder"      root = scope.rootPath, recursion as configured, globs compiled
//   "repository"  root = scope.repositoryRoot, always recursive, globs compiled
//   anything else root = scope.rootPath, non-recursive, no globs, plus an
//                 `explicitFiles` set built from scope.files (each entry must
//                 pass safeRelativePath())
function deriveScopePolicy(scope) {
    switch (scope.kind) {
        case "folder": {
            if (isSafeScopePath(scope.rootPath)) {
                return {
                    rootPath: scope.rootPath,
                    recursive: scope.recursive,
                    includeGlobs: compileGlobList(scope.includeGlobs),
                    excludeGlobs: compileGlobList(scope.excludeGlobs),
                };
            }
            return { code: "INVALID_SCOPE", message: "scope.rootPath failed the safe-path gate" };
        }
        case "repository": {
            if (isSafeScopePath(scope.repositoryRoot)) {
                return {
                    rootPath: scope.repositoryRoot,
                    recursive: true,
                    includeGlobs: compileGlobList(scope.includeGlobs),
                    excludeGlobs: compileGlobList(scope.excludeGlobs),
                };
            }
            return { code: "INVALID_SCOPE", message: "scope.repositoryRoot failed the safe-path gate" };
        }
        default:
            break;
    }
    if (!isSafeScopePath(scope.rootPath)) {
        return { code: "INVALID_SCOPE", message: "scope.rootPath failed the safe-path gate" };
    }
    const explicitFiles = new Set();
    for (const entry of scope.files) {
        const checked = safeRelativePath(entry);
        // safeRelativePath yields a string on success and an error object otherwise.
        if (typeof checked === "string") {
            explicitFiles.add(checked);
            continue;
        }
        return {
            code: "INVALID_SCOPE",
            message: `scope.files entry failed the safe-path gate: ${entry}`,
        };
    }
    return {
        rootPath: scope.rootPath,
        recursive: false,
        includeGlobs: [],
        excludeGlobs: [],
        explicitFiles,
    };
}
|
|
317
|
+
// Applies the policy's glob lists to a path: an exclude match rejects it
// outright; otherwise the result of the include match decides. The third
// argument to matchesAny() is false for the exclude list and true for the
// include list.
function matchesSourceGlobs(policy, relativePath) {
    const excluded = matchesAny(policy.excludeGlobs, relativePath, false);
    return excluded ? false : matchesAny(policy.includeGlobs, relativePath, true);
}
|
|
322
|
+
// True when any directory segment of `relativePath` (every non-empty segment
// except the last) starts with "." or is listed in HIDDEN_OR_GENERATED_DIRS.
// The final segment is never inspected.
function hasHiddenOrGeneratedParent(relativePath) {
    const parents = relativePath
        .split("/")
        .filter((part) => part.length > 0)
        .slice(0, -1);
    return parents.some((part) => part.startsWith(".") || HIDDEN_OR_GENERATED_DIRS.has(part));
}
|
|
330
|
+
// Decides whether `relativePath` belongs to the scope described by `policy`.
// Checks run in this order:
//   1. paths rejected by isDenied() are never selected;
//   2. with an explicit file list, membership in that list is the whole answer;
//   3. a non-recursive policy rejects any path containing "/";
//   4. paths under a hidden or generated directory are rejected;
//   5. otherwise the include/exclude globs decide.
function isSelectedByScope(policy, relativePath) {
    if (isDenied(relativePath)) {
        return false;
    }
    const { explicitFiles } = policy;
    if (explicitFiles !== undefined) {
        return explicitFiles.has(relativePath);
    }
    const isNested = relativePath.includes("/");
    if (isNested && !policy.recursive) {
        return false;
    }
    return hasHiddenOrGeneratedParent(relativePath) ? false : matchesSourceGlobs(policy, relativePath);
}
|
|
341
|
+
// Wraps an error object together with the flag that says whether the failure
// should be persisted.
function targetError(error, persistFailure) {
    const wrapped = { error: error, persistFailure: persistFailure };
    return wrapped;
}
|
|
344
|
+
// Validates a raw relative path against the safe-path gate and the scope policy.
// Returns the normalised path on success. On failure returns a targetError()
// wrapper with persistFailure = false, carrying either safeRelativePath()'s own
// error or an INVALID_SCOPE "outside the selected source scope" error.
function selectedRelativePath(policy, rawRelativePath) {
    const candidate = safeRelativePath(rawRelativePath);
    if (typeof candidate !== "string") {
        return targetError(candidate, false);
    }
    if (isSelectedByScope(policy, candidate)) {
        return candidate;
    }
    const outsideScope = {
        code: "INVALID_SCOPE",
        message: "file is outside the selected source scope",
        relativePath: candidate,
    };
    return targetError(outsideScope, false);
}
|
|
358
|
+
// Resolves `path` through deps.fs.realPath() and returns its result.
// If that call throws, the thrown value is discarded and a READ_FAILED
// targetError() (persistFailure = true) carrying the supplied `message` and
// `relativePath` is returned instead.
function resolveRealPathTarget(deps, path, relativePath, message) {
    let resolved;
    try {
        resolved = deps.fs.realPath(path);
    }
    catch {
        return targetError({ code: "READ_FAILED", message, relativePath }, true);
    }
    return resolved;
}
|
|
366
|
+
/**
 * Validates a resolved file path against the resolved scope root.
 *
 * @param {string} realRoot Resolved scope root.
 * @param {string} real Resolved file path.
 * @param {string} relativePath Requested relative path, used in error payloads.
 * @returns {object|undefined} A persisted PATH_ESCAPE error when `real` is not
 *   contained in `realRoot`, a persisted READ_FAILED error when the resolved
 *   location is rejected by `isDenied`, otherwise undefined.
 */
function containedRealFileTarget(realRoot, real, relativePath) {
    const persisted = (code, message) => targetError({ code, message, relativePath }, true);
    if (!isContained(realRoot, real)) {
        return persisted("PATH_ESCAPE", `realpath escapes scope root: ${relativePath}`);
    }
    // The deny check is repeated on the *resolved* location, not the requested one.
    const resolvedRelative = toPosixRelative(realRoot, real);
    if (isDenied(resolvedRelative)) {
        return persisted("READ_FAILED", "resolved file is denied by workspace policy");
    }
    return undefined;
}
|
|
384
|
+
/**
 * Turns the requested file of `params` into concrete filesystem paths.
 *
 * The scope policy is derived from `params.source.scope`, the requested path
 * is normalised and scope-checked, and both the scope root and the file are
 * resolved through `realPath` before the containment checks run.
 *
 * @param {object} deps Dependencies (`deps.fs.realPath` is used indirectly).
 * @param {object} params Request; reads `source.scope` and `file.relativePath`.
 * @returns {object} Either `{ absolutePath, requestedAbsolutePath, relativePath }`
 *   or a target error of the shape `{ error, persistFailure }`.
 */
function resolveTargetPath(deps, params) {
    const policy = deriveScopePolicy(params.source.scope);
    if ("code" in policy) {
        // deriveScopePolicy returned an error descriptor rather than a policy.
        return targetError(policy, false);
    }
    const selection = selectedRelativePath(policy, params.file.relativePath);
    if (typeof selection !== "string") {
        return selection;
    }
    const { rootPath } = policy;
    const requestedAbsolutePath = joinAbs(rootPath, selection);
    const realRoot = resolveRealPathTarget(deps, rootPath, selection, "realPath failed for selected source root");
    if (typeof realRoot !== "string") {
        return realRoot;
    }
    const realFile = resolveRealPathTarget(deps, requestedAbsolutePath, selection, "realPath failed for selected file");
    if (typeof realFile !== "string") {
        return realFile;
    }
    const violation = containedRealFileTarget(realRoot, realFile, selection);
    if (violation !== undefined) {
        return violation;
    }
    // Normalise to forward slashes so subsequent IO calls (readFileBytes, stat) receive
    // a consistent path even when realPath returned a Windows backslash path.
    return {
        absolutePath: normaliseSep(realFile),
        requestedAbsolutePath,
        relativePath: selection,
    };
}
|
|
408
|
+
/**
 * Confirms that the originally requested path can still be stat-ed.
 *
 * @param {object} deps Dependencies; only `deps.fs.stat` is used.
 * @param {object} params Request; `file.relativePath` is reported on failure.
 * @param {object} target Resolved target; reads `requestedAbsolutePath`.
 * @returns {object|undefined} A STAT_FAILED error when `stat` throws, else undefined.
 */
function validateRequestedTarget(deps, params, target) {
    try {
        deps.fs.stat(target.requestedAbsolutePath);
    }
    catch {
        return {
            code: "STAT_FAILED",
            message: "stat failed for selected file",
            relativePath: params.file.relativePath,
        };
    }
    return undefined;
}
|
|
421
|
+
/**
 * Checks that the resolved path is a regular file that is eligible for
 * extraction.
 *
 * A target is rejected when it is not a file, or when `stat` reports a
 * `hardLinkCount` greater than 1. A missing `hardLinkCount` is accepted.
 *
 * @param {object} deps Dependencies; only `deps.fs.stat` is used.
 * @param {object} params Request; `file.relativePath` is reported on failure.
 * @param {object} target Resolved target; reads `absolutePath`.
 * @returns {object|undefined} READ_FAILED / STAT_FAILED error, or undefined when valid.
 */
function validateResolvedTarget(deps, params, target) {
    const failure = (code, message) => ({
        code,
        message,
        relativePath: params.file.relativePath,
    });
    try {
        const info = deps.fs.stat(target.absolutePath);
        if (!info.isFile) {
            return failure("READ_FAILED", "selected path is not a file");
        }
        const links = info.hardLinkCount;
        if (links === undefined || links <= 1) {
            return undefined;
        }
        return failure("READ_FAILED", "selected file is not eligible for extraction");
    }
    catch {
        return failure("STAT_FAILED", "stat failed for selected file");
    }
}
|
|
447
|
+
/**
 * Reads up to `maxBytes` bytes of the resolved target.
 *
 * Both the requested and the resolved path are re-validated immediately
 * before the read; the first validation error is returned unchanged.
 *
 * @param {object} deps Dependencies; uses `deps.fs.readFileBytes` and `deps.fs.stat`.
 * @param {object} params Request; `file.relativePath` is reported on failure.
 * @param {object} target Resolved target (`absolutePath`, `requestedAbsolutePath`).
 * @param {number} maxBytes Limit forwarded to `readFileBytes`.
 * @returns {Promise<*>} Whatever `readFileBytes` resolves to, or an error
 *   descriptor (`READ_FAILED` / `STAT_FAILED`).
 */
async function readBytes(deps, params, target, maxBytes) {
    const readFailed = (message) => ({
        code: "READ_FAILED",
        message,
        relativePath: params.file.relativePath,
    });
    // Kept as a detached reference: the reader is invoked without a receiver.
    const reader = deps.fs.readFileBytes;
    if (reader === undefined) {
        return readFailed("WorkspaceFs.readFileBytes is unavailable");
    }
    for (const validate of [validateRequestedTarget, validateResolvedTarget]) {
        const problem = validate(deps, params, target);
        if (problem !== undefined) {
            return problem;
        }
    }
    try {
        return await reader(target.absolutePath, maxBytes);
    }
    catch {
        return readFailed("readFileBytes failed for selected file");
    }
}
|
|
473
|
+
// ─── Incremental fast-path ───────────────────────────────────────────────────
|
|
474
|
+
/**
 * Incremental fast path: when a stored document row already has the same
 * content hash and is in a terminal-good state, report it as skipped instead
 * of extracting again.
 *
 * @param {object} deps Dependencies; reads `deps.store._internal.db`.
 * @param {object} params Request (`capsuleId`, `source.id`, `file.relativePath`).
 * @param {*} documentId Id of the document row to look up.
 * @param {string} contentHash Hash of the current file content.
 * @returns {object|undefined} A result whose outcome is
 *   `{ kind: "skipped", reason: "unchanged" }`, or undefined when extraction
 *   must run.
 */
function readUnchangedFastPath(deps, params, documentId, contentHash) {
    const row = readExistingDocumentRow(deps.store._internal.db, params.capsuleId, documentId);
    if (row === undefined || row.content_hash !== contentHash) {
        return undefined;
    }
    // Skip only terminal-good states. A `pending` row is an interrupted progressive extraction
    // (Issue #1286) and `failed` should be retried; both must re-extract rather than be skipped.
    const isTerminalGood = row.status === "extracted" || row.status === "unsupported";
    if (!isTerminalGood) {
        return undefined;
    }
    const { capsuleId } = params;
    const sourceId = params.source.id;
    const document = {
        id: documentId,
        capsuleId,
        sourceId,
        documentPath: row.document_path,
        sizeBytes: row.size_bytes,
        mediaType: row.media_type,
        contentHash: row.content_hash,
        parser: {
            parserId: row.parser_id,
            parserVersion: row.parser_version,
        },
        lastExtractedAt: row.last_extracted_at,
        status: row.status,
        safeDisplayName: row.safe_display_name,
    };
    return {
        capsuleId,
        sourceId,
        relativePath: params.file.relativePath,
        outcome: { kind: "skipped", document, reason: "unchanged" },
        diagnostics: [],
    };
}
|
|
508
|
+
// ─── Large-document routing (Epic #1160, Issue #1286) ────────────────────────
|
|
509
|
+
// Lazily created list of built-in progressive extractors; filled on first use.
let defaultProgressiveExtractorsCache;
/**
 * Returns the shared default progressive extractor list, creating it (a
 * single PDF extractor) on the first call and reusing it afterwards.
 *
 * @returns {Array} The cached extractor list.
 */
function defaultProgressiveExtractors() {
    if (defaultProgressiveExtractorsCache == null) {
        defaultProgressiveExtractorsCache = [createProgressivePdfExtractor()];
    }
    return defaultProgressiveExtractorsCache;
}
|
|
514
|
+
/**
 * Assembles the context object used by the large-document (progressive)
 * extraction path, falling back to module defaults for every setting that
 * `deps` leaves nullish.
 *
 * @param {object} deps Optional overrides: `largeDocumentPolicy`,
 *   `extractionCapabilities`, `progressiveExtractors`, `largeDocumentJobId`,
 *   `chunkingStrategyVersion`.
 * @param {object} resolved Resolved target (`absolutePath`, `relativePath`).
 * @param {object} options Parser options; `signal` is copied only when defined.
 * @returns {object} The extraction context.
 */
function largeDocumentContextFor(deps, resolved, options) {
    const context = {
        policy: deps.largeDocumentPolicy ?? DEFAULT_LARGE_DOCUMENT_RESOURCE_POLICY,
        capabilities: deps.extractionCapabilities ?? DEFAULT_EXTRACTION_CAPABILITY_AVAILABILITY,
        extractors: deps.progressiveExtractors ?? defaultProgressiveExtractors(),
        jobId: deps.largeDocumentJobId ?? "extract",
        chunkingStrategyVersion: deps.chunkingStrategyVersion ?? DEFAULT_CHUNKING_STRATEGY_KEY,
        absolutePath: resolved.absolutePath,
        relativePath: resolved.relativePath,
    };
    // The key is omitted entirely (not set to undefined) when there is no signal.
    if (options.signal !== undefined) {
        context.signal = options.signal;
    }
    return context;
}
|
|
526
|
+
// Largest window, in bytes (4 MiB), requested per range read while hashing a progressive source.
const PROGRESSIVE_HASH_CHUNK_BYTES = 4 * 1024 * 1024;
|
|
527
|
+
/**
 * Reads `length` bytes starting at `startByte` from the resolved target.
 *
 * Both the requested and the resolved path are re-validated before every
 * read; the first validation error is returned unchanged.
 *
 * @param {object} deps Dependencies; uses `deps.fs.readFileRange` and `deps.fs.stat`.
 * @param {object} params Request; `file.relativePath` is reported on failure.
 * @param {object} target Resolved target (`absolutePath`, `requestedAbsolutePath`).
 * @param {number} startByte Offset forwarded to `readFileRange`.
 * @param {number} length Byte count forwarded to `readFileRange`.
 * @returns {Promise<*>} Whatever `readFileRange` resolves to, or an error
 *   descriptor (`READ_FAILED` / `STAT_FAILED`).
 */
async function readRange(deps, params, target, startByte, length) {
    const readFailed = (message) => ({
        code: "READ_FAILED",
        message,
        relativePath: params.file.relativePath,
    });
    // Kept as a detached reference: the reader is invoked without a receiver.
    const reader = deps.fs.readFileRange;
    if (reader === undefined) {
        return readFailed("WorkspaceFs.readFileRange is unavailable");
    }
    for (const validate of [validateRequestedTarget, validateResolvedTarget]) {
        const problem = validate(deps, params, target);
        if (problem !== undefined) {
            return problem;
        }
    }
    try {
        return await reader(target.absolutePath, startByte, length);
    }
    catch {
        return readFailed("readFileRange failed for selected file");
    }
}
|
|
553
|
+
/**
 * Builds a range-readable source for progressive extraction.
 *
 * @param {object} deps Dependencies; requires `deps.fs.readFileRange`.
 * @param {object} params Request; reads `file.sizeBytes` and `file.relativePath`.
 * @param {object} target Resolved target forwarded to `readRange`.
 * @returns {object} `{ totalBytes, readWindow }`, or a READ_FAILED error
 *   descriptor when range reads are unavailable. `readWindow` resolves to a
 *   Uint8Array and throws an Error carrying the failure message otherwise.
 */
function progressiveRangeSource(deps, params, target) {
    if (deps.fs.readFileRange === undefined) {
        return {
            code: "READ_FAILED",
            message: "WorkspaceFs.readFileRange is unavailable",
            relativePath: params.file.relativePath,
        };
    }
    async function readWindow(startByte, length) {
        const outcome = await readRange(deps, params, target, startByte, length);
        if (!(outcome instanceof Uint8Array)) {
            // readRange returned an error descriptor; surface it as an exception.
            throw new Error(outcome.message);
        }
        return outcome;
    }
    return { totalBytes: params.file.sizeBytes, readWindow };
}
|
|
571
|
+
/**
 * Computes the SHA-256 hex digest of a progressive source by reading it in
 * bounded windows of at most PROGRESSIVE_HASH_CHUNK_BYTES bytes.
 *
 * The read offset advances by the number of bytes actually returned, so a
 * window that comes back shorter than requested does not cause the remaining
 * bytes of that window to be left out of the digest. An empty window ends
 * the loop early.
 *
 * @param {{ totalBytes: number, readWindow?: Function }} source Range-readable source.
 * @returns {Promise<string>} Lower-case hex SHA-256 digest of the bytes read.
 * @throws {Error} When the source has no `readWindow`, or when `readWindow` rejects.
 */
async function hashProgressiveSource(source) {
    if (source.readWindow === undefined) {
        throw new Error("progressive source does not support bounded range reads");
    }
    const hash = createHash("sha256");
    let offset = 0;
    while (offset < source.totalBytes) {
        const wanted = Math.min(PROGRESSIVE_HASH_CHUNK_BYTES, source.totalBytes - offset);
        const bytes = await source.readWindow(offset, wanted);
        if (bytes.byteLength === 0) {
            // Nothing more to read (e.g. the file shrank); stop instead of spinning.
            break;
        }
        hash.update(bytes);
        // Advance by what was delivered, not by what was requested (short reads).
        offset += bytes.byteLength;
    }
    return hash.digest("hex");
}
|
|
584
|
+
// Legacy binary office formats (.doc/.ppt/.xls) get the existing unsupported path plus a stable
// CONVERTER_UNAVAILABLE diagnostic with actionable guidance, leaving the job stable.
/**
 * Adds the legacy-format diagnostic, when one applies, to an extraction
 * result and stores it as a diagnostic row with id `<documentId>#legacy`.
 *
 * @param {object} deps Dependencies; uses `deps.store._internal.db` and `.now`.
 * @param {object} params Request; reads `capsuleId`.
 * @param {*} documentId Document the diagnostic belongs to.
 * @param {string} extension File extension passed to `legacyFormatDiagnostic`.
 * @param {object} result Extraction result; it is not mutated.
 * @returns {object} `result` itself when no diagnostic applies, otherwise a
 *   copy with the diagnostic appended to `diagnostics`.
 */
function appendLegacyDiagnostic(deps, params, documentId, extension, result) {
    const diagnostic = legacyFormatDiagnostic(extension, documentId);
    if (diagnostic === undefined) {
        return result;
    }
    const store = deps.store._internal;
    const row = {
        id: `${String(documentId)}#legacy`,
        capsuleId: params.capsuleId,
        diagnostic,
        createdAt: store.now(),
    };
    insertDiagnosticRow(store.db, row);
    const diagnostics = result.diagnostics.concat([diagnostic]);
    return { ...result, diagnostics };
}
|
|
598
|
+
// ─── Top-level entry point ───────────────────────────────────────────────────
|
|
599
|
+
/**
 * Builds the input record handed to the parser registry and to parser
 * adapters, deriving the extension and media type from the path.
 *
 * @param {*} documentId Document identifier.
 * @param {string} relativePath Path the extension is taken from.
 * @param {Uint8Array} bytes File content.
 * @returns {{ documentId: *, bytes: Uint8Array, extension: *, mediaType: * }}
 */
function selectionInput(documentId, relativePath, bytes) {
    const extension = extensionOf(relativePath);
    const mediaType = mediaTypeFor(extension);
    return { documentId, bytes, extension, mediaType };
}
|
|
608
|
+
/**
 * Tells whether a parser result represents unsupported content: either the
 * "unsupported" parser produced it, or it has at least one unit and every
 * unit is of kind "unsupported-media".
 *
 * @param {object} result Parser result (`parser.parserId`, `units`).
 * @returns {boolean}
 */
function isUnsupportedResult(result) {
    if (result.parser.parserId === "unsupported") {
        return true;
    }
    const { units } = result;
    return units.length > 0 && units.every(({ kind }) => kind === "unsupported-media");
}
|
|
612
|
+
// Diagnostic codes that mark a parser run as failed regardless of severity.
const FAILED_PARSER_DIAGNOSTIC_CODES = new Set([
    "OVERSIZED_FILE",
    "PARSER_TIMEOUT",
    "PARSER_CANCELLED",
    "MALFORMED_INPUT",
    "OBJECT_LIMIT_REACHED",
]);
/**
 * Finds the first diagnostic that counts as a parser failure: one with
 * severity "error", or one whose code is in FAILED_PARSER_DIAGNOSTIC_CODES.
 *
 * @param {object} result Parser result; reads `diagnostics`.
 * @returns {object|undefined} The first failing diagnostic, if any.
 */
function firstParserFailureDiagnostic(result) {
    for (const diagnostic of result.diagnostics) {
        if (diagnostic.severity === "error" || FAILED_PARSER_DIAGNOSTIC_CODES.has(diagnostic.code)) {
            return diagnostic;
        }
    }
    return undefined;
}
|
|
622
|
+
/**
 * Maps a parser result to a document status. "unsupported" takes precedence
 * over "failed", which takes precedence over "extracted".
 *
 * @param {object} result Parser result.
 * @returns {"unsupported"|"failed"|"extracted"}
 */
function statusForResult(result) {
    if (isUnsupportedResult(result)) {
        return "unsupported";
    }
    const failed = firstParserFailureDiagnostic(result) !== undefined;
    return failed ? "failed" : "extracted";
}
|
|
629
|
+
/**
 * Translates a parser diagnostic code into the error code reported on the
 * extraction outcome. Codes without a dedicated mapping become "PARSER_FAILED".
 *
 * @param {{ code: string }} diagnostic Parser diagnostic.
 * @returns {string} Outcome error code.
 */
function discoveryErrorCodeForParserDiagnostic(diagnostic) {
    switch (diagnostic.code) {
        case "OVERSIZED_FILE":
            return "OVERSIZED_FILE";
        case "PARSER_CANCELLED":
            // The only code that is renamed rather than passed through.
            return "CANCELLED";
        case "MALFORMED_INPUT":
            return "MALFORMED_INPUT";
        case "PARSER_TIMEOUT":
            return "PARSER_TIMEOUT";
        default:
            return "PARSER_FAILED";
    }
}
|
|
640
|
+
/**
 * Builds the "failed" outcome for a document whose parser reported a failing
 * diagnostic.
 *
 * @param {object} document Document record to attach.
 * @param {{ code: string, message: string }} diagnostic Failing diagnostic.
 * @param {string} relativePath Path recorded on the error.
 * @returns {{ kind: "failed", document: object, error: object }}
 */
function parserFailureOutcome(document, diagnostic, relativePath) {
    const error = {
        code: discoveryErrorCodeForParserDiagnostic(diagnostic),
        message: diagnostic.message,
        relativePath,
    };
    return { kind: "failed", document, error };
}
|
|
651
|
+
// Parser ids for which normalizedTextForPersistence falls back to the UTF-8-decoded source bytes
// when the parser result carries no normalizedText of its own.
const SOURCE_TEXT_PARSER_IDS = new Set(["text", "json", "csv", "html"]);
|
|
652
|
+
/**
 * Decodes bytes as UTF-8 in non-fatal mode and removes a U+FEFF character
 * left at the start of the decoded text.
 *
 * @param {Uint8Array} bytes Raw file content.
 * @returns {string} Decoded text without a leading U+FEFF.
 */
function decodeUtf8ForStorage(bytes) {
    const decoder = new TextDecoder("utf-8", { fatal: false });
    const text = decoder.decode(bytes);
    return text.startsWith("\uFEFF") ? text.slice(1) : text;
}
|
|
656
|
+
/**
 * Chooses the normalized text to store for a parser result.
 *
 * A `normalizedText` supplied by the parser always wins. Otherwise the source
 * bytes are decoded as UTF-8, but only for parsers listed in
 * SOURCE_TEXT_PARSER_IDS and only when the result has at least one unit.
 *
 * @param {object} parserResult Parser result (`normalizedText`, `parser.parserId`, `units`).
 * @param {Uint8Array} bytes Raw file content.
 * @returns {string|undefined} Text to persist, or undefined when there is none.
 */
function normalizedTextForPersistence(parserResult, bytes) {
    const provided = parserResult.normalizedText;
    if (provided !== undefined) {
        return provided;
    }
    const canDecodeSource = SOURCE_TEXT_PARSER_IDS.has(parserResult.parser.parserId) &&
        parserResult.units.length !== 0;
    return canDecodeSource ? decodeUtf8ForStorage(bytes) : undefined;
}
|
|
668
|
+
/**
 * Returns the parser result with the text chosen for persistence attached as
 * `normalizedText`. The input object is returned untouched when there is no
 * text to attach.
 *
 * @param {object} parserResult Parser result.
 * @param {Uint8Array} bytes Raw file content.
 * @returns {object} `parserResult` itself, or a shallow copy with `normalizedText` set.
 */
function withPersistedNormalizedText(parserResult, bytes) {
    const normalizedText = normalizedTextForPersistence(parserResult, bytes);
    if (normalizedText === undefined) {
        return parserResult;
    }
    return { ...parserResult, normalizedText };
}
|
|
672
|
+
/**
 * Tells whether a parser adapter exposes a `parseAsync` function.
 *
 * @param {object} adapter Parser adapter.
 * @returns {boolean}
 */
function hasAsyncParse(adapter) {
    const candidate = adapter.parseAsync;
    return typeof candidate === "function";
}
|
|
675
|
+
/**
 * Resolves a parser adapter for the document and runs it.
 *
 * The registry's adapter is used when the resolution kind is "matched";
 * otherwise `unsupportedParser` is used. `parseAsync` is preferred over
 * `parse` when the adapter provides it.
 *
 * @param {object} deps Dependencies; uses `deps.parserRegistry.resolve`.
 * @param {*} documentId Document identifier.
 * @param {object} params Request; reads `file.relativePath`.
 * @param {Uint8Array} bytes File content.
 * @param {object} options Parser options forwarded to the adapter.
 * @returns {Promise<object>} The adapter's parser result.
 */
async function runParser(deps, documentId, params, bytes, options) {
    const input = selectionInput(documentId, params.file.relativePath, bytes);
    const resolution = deps.parserRegistry.resolve(input);
    const adapter = resolution.kind === "matched" ? resolution.adapter : unsupportedParser;
    return hasAsyncParse(adapter)
        ? adapter.parseAsync(input, options)
        : adapter.parse(input, options);
}
|
|
684
|
+
/**
 * Runs the parser and attaches the normalized text that should be stored
 * with the document.
 *
 * @param {object} deps Dependencies forwarded to `runParser`.
 * @param {*} documentId Document identifier.
 * @param {object} params Request.
 * @param {Uint8Array} bytes File content.
 * @param {object} options Parser options.
 * @returns {Promise<object>} Parser result, possibly with `normalizedText` added.
 */
async function runParserForPersistence(deps, documentId, params, bytes, options) {
    const parsed = await runParser(deps, documentId, params, bytes, options);
    const persistable = withPersistedNormalizedText(parsed, bytes);
    return persistable;
}
|
|
688
|
+
/**
 * Stores a document and its dependent rows, using the store's `now` function
 * as the time source.
 *
 * @param {object} deps Dependencies; reads `deps.store._internal.now`.
 * @param {object} params Request.
 * @param {*} documentId Document identifier.
 * @param {object} document Document record to store.
 * @param {object} parserResult Parser result whose rows are stored with it.
 * @returns {void}
 */
function persistExtractedDocument(deps, params, documentId, document, parserResult) {
    const { now } = deps.store._internal;
    persistDocumentAndDependents(deps, params, documentId, document, parserResult, now);
}
|
|
691
|
+
/**
 * Reads a document while enforcing `options.maxBytes`.
 *
 * One byte more than the limit is requested so that a file larger than the
 * limit can be detected from the length of what was read.
 *
 * @param {object} deps Dependencies forwarded to the helpers.
 * @param {object} params Request.
 * @param {*} documentId Document identifier.
 * @param {object} target Resolved target.
 * @param {{ maxBytes: number }} options Parser options.
 * @returns {Promise<Uint8Array|object>} The bytes, or a failure result (read
 *   failure or oversized file).
 */
async function readBoundedDocumentBytes(deps, params, documentId, target, options) {
    const { maxBytes } = options;
    const outcome = await readBytes(deps, params, target, maxBytes + 1);
    if (!(outcome instanceof Uint8Array)) {
        return buildFailureResult(deps, params, documentId, outcome);
    }
    return outcome.byteLength > options.maxBytes
        ? buildOversizedFailure(deps, params, documentId, options, outcome.byteLength)
        : outcome;
}
|
|
701
|
+
/**
 * Builds the extraction result returned after a parser run.
 *
 * The outcome is "failed" only when `status` is "failed" and a failing
 * diagnostic is present; in every other case it is "persisted".
 *
 * @param {object} params Request (`capsuleId`, `source.id`, `file.relativePath`).
 * @param {object} document Stored document record.
 * @param {object} parserResult Parser result; its diagnostics are passed through.
 * @param {string} status Document status computed for the result.
 * @returns {object} Extraction result.
 */
function parserExtractionResult(params, document, parserResult, status) {
    const failureDiagnostic = firstParserFailureDiagnostic(parserResult);
    const { relativePath } = params.file;
    let outcome;
    if (status === "failed" && failureDiagnostic !== undefined) {
        outcome = parserFailureOutcome(document, failureDiagnostic, relativePath);
    }
    else {
        outcome = { kind: "persisted", document };
    }
    return {
        capsuleId: params.capsuleId,
        sourceId: params.source.id,
        relativePath,
        outcome,
        diagnostics: parserResult.diagnostics,
    };
}
|
|
713
|
+
/**
 * Returns params whose `file.relativePath` equals `relativePath`. The same
 * object is returned when nothing changes; otherwise `params` and
 * `params.file` are shallow-copied so the caller's objects stay untouched.
 *
 * @param {object} params Request.
 * @param {string} relativePath Path to set.
 * @returns {object} Original or copied params.
 */
function paramsWithRelativePath(params, relativePath) {
    const alreadyCanonical = params.file.relativePath === relativePath;
    if (alreadyCanonical) {
        return params;
    }
    const file = { ...params.file, relativePath };
    return { ...params, file };
}
|
|
718
|
+
/**
 * Derives the document id for the file described by `params` from its
 * capsule id, source id and relative path.
 *
 * @param {object} params Request (`capsuleId`, `source.id`, `file.relativePath`).
 * @returns {*} The id produced by `documentIdFor`.
 */
function extractionDocumentId(params) {
    const { capsuleId, source, file } = params;
    return documentIdFor({
        capsuleId,
        sourceId: source.id,
        relativePath: file.relativePath,
    });
}
|
|
725
|
+
/**
 * Converts a target-resolution error into a failure result.
 *
 * When the error carries its own `relativePath`, that path is used both for
 * the reported params and for the document id. Whether the failure is
 * persisted is taken from `resolved.persistFailure`.
 *
 * @param {object} deps Dependencies forwarded to `buildFailureResult`.
 * @param {object} params Original request.
 * @param {{ error: object, persistFailure: boolean }} resolved Target error.
 * @returns {object} Failure result.
 */
function targetResolutionFailure(deps, params, resolved) {
    const { error, persistFailure } = resolved;
    let failureParams = params;
    if (error.relativePath !== undefined) {
        failureParams = paramsWithRelativePath(params, error.relativePath);
    }
    const documentId = extractionDocumentId(failureParams);
    return buildFailureResult(deps, failureParams, documentId, error, { persist: persistFailure });
}
|
|
733
|
+
/**
 * Parses a document, redacts the parser result, stores the document with its
 * dependent rows and returns the extraction result.
 *
 * An exception thrown by the parser is reported as a PARSER_FAILED failure
 * result instead of propagating.
 *
 * @param {object} deps Dependencies forwarded to the helpers.
 * @param {object} params Request; reads `source` and `file.relativePath`.
 * @param {*} documentId Document identifier.
 * @param {Uint8Array} bytes File content.
 * @param {string} contentHash Hash of `bytes`.
 * @param {object} options Parser options.
 * @returns {Promise<object>} Extraction result.
 */
async function parseAndPersistDocument(deps, params, documentId, bytes, contentHash, options) {
    let rawResult;
    try {
        rawResult = await runParserForPersistence(deps, documentId, params, bytes, options);
    }
    catch {
        const parserError = {
            code: "PARSER_FAILED",
            message: "parser adapter failed while extracting document",
            relativePath: params.file.relativePath,
        };
        return buildFailureResult(deps, params, documentId, parserError);
    }
    // Status and the stored record are both derived from the redacted result.
    const safeResult = redactParserResult(rawResult, params.source);
    const status = statusForResult(safeResult);
    const mediaType = mediaTypeFor(extensionOf(params.file.relativePath));
    const document = buildDocumentRecord({
        documentId,
        params,
        mediaType,
        contentHash,
        parserResult: safeResult,
        status,
    });
    persistExtractedDocument(deps, params, documentId, document, safeResult);
    return parserExtractionResult(params, document, safeResult, status);
}
|
|
758
|
+
/**
 * Attempts the large-document (progressive) extraction path.
 *
 * @param {object} deps Dependencies forwarded to the helpers.
 * @param {object} params Request; reads `file.sizeBytes` and `file.relativePath`.
 * @param {object} resolved Resolved target.
 * @param {*} documentId Document identifier.
 * @param {object} options Parser options.
 * @param {string} extension File extension.
 * @param {string} mediaType Media type derived from the extension.
 * @returns {Promise<object|undefined>} An extraction result (oversized
 *   failure, read failure, unchanged skip or progressive extraction), or
 *   undefined when the standard path should handle the file.
 */
async function progressiveExtractionResult(deps, params, resolved, documentId, options, extension, mediaType) {
    const context = largeDocumentContextFor(deps, resolved, options);
    const { policy } = context;
    const preflight = classifyLargeDocument({ extension, mediaType, sizeBytes: params.file.sizeBytes }, policy);
    if (preflight.decision === "reject-oversized") {
        const oversizedOptions = { ...options, maxBytes: policy.maxRawFileBytes };
        return buildOversizedFailure(deps, params, documentId, oversizedOptions);
    }
    // An extractor is only looked up for files at or above the large-file threshold.
    let extractor;
    if (params.file.sizeBytes >= policy.largeFileThresholdBytes) {
        extractor = selectProgressiveExtractor(context, extension, mediaType);
    }
    const preflightIsProgressive = usesProgressivePath(preflight);
    // NOTE(review): the second guard subsumes the first, so a missing extractor always
    // defers to the standard path whatever the preflight says — confirm this is intended.
    if (!preflightIsProgressive && extractor === undefined) {
        return undefined;
    }
    if (extractor === undefined) {
        return undefined;
    }
    const source = progressiveRangeSource(deps, params, resolved);
    if (!("totalBytes" in source)) {
        // progressiveRangeSource returned an error descriptor.
        return buildFailureResult(deps, params, documentId, source);
    }
    let contentHash;
    try {
        contentHash = await hashProgressiveSource(source);
    }
    catch {
        const readError = {
            code: "READ_FAILED",
            message: "readFileRange failed for selected file",
            relativePath: params.file.relativePath,
        };
        return buildFailureResult(deps, params, documentId, readError);
    }
    const unchanged = readUnchangedFastPath(deps, params, documentId, contentHash);
    return (unchanged ??
        (await extractDocumentProgressive(deps, params, context, source, contentHash, extractor)));
}
|
|
792
|
+
/**
 * Standard (whole-file) extraction path.
 *
 * The declared size and then the bytes actually read are checked against
 * `options.maxBytes`. Unchanged documents are skipped via the content-hash
 * fast path. For legacy binary office formats the legacy diagnostic is
 * appended to the result of a fresh parse.
 *
 * @param {object} deps Dependencies forwarded to the helpers.
 * @param {object} params Request; reads `file.sizeBytes`.
 * @param {object} resolved Resolved target.
 * @param {*} documentId Document identifier.
 * @param {{ maxBytes: number }} options Parser options.
 * @param {string} extension File extension.
 * @returns {Promise<object>} Extraction result.
 */
async function standardExtractionResult(deps, params, resolved, documentId, options, extension) {
    const declaredTooLarge = params.file.sizeBytes > options.maxBytes;
    if (declaredTooLarge) {
        return buildOversizedFailure(deps, params, documentId, options);
    }
    const bytes = await readBoundedDocumentBytes(deps, params, documentId, resolved, options);
    if (!(bytes instanceof Uint8Array)) {
        // Already a failure result; hand it back unchanged.
        return bytes;
    }
    const contentHash = hashBytes(bytes);
    const unchanged = readUnchangedFastPath(deps, params, documentId, contentHash);
    if (unchanged !== undefined) {
        return unchanged;
    }
    const parsed = await parseAndPersistDocument(deps, params, documentId, bytes, contentHash, options);
    if (isLegacyBinaryOfficeFormat(extension)) {
        return appendLegacyDiagnostic(deps, params, documentId, extension, parsed);
    }
    return parsed;
}
|
|
808
|
+
/**
 * Extracts a single document.
 *
 * The target path is resolved and validated first. Extraction then runs with
 * the canonical relative path, trying the progressive path before falling
 * back to the standard one.
 *
 * @param {object} deps Extraction dependencies.
 * @param {object} params Request (`capsuleId`, `source`, `file`, optional `parserOptions`).
 * @returns {Promise<object>} Extraction result.
 */
export async function extractDocument(deps, params) {
    const resolved = resolveTargetPath(deps, params);
    if ("error" in resolved) {
        return targetResolutionFailure(deps, params, resolved);
    }
    // From here on everything is keyed on the canonical relative path.
    const canonicalParams = paramsWithRelativePath(params, resolved.relativePath);
    const documentId = extractionDocumentId(canonicalParams);
    const options = canonicalParams.parserOptions ?? buildParserOptions();
    const extension = extensionOf(canonicalParams.file.relativePath);
    const mediaType = mediaTypeFor(extension);
    const progressive = await progressiveExtractionResult(deps, canonicalParams, resolved, documentId, options, extension, mediaType);
    if (progressive === undefined) {
        return standardExtractionResult(deps, canonicalParams, resolved, documentId, options, extension);
    }
    return progressive;
}
|
|
823
|
+
/**
 * Builds a failure result for a file, using `params.error` as the error and
 * a document id derived from the capsule id, source id and relative path.
 *
 * @param {object} deps Dependencies forwarded to `buildFailureResult`.
 * @param {object} params Request (`capsuleId`, `source.id`, `file.relativePath`, `error`).
 * @returns {object} Failure result.
 */
export function recordExtractionFailure(deps, params) {
    const { capsuleId, source, file, error } = params;
    const documentId = documentIdFor({
        capsuleId,
        sourceId: source.id,
        relativePath: file.relativePath,
    });
    return buildFailureResult(deps, params, documentId, error);
}
|
|
831
|
+
/**
 * Builds the document record stored for a file rejected as oversized.
 *
 * The record has status "failed", an empty content hash and the placeholder
 * parser `none`/`0`. Its size is the declared size, or the larger of the
 * declared and observed sizes when an observed size is given.
 *
 * @param {object} params Request (`capsuleId`, `source.id`, `file`).
 * @param {*} documentId Document identifier.
 * @param {*} lastExtractedAt Timestamp recorded on the document.
 * @param {number} [observedSizeBytes] Number of bytes actually read, if known.
 * @returns {object} Document record.
 */
function oversizedDocumentRecord(params, documentId, lastExtractedAt, observedSizeBytes) {
    const { file } = params;
    let sizeBytes = file.sizeBytes;
    if (observedSizeBytes !== undefined) {
        sizeBytes = Math.max(file.sizeBytes, observedSizeBytes);
    }
    const mediaType = mediaTypeFor(extensionOf(file.relativePath));
    const safeDisplayName = safeDisplay(file.relativePath);
    return {
        id: documentId,
        capsuleId: params.capsuleId,
        sourceId: params.source.id,
        documentPath: file.relativePath,
        sizeBytes,
        mediaType,
        contentHash: "",
        parser: { parserId: "none", parserVersion: "0" },
        lastExtractedAt,
        status: "failed",
        safeDisplayName,
    };
}
|
|
849
|
+
/**
 * Stores the document row and the single diagnostic row for an oversized
 * file inside one BEGIN/COMMIT transaction, removing the document's existing
 * dependent rows in the same transaction.
 *
 * On any failure the transaction is rolled back and the original error is
 * rethrown. A failure of the ROLLBACK statement itself is deliberately not
 * allowed to replace that original error.
 *
 * @param {object} deps Dependencies; uses `deps.store._internal.db`.
 * @param {object} params Request (`capsuleId`, `source.id`).
 * @param {*} documentId Document identifier; the diagnostic row id is `<documentId>#d0`.
 * @param {object} document Document record to store.
 * @param {object} diagnostic Diagnostic to store.
 * @param {Function} now Time source for the diagnostic row's `createdAt`.
 * @returns {void}
 * @throws Rethrows whatever the insert/delete/commit step threw.
 */
function persistOversizedRow(deps, params, documentId, document, diagnostic, now) {
    const db = deps.store._internal.db;
    db.exec("BEGIN");
    try {
        insertDocumentRow(db, {
            id: documentId,
            capsuleId: params.capsuleId,
            sourceId: String(params.source.id),
            documentPath: document.documentPath,
            sizeBytes: document.sizeBytes,
            mediaType: document.mediaType,
            contentHash: document.contentHash,
            parserId: document.parser.parserId,
            parserVersion: document.parser.parserVersion,
            lastExtractedAt: document.lastExtractedAt,
            status: document.status,
            safeDisplayName: document.safeDisplayName,
        });
        deleteDependentRows(db, params.capsuleId, documentId);
        insertDiagnosticRow(db, {
            id: `${String(documentId)}#d0`,
            capsuleId: params.capsuleId,
            diagnostic,
            createdAt: now(),
        });
        db.exec("COMMIT");
    }
    catch (cause) {
        try {
            db.exec("ROLLBACK");
        }
        catch {
            // The rollback can fail too (for instance when the database engine has already
            // ended the transaction). The original failure is the actionable one, so it is
            // rethrown below rather than being masked by the rollback error.
        }
        throw cause;
    }
}
|
|
881
|
+
/**
 * Records a file as failed because it exceeds `options.maxBytes` and returns
 * the matching extraction result.
 *
 * The reported size is the declared size, or the larger of the declared and
 * observed sizes when an observed size is given. The message is passed
 * through `redactMessage` before it is stored or returned.
 *
 * @param {object} deps Dependencies; uses `deps.store._internal.now`.
 * @param {object} params Request (`capsuleId`, `source`, `file`).
 * @param {*} documentId Document identifier.
 * @param {{ maxBytes: number }} options Limit quoted in the message.
 * @param {number} [observedSizeBytes] Number of bytes actually read, if known.
 * @returns {object} Extraction result with a "failed" OVERSIZED_FILE outcome.
 */
function buildOversizedFailure(deps, params, documentId, options, observedSizeBytes) {
    const { now } = deps.store._internal;
    const declaredSize = params.file.sizeBytes;
    const reportedSize = observedSizeBytes === undefined
        ? declaredSize
        : Math.max(declaredSize, observedSizeBytes);
    const rawMessage = `file size ${String(reportedSize)} exceeds maxBytes=${String(options.maxBytes)}`;
    const message = redactMessage(rawMessage, params.source);
    const diagnostic = { severity: "error", code: "OVERSIZED_FILE", message, documentId };
    const document = oversizedDocumentRecord(params, documentId, now(), observedSizeBytes);
    persistOversizedRow(deps, params, documentId, document, diagnostic, now);
    const outcome = {
        kind: "failed",
        document,
        error: { code: "OVERSIZED_FILE", message },
    };
    return {
        capsuleId: params.capsuleId,
        sourceId: params.source.id,
        relativePath: params.file.relativePath,
        outcome,
        diagnostics: [diagnostic],
    };
}
|