@oscharko-dev/keiko-local-knowledge 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (290) hide show
  1. package/dist/.tsbuildinfo +1 -0
  2. package/dist/bounded-document-extraction.d.ts +27 -0
  3. package/dist/bounded-document-extraction.d.ts.map +1 -0
  4. package/dist/bounded-document-extraction.js +214 -0
  5. package/dist/capsule-lifecycle.d.ts +33 -0
  6. package/dist/capsule-lifecycle.d.ts.map +1 -0
  7. package/dist/capsule-lifecycle.js +292 -0
  8. package/dist/capsule-set-lifecycle.d.ts +15 -0
  9. package/dist/capsule-set-lifecycle.d.ts.map +1 -0
  10. package/dist/capsule-set-lifecycle.js +158 -0
  11. package/dist/chunking/chunker-persist.d.ts +36 -0
  12. package/dist/chunking/chunker-persist.d.ts.map +1 -0
  13. package/dist/chunking/chunker-persist.js +74 -0
  14. package/dist/chunking/chunker-runner.d.ts +9 -0
  15. package/dist/chunking/chunker-runner.d.ts.map +1 -0
  16. package/dist/chunking/chunker-runner.js +218 -0
  17. package/dist/chunking/chunker.d.ts +7 -0
  18. package/dist/chunking/chunker.d.ts.map +1 -0
  19. package/dist/chunking/chunker.js +139 -0
  20. package/dist/chunking/citation-mapper.d.ts +4 -0
  21. package/dist/chunking/citation-mapper.d.ts.map +1 -0
  22. package/dist/chunking/citation-mapper.js +180 -0
  23. package/dist/chunking/index.d.ts +6 -0
  24. package/dist/chunking/index.d.ts.map +1 -0
  25. package/dist/chunking/index.js +8 -0
  26. package/dist/chunking/token-estimator.d.ts +3 -0
  27. package/dist/chunking/token-estimator.d.ts.map +1 -0
  28. package/dist/chunking/token-estimator.js +26 -0
  29. package/dist/chunking/types.d.ts +49 -0
  30. package/dist/chunking/types.d.ts.map +1 -0
  31. package/dist/chunking/types.js +26 -0
  32. package/dist/composition.d.ts +57 -0
  33. package/dist/composition.d.ts.map +1 -0
  34. package/dist/composition.js +310 -0
  35. package/dist/conversation/citation-attacher.d.ts +8 -0
  36. package/dist/conversation/citation-attacher.d.ts.map +1 -0
  37. package/dist/conversation/citation-attacher.js +55 -0
  38. package/dist/conversation/citation-excerpts.d.ts +4 -0
  39. package/dist/conversation/citation-excerpts.d.ts.map +1 -0
  40. package/dist/conversation/citation-excerpts.js +41 -0
  41. package/dist/conversation/grounded-answer-runner.d.ts +9 -0
  42. package/dist/conversation/grounded-answer-runner.d.ts.map +1 -0
  43. package/dist/conversation/grounded-answer-runner.js +61 -0
  44. package/dist/conversation/index.d.ts +5 -0
  45. package/dist/conversation/index.d.ts.map +1 -0
  46. package/dist/conversation/index.js +7 -0
  47. package/dist/conversation/model-gateway-answer-generator.d.ts +28 -0
  48. package/dist/conversation/model-gateway-answer-generator.d.ts.map +1 -0
  49. package/dist/conversation/model-gateway-answer-generator.js +105 -0
  50. package/dist/conversation/types.d.ts +35 -0
  51. package/dist/conversation/types.d.ts.map +1 -0
  52. package/dist/conversation/types.js +24 -0
  53. package/dist/discovery/discovery-runner.d.ts +23 -0
  54. package/dist/discovery/discovery-runner.d.ts.map +1 -0
  55. package/dist/discovery/discovery-runner.js +109 -0
  56. package/dist/discovery/extract-progressive.d.ts +17 -0
  57. package/dist/discovery/extract-progressive.d.ts.map +1 -0
  58. package/dist/discovery/extract-progressive.js +522 -0
  59. package/dist/discovery/extract.d.ts +26 -0
  60. package/dist/discovery/extract.d.ts.map +1 -0
  61. package/dist/discovery/extract.js +906 -0
  62. package/dist/discovery/glob.d.ts +10 -0
  63. package/dist/discovery/glob.d.ts.map +1 -0
  64. package/dist/discovery/glob.js +72 -0
  65. package/dist/discovery/index.d.ts +6 -0
  66. package/dist/discovery/index.d.ts.map +1 -0
  67. package/dist/discovery/index.js +8 -0
  68. package/dist/discovery/media-type.d.ts +4 -0
  69. package/dist/discovery/media-type.d.ts.map +1 -0
  70. package/dist/discovery/media-type.js +62 -0
  71. package/dist/discovery/persist.d.ts +63 -0
  72. package/dist/discovery/persist.d.ts.map +1 -0
  73. package/dist/discovery/persist.js +345 -0
  74. package/dist/discovery/test-support.d.ts +16 -0
  75. package/dist/discovery/test-support.d.ts.map +1 -0
  76. package/dist/discovery/test-support.js +127 -0
  77. package/dist/discovery/types.d.ts +63 -0
  78. package/dist/discovery/types.d.ts.map +1 -0
  79. package/dist/discovery/types.js +28 -0
  80. package/dist/discovery/walk.d.ts +12 -0
  81. package/dist/discovery/walk.d.ts.map +1 -0
  82. package/dist/discovery/walk.js +302 -0
  83. package/dist/errors.d.ts +13 -0
  84. package/dist/errors.d.ts.map +1 -0
  85. package/dist/errors.js +22 -0
  86. package/dist/evaluations/dimensions.d.ts +14 -0
  87. package/dist/evaluations/dimensions.d.ts.map +1 -0
  88. package/dist/evaluations/dimensions.js +191 -0
  89. package/dist/evaluations/fixtures.d.ts +18 -0
  90. package/dist/evaluations/fixtures.d.ts.map +1 -0
  91. package/dist/evaluations/fixtures.js +858 -0
  92. package/dist/evaluations/index.d.ts +7 -0
  93. package/dist/evaluations/index.d.ts.map +1 -0
  94. package/dist/evaluations/index.js +10 -0
  95. package/dist/evaluations/report.d.ts +3 -0
  96. package/dist/evaluations/report.d.ts.map +1 -0
  97. package/dist/evaluations/report.js +31 -0
  98. package/dist/evaluations/runner-seed.d.ts +12 -0
  99. package/dist/evaluations/runner-seed.d.ts.map +1 -0
  100. package/dist/evaluations/runner-seed.js +175 -0
  101. package/dist/evaluations/runner.d.ts +8 -0
  102. package/dist/evaluations/runner.d.ts.map +1 -0
  103. package/dist/evaluations/runner.js +205 -0
  104. package/dist/evaluations/scripted-embedding-adapter.d.ts +13 -0
  105. package/dist/evaluations/scripted-embedding-adapter.d.ts.map +1 -0
  106. package/dist/evaluations/scripted-embedding-adapter.js +163 -0
  107. package/dist/evaluations/types.d.ts +116 -0
  108. package/dist/evaluations/types.d.ts.map +1 -0
  109. package/dist/evaluations/types.js +27 -0
  110. package/dist/index.d.ts +23 -0
  111. package/dist/index.d.ts.map +1 -0
  112. package/dist/index.js +41 -0
  113. package/dist/indexing/bounded-indexing.d.ts +41 -0
  114. package/dist/indexing/bounded-indexing.d.ts.map +1 -0
  115. package/dist/indexing/bounded-indexing.js +240 -0
  116. package/dist/indexing/checkpoint-persist.d.ts +8 -0
  117. package/dist/indexing/checkpoint-persist.d.ts.map +1 -0
  118. package/dist/indexing/checkpoint-persist.js +135 -0
  119. package/dist/indexing/checkpoint-resume.d.ts +20 -0
  120. package/dist/indexing/checkpoint-resume.d.ts.map +1 -0
  121. package/dist/indexing/checkpoint-resume.js +50 -0
  122. package/dist/indexing/embedding-batcher.d.ts +3 -0
  123. package/dist/indexing/embedding-batcher.d.ts.map +1 -0
  124. package/dist/indexing/embedding-batcher.js +390 -0
  125. package/dist/indexing/index.d.ts +7 -0
  126. package/dist/indexing/index.d.ts.map +1 -0
  127. package/dist/indexing/index.js +11 -0
  128. package/dist/indexing/job-persist.d.ts +46 -0
  129. package/dist/indexing/job-persist.d.ts.map +1 -0
  130. package/dist/indexing/job-persist.js +157 -0
  131. package/dist/indexing/job-resume.d.ts +4 -0
  132. package/dist/indexing/job-resume.d.ts.map +1 -0
  133. package/dist/indexing/job-resume.js +14 -0
  134. package/dist/indexing/orchestrator.d.ts +3 -0
  135. package/dist/indexing/orchestrator.d.ts.map +1 -0
  136. package/dist/indexing/orchestrator.js +1151 -0
  137. package/dist/indexing/types.d.ts +156 -0
  138. package/dist/indexing/types.d.ts.map +1 -0
  139. package/dist/indexing/types.js +30 -0
  140. package/dist/indexing/vector-persist.d.ts +32 -0
  141. package/dist/indexing/vector-persist.d.ts.map +1 -0
  142. package/dist/indexing/vector-persist.js +105 -0
  143. package/dist/parsers/_internal.d.ts +20 -0
  144. package/dist/parsers/_internal.d.ts.map +1 -0
  145. package/dist/parsers/_internal.js +122 -0
  146. package/dist/parsers/csv-parser.d.ts +3 -0
  147. package/dist/parsers/csv-parser.d.ts.map +1 -0
  148. package/dist/parsers/csv-parser.js +202 -0
  149. package/dist/parsers/docx-parser.d.ts +3 -0
  150. package/dist/parsers/docx-parser.d.ts.map +1 -0
  151. package/dist/parsers/docx-parser.js +390 -0
  152. package/dist/parsers/html-parser.d.ts +3 -0
  153. package/dist/parsers/html-parser.d.ts.map +1 -0
  154. package/dist/parsers/html-parser.js +310 -0
  155. package/dist/parsers/index.d.ts +15 -0
  156. package/dist/parsers/index.d.ts.map +1 -0
  157. package/dist/parsers/index.js +41 -0
  158. package/dist/parsers/json-parser.d.ts +3 -0
  159. package/dist/parsers/json-parser.d.ts.map +1 -0
  160. package/dist/parsers/json-parser.js +192 -0
  161. package/dist/parsers/large-document/capability-discovery.d.ts +27 -0
  162. package/dist/parsers/large-document/capability-discovery.d.ts.map +1 -0
  163. package/dist/parsers/large-document/capability-discovery.js +76 -0
  164. package/dist/parsers/large-document/diagnostics.d.ts +3 -0
  165. package/dist/parsers/large-document/diagnostics.d.ts.map +1 -0
  166. package/dist/parsers/large-document/diagnostics.js +11 -0
  167. package/dist/parsers/large-document/index.d.ts +15 -0
  168. package/dist/parsers/large-document/index.d.ts.map +1 -0
  169. package/dist/parsers/large-document/index.js +10 -0
  170. package/dist/parsers/large-document/legacy-format.d.ts +5 -0
  171. package/dist/parsers/large-document/legacy-format.d.ts.map +1 -0
  172. package/dist/parsers/large-document/legacy-format.js +25 -0
  173. package/dist/parsers/large-document/preflight.d.ts +9 -0
  174. package/dist/parsers/large-document/preflight.d.ts.map +1 -0
  175. package/dist/parsers/large-document/preflight.js +43 -0
  176. package/dist/parsers/large-document/progressive-extraction.d.ts +55 -0
  177. package/dist/parsers/large-document/progressive-extraction.d.ts.map +1 -0
  178. package/dist/parsers/large-document/progressive-extraction.js +123 -0
  179. package/dist/parsers/large-document/progressive-pdf.d.ts +20 -0
  180. package/dist/parsers/large-document/progressive-pdf.d.ts.map +1 -0
  181. package/dist/parsers/large-document/progressive-pdf.js +145 -0
  182. package/dist/parsers/large-document/synthetic-source.d.ts +9 -0
  183. package/dist/parsers/large-document/synthetic-source.d.ts.map +1 -0
  184. package/dist/parsers/large-document/synthetic-source.js +101 -0
  185. package/dist/parsers/large-document/window-builder.d.ts +24 -0
  186. package/dist/parsers/large-document/window-builder.d.ts.map +1 -0
  187. package/dist/parsers/large-document/window-builder.js +75 -0
  188. package/dist/parsers/ocr/index.d.ts +4 -0
  189. package/dist/parsers/ocr/index.d.ts.map +1 -0
  190. package/dist/parsers/ocr/index.js +4 -0
  191. package/dist/parsers/ocr/null-ocr-adapter.d.ts +3 -0
  192. package/dist/parsers/ocr/null-ocr-adapter.d.ts.map +1 -0
  193. package/dist/parsers/ocr/null-ocr-adapter.js +14 -0
  194. package/dist/parsers/ocr/ocr-pipeline-parser.d.ts +8 -0
  195. package/dist/parsers/ocr/ocr-pipeline-parser.d.ts.map +1 -0
  196. package/dist/parsers/ocr/ocr-pipeline-parser.js +147 -0
  197. package/dist/parsers/ocr/types.d.ts +16 -0
  198. package/dist/parsers/ocr/types.d.ts.map +1 -0
  199. package/dist/parsers/ocr/types.js +4 -0
  200. package/dist/parsers/parser-test-fixtures.d.ts +28 -0
  201. package/dist/parsers/parser-test-fixtures.d.ts.map +1 -0
  202. package/dist/parsers/parser-test-fixtures.js +139 -0
  203. package/dist/parsers/pdf-parser.d.ts +43 -0
  204. package/dist/parsers/pdf-parser.d.ts.map +1 -0
  205. package/dist/parsers/pdf-parser.js +388 -0
  206. package/dist/parsers/registry.d.ts +8 -0
  207. package/dist/parsers/registry.d.ts.map +1 -0
  208. package/dist/parsers/registry.js +57 -0
  209. package/dist/parsers/text-parser.d.ts +3 -0
  210. package/dist/parsers/text-parser.d.ts.map +1 -0
  211. package/dist/parsers/text-parser.js +214 -0
  212. package/dist/parsers/types.d.ts +53 -0
  213. package/dist/parsers/types.d.ts.map +1 -0
  214. package/dist/parsers/types.js +21 -0
  215. package/dist/parsers/unsupported-parser.d.ts +4 -0
  216. package/dist/parsers/unsupported-parser.d.ts.map +1 -0
  217. package/dist/parsers/unsupported-parser.js +97 -0
  218. package/dist/parsers/xlsx-parser.d.ts +3 -0
  219. package/dist/parsers/xlsx-parser.d.ts.map +1 -0
  220. package/dist/parsers/xlsx-parser.js +425 -0
  221. package/dist/privacy/audit-emitter.d.ts +5 -0
  222. package/dist/privacy/audit-emitter.d.ts.map +1 -0
  223. package/dist/privacy/audit-emitter.js +93 -0
  224. package/dist/privacy/diagnostic-redactor.d.ts +2 -0
  225. package/dist/privacy/diagnostic-redactor.d.ts.map +1 -0
  226. package/dist/privacy/diagnostic-redactor.js +153 -0
  227. package/dist/privacy/index.d.ts +5 -0
  228. package/dist/privacy/index.d.ts.map +1 -0
  229. package/dist/privacy/index.js +6 -0
  230. package/dist/privacy/retention-applier.d.ts +5 -0
  231. package/dist/privacy/retention-applier.d.ts.map +1 -0
  232. package/dist/privacy/retention-applier.js +88 -0
  233. package/dist/privacy/types.d.ts +98 -0
  234. package/dist/privacy/types.d.ts.map +1 -0
  235. package/dist/privacy/types.js +12 -0
  236. package/dist/qualityIntelligence/capsuleCorpus.d.ts +27 -0
  237. package/dist/qualityIntelligence/capsuleCorpus.d.ts.map +1 -0
  238. package/dist/qualityIntelligence/capsuleCorpus.js +58 -0
  239. package/dist/qualityIntelligence/index.d.ts +3 -0
  240. package/dist/qualityIntelligence/index.d.ts.map +1 -0
  241. package/dist/qualityIntelligence/index.js +5 -0
  242. package/dist/qualityIntelligence/qiHandoff.d.ts +36 -0
  243. package/dist/qualityIntelligence/qiHandoff.d.ts.map +1 -0
  244. package/dist/qualityIntelligence/qiHandoff.js +82 -0
  245. package/dist/retrieval/answer-grounding.d.ts +9 -0
  246. package/dist/retrieval/answer-grounding.d.ts.map +1 -0
  247. package/dist/retrieval/answer-grounding.js +31 -0
  248. package/dist/retrieval/context-pack-assembler.d.ts +24 -0
  249. package/dist/retrieval/context-pack-assembler.d.ts.map +1 -0
  250. package/dist/retrieval/context-pack-assembler.js +50 -0
  251. package/dist/retrieval/index.d.ts +6 -0
  252. package/dist/retrieval/index.d.ts.map +1 -0
  253. package/dist/retrieval/index.js +9 -0
  254. package/dist/retrieval/retrieval-runner.d.ts +10 -0
  255. package/dist/retrieval/retrieval-runner.d.ts.map +1 -0
  256. package/dist/retrieval/retrieval-runner.js +163 -0
  257. package/dist/retrieval/scoped-vector-search.d.ts +24 -0
  258. package/dist/retrieval/scoped-vector-search.d.ts.map +1 -0
  259. package/dist/retrieval/scoped-vector-search.js +864 -0
  260. package/dist/retrieval/types.d.ts +28 -0
  261. package/dist/retrieval/types.d.ts.map +1 -0
  262. package/dist/retrieval/types.js +33 -0
  263. package/dist/section-path-hash.d.ts +3 -0
  264. package/dist/section-path-hash.d.ts.map +1 -0
  265. package/dist/section-path-hash.js +9 -0
  266. package/dist/source-lifecycle.d.ts +14 -0
  267. package/dist/source-lifecycle.d.ts.map +1 -0
  268. package/dist/source-lifecycle.js +155 -0
  269. package/dist/source-routing-validation.d.ts +11 -0
  270. package/dist/source-routing-validation.d.ts.map +1 -0
  271. package/dist/source-routing-validation.js +140 -0
  272. package/dist/store-content-cipher.d.ts +11 -0
  273. package/dist/store-content-cipher.d.ts.map +1 -0
  274. package/dist/store-content-cipher.js +67 -0
  275. package/dist/store-content-encryption.d.ts +12 -0
  276. package/dist/store-content-encryption.d.ts.map +1 -0
  277. package/dist/store-content-encryption.js +275 -0
  278. package/dist/store-paths.d.ts +6 -0
  279. package/dist/store-paths.d.ts.map +1 -0
  280. package/dist/store-paths.js +61 -0
  281. package/dist/store.d.ts +30 -0
  282. package/dist/store.d.ts.map +1 -0
  283. package/dist/store.js +219 -0
  284. package/dist/testing.d.ts +47 -0
  285. package/dist/testing.d.ts.map +1 -0
  286. package/dist/testing.js +170 -0
  287. package/dist/version.d.ts +2 -0
  288. package/dist/version.d.ts.map +1 -0
  289. package/dist/version.js +4 -0
  290. package/package.json +43 -0
@@ -0,0 +1,906 @@
1
+ // Per-file extraction (Epic #189, Issue #194). Given a discovered file, a parser registry,
2
+ // and an open KnowledgeStore, this module:
3
+ //
4
+ // 1. Resolves the file's realPath through the WorkspaceFs port and re-asserts the
5
+ // realpath-containment gate (defence in depth — the walker already filtered, but a
6
+ // consumer calling extractDocument() directly must not bypass the boundary).
7
+ // 2. Reads bytes via WorkspaceFs.readFileBytes (the boundary-checked byte-read path).
8
+ // 3. Computes the content hash (SHA-256 hex) over the raw bytes.
9
+ // 4. Detects the incremental fast-path: if a documents row with the same id already has
10
+ // this content_hash AND status="extracted"/"unsupported", we skip the parse entirely
11
+ // and leave last_extracted_at untouched.
12
+ // 5. Resolves a parser through the registry; rejects an oversized file BEFORE we hand it
13
+ // to the parser (the OVERSIZED_FILE diagnostic is the same code parsers emit).
14
+ // 6. Inside a single transaction: REPLACEs the documents row, deletes prior dependent
15
+ // rows, then inserts the new pages/sections/parsed_units/diagnostics.
16
+ import { createHash } from "node:crypto";
17
+ import { DEFAULT_EXTRACTION_CAPABILITY_AVAILABILITY, DEFAULT_LARGE_DOCUMENT_RESOURCE_POLICY, isSafeScopePath, } from "@oscharko-dev/keiko-contracts";
18
+ import { isDenied } from "@oscharko-dev/keiko-workspace";
19
+ import { buildParserOptions, createProgressivePdfExtractor, classifyLargeDocument, isLegacyBinaryOfficeFormat, legacyFormatDiagnostic, unsupportedParser, usesProgressivePath, } from "../parsers/index.js";
20
+ import { DEFAULT_CHUNKING_STRATEGY_KEY } from "../chunking/types.js";
21
+ import { extractDocumentProgressive, selectProgressiveExtractor, } from "./extract-progressive.js";
22
+ import { redactDiagnosticMessage } from "../privacy/diagnostic-redactor.js";
23
+ import { basenameOf, extensionOf, mediaTypeFor } from "./media-type.js";
24
+ import { compileGlobList, matchesAny } from "./glob.js";
25
+ import { deleteDependentRows, insertDiagnosticRow, insertDocumentRow, insertDocumentTextRow, insertPageRow, insertParsedUnitRow, insertSectionRow, readExistingDocumentRow, } from "./persist.js";
26
+ import { documentIdFor, } from "./types.js";
27
+ // ─── Path helpers (re-derived to keep extract.ts self-contained for the realpath gate) ──
28
+ // On Windows, WorkspaceFs.realPath() may return backslash-separated paths
29
+ // (e.g. C:\Users\workspace\file). Normalise both sides to forward slashes so
30
+ // containment checks work cross-platform.
31
// Rewrites every backslash in `p` as a forward slash so that paths coming back
// from a Windows realPath() compare equal to POSIX-style paths.
function normaliseSep(p) {
    return p.split("\\").join("/");
}
34
// String-prefix containment check: true when `absolutePath` is the root itself or
// lives underneath it. Both sides are separator-normalised first, and the root is
// compared with a trailing "/" so "/ws" does not match "/wsx/...".
function isContained(absoluteRoot, absolutePath) {
    const root = normaliseSep(absoluteRoot);
    const candidate = normaliseSep(absolutePath);
    const rootWithSlash = root.endsWith("/") ? root : `${root}/`;
    return candidate === root || candidate.startsWith(rootWithSlash);
}
42
// Joins `rel` onto `root` with exactly one "/" between them: no separator is
// added when `root` already ends in "/".
function joinAbs(root, rel) {
    const separator = root.endsWith("/") ? "" : "/";
    return `${root}${separator}${rel}`;
}
47
// Returns `absolutePath` relative to `absoluteRoot`, using forward slashes.
// The root itself maps to "". A path that is not under the root is returned
// in its separator-normalised absolute form rather than being rejected.
function toPosixRelative(absoluteRoot, absolutePath) {
    const root = normaliseSep(absoluteRoot);
    const candidate = normaliseSep(absolutePath);
    if (candidate === root) {
        return "";
    }
    const rootWithSlash = root.endsWith("/") ? root : `${root}/`;
    if (!candidate.startsWith(rootWithSlash)) {
        return candidate;
    }
    return candidate.slice(rootWithSlash.length);
}
55
// Picks the root directory of a source's scope: repository scopes carry it in
// `repositoryRoot`, every other scope kind carries it in `rootPath`.
function scopeRoot(source) {
    const scope = source.scope;
    return scope.kind === "repository" ? scope.repositoryRoot : scope.rootPath;
}
63
// SHA-256 digest of the raw bytes, returned as a lowercase hex string.
function hashBytes(bytes) {
    const hasher = createHash("sha256");
    hasher.update(bytes);
    return hasher.digest("hex");
}
66
// Display name for a document: the basename of the relative path, falling back
// to the full relative path when basenameOf() yields an empty string.
function safeDisplay(relativePath) {
    const base = basenameOf(relativePath);
    if (base.length !== 0) {
        return base;
    }
    return relativePath;
}
70
// Validates a scope-relative path. Returns the separator-normalised path when it
// is acceptable, otherwise an INVALID_SCOPE error object that echoes the ORIGINAL
// (un-normalised) input as `relativePath`. Callers tell the two apart with
// `typeof result === "string"`.
function safeRelativePath(relativePath) {
    const normalised = normaliseSep(relativePath);
    // A leading "/" is rejected before isSafeScopePath() is consulted.
    const rejected = normalised.startsWith("/") || !isSafeScopePath(normalised);
    if (rejected) {
        return {
            code: "INVALID_SCOPE",
            message: "file path failed the selected-scope policy",
            relativePath,
        };
    }
    return normalised;
}
88
// The path prefix handed to the diagnostic redactor: the source's scope root.
function redactionPrefixFor(source) {
    const rootPrefix = scopeRoot(source);
    return rootPrefix;
}
91
// Runs `message` through redactDiagnosticMessage(), using the source's scope
// root as the prefix argument.
function redactMessage(message, source) {
    const prefix = redactionPrefixFor(source);
    return redactDiagnosticMessage(message, prefix);
}
94
// Returns a shallow copy of `diagnostic` whose `message` has been redacted; the
// input object is not modified.
function redactDiagnostic(diagnostic, source) {
    const message = redactMessage(diagnostic.message, source);
    return { ...diagnostic, message };
}
100
// Redacts each diagnostic in order and returns the results as a new array.
function redactDiagnostics(diagnostics, source) {
    const redacted = [];
    for (const diagnostic of diagnostics) {
        redacted.push(redactDiagnostic(diagnostic, source));
    }
    return redacted;
}
103
// Returns a shallow copy of `parserResult` with its `diagnostics` list replaced
// by the redacted version; all other fields are carried over as-is.
function redactParserResult(parserResult, source) {
    const diagnostics = redactDiagnostics(parserResult.diagnostics, source);
    return { ...parserResult, diagnostics };
}
109
+ // ─── Failure / unsupported helpers ───────────────────────────────────────────
110
// Writes a failed document inside one BEGIN/COMMIT transaction: the document row,
// removal of any prior dependent rows, then a single diagnostic row with id
// `<documentId>#d0` stamped with `now()`.
//
// Any error raised while writing is rethrown unchanged after a best-effort
// ROLLBACK. The rollback is guarded because SQLite may already have rolled the
// transaction back automatically, in which case an explicit ROLLBACK throws and
// would otherwise replace the error that actually caused the failure.
function persistFailureRow(deps, params, documentId, document, diagnostic, now) {
    const db = deps.store._internal.db;
    db.exec("BEGIN");
    try {
        insertDocumentRow(db, {
            id: documentId,
            capsuleId: params.capsuleId,
            sourceId: String(params.source.id),
            documentPath: document.documentPath,
            sizeBytes: document.sizeBytes,
            mediaType: document.mediaType,
            contentHash: document.contentHash,
            parserId: document.parser.parserId,
            parserVersion: document.parser.parserVersion,
            lastExtractedAt: document.lastExtractedAt,
            status: document.status,
            safeDisplayName: document.safeDisplayName,
        });
        deleteDependentRows(db, params.capsuleId, documentId);
        insertDiagnosticRow(db, {
            id: `${String(documentId)}#d0`,
            capsuleId: params.capsuleId,
            diagnostic,
            createdAt: now(),
        });
        db.exec("COMMIT");
    }
    catch (cause) {
        try {
            db.exec("ROLLBACK");
        }
        catch {
            // Deliberately dropped: the original `cause` is the actionable error, and a
            // failing ROLLBACK usually means no transaction is active any more.
        }
        throw cause;
    }
}
142
// GRD-010: decide whether to persist a cascade-deleting failure row.
//
// A TRANSIENT IO failure (READ_FAILED / STAT_FAILED) on an incremental refresh must NOT
// destroy a previously-good index: persisting overwrites the document row and
// CASCADE-deletes its chunks+vectors. So when a prior NON-failed row exists, persistence is
// skipped and a momentary lock / NFS hiccup / permission flap keeps the retrievable content
// (the orchestrator then reports a non-destructive skip). Permanent failures
// (MALFORMED_INPUT, PARSER_FAILED, OVERSIZED_FILE, …) and first-time failures still persist.
//
// Returns false outright when the caller opted out via `optionPersist`.
function shouldPersistFailureRow(deps, params, documentId, error, optionPersist) {
    if (!optionPersist) {
        return false;
    }
    const transientCodes = ["READ_FAILED", "STAT_FAILED"];
    if (!transientCodes.includes(error.code)) {
        return true;
    }
    // Only transient failures need the lookup of the previously stored row.
    const prior = readExistingDocumentRow(deps.store._internal.db, params.capsuleId, documentId);
    if (prior === undefined) {
        return true;
    }
    return prior.status === "failed";
}
157
// Builds the "failed" extraction result for one file and, when
// shouldPersistFailureRow() allows it, stores the failure row.
//
// The error message is redacted once and the redacted text is reused in the
// diagnostic and in the returned error. The synthetic document record has an
// empty content hash, a placeholder parser ("none" / "0") and status "failed".
// `options.persist` defaults to true only when `options` is omitted entirely.
function buildFailureResult(deps, params, documentId, error, options = { persist: true }) {
    const now = deps.store._internal.now;
    const { capsuleId, source, file } = params;
    const message = redactMessage(error.message, source);
    const diagnostic = { severity: "error", code: error.code, message, documentId };
    const document = {
        id: documentId,
        capsuleId,
        sourceId: source.id,
        documentPath: file.relativePath,
        sizeBytes: file.sizeBytes,
        mediaType: mediaTypeFor(extensionOf(file.relativePath)),
        contentHash: "",
        parser: { parserId: "none", parserVersion: "0" },
        lastExtractedAt: now(),
        status: "failed",
        safeDisplayName: safeDisplay(file.relativePath),
    };
    const mustPersist = shouldPersistFailureRow(deps, params, documentId, error, options.persist);
    if (mustPersist) {
        persistFailureRow(deps, params, documentId, document, diagnostic, now);
    }
    return {
        capsuleId,
        sourceId: source.id,
        relativePath: file.relativePath,
        outcome: {
            kind: "failed",
            document,
            // Caller-supplied error fields survive; message and relativePath are overridden.
            error: { ...error, message, relativePath: file.relativePath },
        },
        diagnostics: [diagnostic],
    };
}
195
+ // ─── Persist helpers (run inside the per-file transaction) ───────────────────
196
// Replaces everything that hangs off a document row. Existing dependent rows are
// deleted first, then the parser output is inserted in a fixed order: normalized
// text (only when present), pages, sections, parsed units, diagnostics.
// Unit ids are `<documentId>#u<index>` and diagnostic ids `<documentId>#d<index>`.
// This function opens no transaction of its own.
function persistDependentRows(deps, capsuleId, documentId, parserResult, now) {
    const internal = deps.store._internal;
    const db = internal.db;
    deleteDependentRows(db, capsuleId, documentId);
    const { normalizedText } = parserResult;
    if (normalizedText !== undefined) {
        insertDocumentTextRow(db, internal.contentCipher, capsuleId, documentId, normalizedText);
    }
    for (const page of parserResult.pages) {
        insertPageRow(db, capsuleId, page);
    }
    for (const section of parserResult.sections) {
        insertSectionRow(db, internal.contentCipher, capsuleId, section);
    }
    for (const [index, unit] of parserResult.units.entries()) {
        const unitId = `${String(documentId)}#u${String(index)}`;
        insertParsedUnitRow(db, internal.contentCipher, capsuleId, unitId, unit);
    }
    for (const [index, diagnostic] of parserResult.diagnostics.entries()) {
        insertDiagnosticRow(db, {
            id: `${String(documentId)}#d${String(index)}`,
            capsuleId,
            diagnostic,
            createdAt: now(),
        });
    }
}
219
// Assembles the document record for a parsed file. Parser identity and
// extraction time come from the parser result; path and size come from the
// discovered file; the display name is derived from the relative path.
function buildDocumentRecord(input) {
    const { documentId, params, mediaType, contentHash, parserResult, status } = input;
    const { relativePath, sizeBytes } = params.file;
    return {
        id: documentId,
        capsuleId: params.capsuleId,
        sourceId: params.source.id,
        documentPath: relativePath,
        sizeBytes,
        mediaType,
        contentHash,
        parser: parserResult.parser,
        lastExtractedAt: parserResult.extractedAt,
        status,
        safeDisplayName: safeDisplay(relativePath),
    };
}
234
// Writes a parsed document inside one BEGIN/COMMIT transaction: the document row
// first, then all dependent rows via persistDependentRows().
//
// Any error raised while writing is rethrown unchanged after a best-effort
// ROLLBACK. The rollback is guarded because SQLite may already have rolled the
// transaction back automatically, in which case an explicit ROLLBACK throws and
// would otherwise replace the error that actually caused the failure.
function persistDocumentAndDependents(deps, params, documentId, document, parserResult, now) {
    const db = deps.store._internal.db;
    db.exec("BEGIN");
    try {
        insertDocumentRow(db, {
            id: documentId,
            capsuleId: params.capsuleId,
            sourceId: String(params.source.id),
            documentPath: document.documentPath,
            sizeBytes: document.sizeBytes,
            mediaType: document.mediaType,
            contentHash: document.contentHash,
            parserId: document.parser.parserId,
            parserVersion: document.parser.parserVersion,
            lastExtractedAt: document.lastExtractedAt,
            status: document.status,
            safeDisplayName: document.safeDisplayName,
        });
        persistDependentRows(deps, params.capsuleId, documentId, parserResult, now);
        db.exec("COMMIT");
    }
    catch (cause) {
        try {
            db.exec("ROLLBACK");
        }
        catch {
            // Deliberately dropped: the original `cause` is the actionable error, and a
            // failing ROLLBACK usually means no transaction is active any more.
        }
        throw cause;
    }
}
260
// Directory names that disqualify a file when they appear as one of its parent
// segments (see hasHiddenOrGeneratedParent). The dot-prefixed names are listed
// explicitly even though that check also rejects any segment starting with ".".
const HIDDEN_OR_GENERATED_DIRS = new Set([
    // Dot-prefixed directories.
    ".git", ".hg", ".svn", ".next", ".turbo",
    // Non-hidden directory names.
    "node_modules", "dist", "build", "coverage", "out",
]);
272
// Turns a source scope into the policy used to select files, or returns an
// INVALID_SCOPE error object ({ code, message }) when the scope fails a safe-path
// gate. Callers distinguish the two with `"code" in result`.
//
//   - "folder":     root is scope.rootPath, recursion follows scope.recursive, globs compiled.
//   - "repository": root is scope.repositoryRoot, always recursive, globs compiled.
//   - anything else: treated as an explicit file list (scope.files) under scope.rootPath;
//                    non-recursive, no globs, with an `explicitFiles` set of normalised paths.
function deriveScopePolicy(scope) {
    const isRepository = scope.kind === "repository";
    const rootPath = isRepository ? scope.repositoryRoot : scope.rootPath;
    if (!isSafeScopePath(rootPath)) {
        const message = isRepository
            ? "scope.repositoryRoot failed the safe-path gate"
            : "scope.rootPath failed the safe-path gate";
        return { code: "INVALID_SCOPE", message };
    }
    if (isRepository || scope.kind === "folder") {
        return {
            rootPath,
            recursive: isRepository ? true : scope.recursive,
            includeGlobs: compileGlobList(scope.includeGlobs),
            excludeGlobs: compileGlobList(scope.excludeGlobs),
        };
    }
    // Explicit file list: the first entry that fails validation rejects the whole scope.
    const explicitFiles = new Set();
    for (const entry of scope.files) {
        const safeEntry = safeRelativePath(entry);
        if (typeof safeEntry !== "string") {
            return {
                code: "INVALID_SCOPE",
                message: `scope.files entry failed the safe-path gate: ${entry}`,
            };
        }
        explicitFiles.add(safeEntry);
    }
    return {
        rootPath,
        recursive: false,
        includeGlobs: [],
        excludeGlobs: [],
        explicitFiles,
    };
}
317
// Applies the policy's globs: an exclude match always loses; otherwise the result
// is whatever the include list says. The third matchesAny() argument is `false`
// for the exclude list and `true` for the include list.
function matchesSourceGlobs(policy, relativePath) {
    const excluded = matchesAny(policy.excludeGlobs, relativePath, false);
    return !excluded && matchesAny(policy.includeGlobs, relativePath, true);
}
322
// True when any PARENT segment of the path (every non-empty segment except the
// last) starts with "." or is listed in HIDDEN_OR_GENERATED_DIRS. The final
// segment, i.e. the file name itself, is never tested.
function hasHiddenOrGeneratedParent(relativePath) {
    const parents = relativePath
        .split("/")
        .filter((segment) => segment.length > 0)
        .slice(0, -1);
    return parents.some((segment) => segment.startsWith(".") || HIDDEN_OR_GENERATED_DIRS.has(segment));
}
330
// Decides whether a (normalised) relative path belongs to the selected scope.
// Checks run in this order: workspace deny-list, explicit file list (when the
// policy has one, membership is the final answer), the non-recursive depth
// limit, hidden/generated parent directories, and finally the include/exclude globs.
function isSelectedByScope(policy, relativePath) {
    if (isDenied(relativePath)) {
        return false;
    }
    const { explicitFiles } = policy;
    if (explicitFiles !== undefined) {
        return explicitFiles.has(relativePath);
    }
    const nestedInFlatScope = !policy.recursive && relativePath.includes("/");
    if (nestedInFlatScope || hasHiddenOrGeneratedParent(relativePath)) {
        return false;
    }
    return matchesSourceGlobs(policy, relativePath);
}
341
// Wraps an error together with a flag saying whether the failure should be persisted.
function targetError(error, persistFailure) {
    const failure = { error: error, persistFailure: persistFailure };
    return failure;
}
344
// Validates the raw relative path and confirms the scope policy selects it.
// Returns the normalised path on success; otherwise a targetError(...) wrapper
// with persistFailure=false (the path was never a legitimate target).
function selectedRelativePath(policy, rawRelativePath) {
    const relativePath = safeRelativePath(rawRelativePath);
    if (typeof relativePath !== "string") {
        return targetError(relativePath, false);
    }
    if (isSelectedByScope(policy, relativePath)) {
        return relativePath;
    }
    const outsideScope = {
        code: "INVALID_SCOPE",
        message: "file is outside the selected source scope",
        relativePath,
    };
    return targetError(outsideScope, false);
}
358
// Resolves `path` through deps.fs.realPath(). Any exception is converted into a
// READ_FAILED target error (persistFailure=true) carrying the caller-supplied
// `message`; the underlying exception is intentionally not surfaced.
function resolveRealPathTarget(deps, path, relativePath, message) {
    let resolved;
    try {
        resolved = deps.fs.realPath(path);
    }
    catch {
        resolved = targetError({ code: "READ_FAILED", message, relativePath }, true);
    }
    return resolved;
}
366
// Post-realpath gate. Returns undefined when the resolved file is acceptable,
// otherwise a target error (persistFailure=true):
//   - PATH_ESCAPE when the real path is not contained in the real root;
//   - READ_FAILED when the real path, made relative to the root, is on the
//     workspace deny-list.
function containedRealFileTarget(realRoot, real, relativePath) {
    let failure;
    if (!isContained(realRoot, real)) {
        failure = {
            code: "PATH_ESCAPE",
            message: `realpath escapes scope root: ${relativePath}`,
            relativePath,
        };
    }
    else if (isDenied(toPosixRelative(realRoot, real))) {
        failure = {
            code: "READ_FAILED",
            message: "resolved file is denied by workspace policy",
            relativePath,
        };
    }
    return failure === undefined ? undefined : targetError(failure, true);
}
384
// Resolves the file named by `params` to a real, in-scope absolute path.
// Stages, each of which may short-circuit with a target error:
//   1. derive the scope policy (an invalid scope is not persisted);
//   2. validate and select the relative path;
//   3. realPath() the scope root, then the requested file;
//   4. re-check containment and the deny-list on the resolved path.
// On success returns { absolutePath, requestedAbsolutePath, relativePath }.
function resolveTargetPath(deps, params) {
    const policy = deriveScopePolicy(params.source.scope);
    if ("code" in policy) {
        return targetError(policy, false);
    }
    const selected = selectedRelativePath(policy, params.file.relativePath);
    if (typeof selected !== "string") {
        return selected;
    }
    const requestedAbsolutePath = joinAbs(policy.rootPath, selected);
    const realRoot = resolveRealPathTarget(deps, policy.rootPath, selected, "realPath failed for selected source root");
    if (typeof realRoot !== "string") {
        return realRoot;
    }
    const realFile = resolveRealPathTarget(deps, requestedAbsolutePath, selected, "realPath failed for selected file");
    if (typeof realFile !== "string") {
        return realFile;
    }
    const gateFailure = containedRealFileTarget(realRoot, realFile, selected);
    if (gateFailure !== undefined) {
        return gateFailure;
    }
    // Normalise to forward slashes so subsequent IO calls (readFileBytes, stat) receive
    // a consistent path even when realPath returned a Windows backslash path.
    return {
        absolutePath: normaliseSep(realFile),
        requestedAbsolutePath,
        relativePath: selected,
    };
}
408
/**
 * Confirms that the originally requested path can still be stat'ed.
 *
 * @param {object} deps - Dependencies exposing `fs.stat`.
 * @param {object} params - Extraction params; `file.relativePath` is reported on failure.
 * @param {object} target - Resolved target carrying `requestedAbsolutePath`.
 * @returns {object|undefined} A `STAT_FAILED` error when `stat` throws, else `undefined`.
 */
function validateRequestedTarget(deps, params, target) {
  try {
    deps.fs.stat(target.requestedAbsolutePath);
  } catch {
    const statFailure = {
      code: "STAT_FAILED",
      message: "stat failed for selected file",
      relativePath: params.file.relativePath,
    };
    return statFailure;
  }
  return undefined;
}
421
/**
 * Validates the resolved (real) path right before it is read.
 *
 * @param {object} deps - Dependencies exposing `fs.stat`.
 * @param {object} params - Extraction params; `file.relativePath` is reported on failure.
 * @param {object} target - Resolved target carrying `absolutePath`.
 * @returns {object|undefined} `READ_FAILED` when the path is not a file or has
 *   a hard-link count above one, `STAT_FAILED` when `stat` throws, otherwise
 *   `undefined`.
 */
function validateResolvedTarget(deps, params, target) {
  const failure = (code, message) => ({
    code,
    message,
    relativePath: params.file.relativePath,
  });
  try {
    const stats = deps.fs.stat(target.absolutePath);
    if (!stats.isFile) {
      return failure("READ_FAILED", "selected path is not a file");
    }
    // An absent hard-link count is accepted; only a count above one is refused.
    const singleLink = stats.hardLinkCount === undefined || stats.hardLinkCount <= 1;
    return singleLink
      ? undefined
      : failure("READ_FAILED", "selected file is not eligible for extraction");
  } catch {
    return failure("STAT_FAILED", "stat failed for selected file");
  }
}
447
/**
 * Reads up to `maxBytes` from the resolved target through `deps.fs.readFileBytes`.
 *
 * Both target validators run immediately before the read; the first error
 * they report is returned instead of reading.
 *
 * @param {object} deps - Dependencies exposing `fs.readFileBytes` and `fs.stat`.
 * @param {object} params - Extraction params; `file.relativePath` is reported on failure.
 * @param {object} target - Resolved target (`absolutePath`, `requestedAbsolutePath`).
 * @param {number} maxBytes - Forwarded to the reader as its byte limit.
 * @returns {Promise<*>} The reader's result, or a structured error object.
 */
async function readBytes(deps, params, target, maxBytes) {
  const readFileBytes = deps.fs.readFileBytes;
  if (readFileBytes === undefined) {
    return {
      code: "READ_FAILED",
      message: "WorkspaceFs.readFileBytes is unavailable",
      relativePath: params.file.relativePath,
    };
  }
  for (const validate of [validateRequestedTarget, validateResolvedTarget]) {
    const problem = validate(deps, params, target);
    if (problem !== undefined) {
      return problem;
    }
  }
  try {
    return await readFileBytes(target.absolutePath, maxBytes);
  } catch {
    return {
      code: "READ_FAILED",
      message: "readFileBytes failed for selected file",
      relativePath: params.file.relativePath,
    };
  }
}
473
+ // ─── Incremental fast-path ───────────────────────────────────────────────────
474
/**
 * Incremental fast path: reuses the stored document row when the content hash
 * is unchanged and the row is in a terminal-good state.
 *
 * @param {object} deps - Dependencies exposing `store._internal.db`.
 * @param {object} params - Extraction params (`capsuleId`, `source.id`, `file.relativePath`).
 * @param {*} documentId - Identifier of the document row to look up.
 * @param {string} contentHash - Hash of the current file content.
 * @returns {object|undefined} A `skipped`/`unchanged` extraction result, or
 *   `undefined` when the document must be (re-)extracted.
 */
function readUnchangedFastPath(deps, params, documentId, contentHash) {
  const row = readExistingDocumentRow(deps.store._internal.db, params.capsuleId, documentId);
  if (row === undefined || row.content_hash !== contentHash) {
    return undefined;
  }
  // Skip only terminal-good states. A `pending` row is an interrupted progressive extraction
  // (Issue #1286) and `failed` should be retried; both must re-extract rather than be skipped.
  const isTerminalGood = row.status === "extracted" || row.status === "unsupported";
  if (!isTerminalGood) {
    return undefined;
  }
  const parser = {
    parserId: row.parser_id,
    parserVersion: row.parser_version,
  };
  const document = {
    id: documentId,
    capsuleId: params.capsuleId,
    sourceId: params.source.id,
    documentPath: row.document_path,
    sizeBytes: row.size_bytes,
    mediaType: row.media_type,
    contentHash: row.content_hash,
    parser,
    lastExtractedAt: row.last_extracted_at,
    status: row.status,
    safeDisplayName: row.safe_display_name,
  };
  const outcome = { kind: "skipped", document, reason: "unchanged" };
  return {
    capsuleId: params.capsuleId,
    sourceId: params.source.id,
    relativePath: params.file.relativePath,
    outcome,
    diagnostics: [],
  };
}
508
+ // ─── Large-document routing (Epic #1160, Issue #1286) ────────────────────────
509
// Lazily-built default extractor list; filled on first use and then reused.
let defaultProgressiveExtractorsCache;

/**
 * Returns the default progressive extractors, creating the single-element
 * list (one progressive PDF extractor) on the first call.
 *
 * @returns {Array} The cached extractor list.
 */
function defaultProgressiveExtractors() {
  if (defaultProgressiveExtractorsCache == null) {
    defaultProgressiveExtractorsCache = [createProgressivePdfExtractor()];
  }
  return defaultProgressiveExtractorsCache;
}
514
/**
 * Assembles the context object used by the large-document (progressive) path,
 * falling back to module defaults for every setting `deps` leaves unset.
 *
 * @param {object} deps - Optional overrides: `largeDocumentPolicy`,
 *   `extractionCapabilities`, `progressiveExtractors`, `largeDocumentJobId`,
 *   `chunkingStrategyVersion`.
 * @param {object} resolved - Resolved target (`absolutePath`, `relativePath`).
 * @param {object} options - Parser options; `signal` is copied only when defined.
 * @returns {object} The progressive-extraction context.
 */
function largeDocumentContextFor(deps, resolved, options) {
  const context = {
    policy: deps.largeDocumentPolicy ?? DEFAULT_LARGE_DOCUMENT_RESOURCE_POLICY,
    capabilities: deps.extractionCapabilities ?? DEFAULT_EXTRACTION_CAPABILITY_AVAILABILITY,
    extractors: deps.progressiveExtractors ?? defaultProgressiveExtractors(),
    jobId: deps.largeDocumentJobId ?? "extract",
    chunkingStrategyVersion: deps.chunkingStrategyVersion ?? DEFAULT_CHUNKING_STRATEGY_KEY,
    absolutePath: resolved.absolutePath,
    relativePath: resolved.relativePath,
  };
  // The key is omitted entirely, rather than set to undefined, when no signal is given.
  if (options.signal !== undefined) {
    context.signal = options.signal;
  }
  return context;
}
526
// Window size (4 MiB) used when hashing a progressive source in ranges.
const PROGRESSIVE_HASH_CHUNK_BYTES = 4 * 1024 * 1024;

/**
 * Reads one byte range of the resolved target through `deps.fs.readFileRange`.
 *
 * Both target validators run immediately before every range read; the first
 * error they report is returned instead of reading.
 *
 * @param {object} deps - Dependencies exposing `fs.readFileRange` and `fs.stat`.
 * @param {object} params - Extraction params; `file.relativePath` is reported on failure.
 * @param {object} target - Resolved target (`absolutePath`, `requestedAbsolutePath`).
 * @param {number} startByte - Offset forwarded to the reader.
 * @param {number} length - Length forwarded to the reader.
 * @returns {Promise<*>} The reader's result, or a structured error object.
 */
async function readRange(deps, params, target, startByte, length) {
  const readFileRange = deps.fs.readFileRange;
  if (readFileRange === undefined) {
    return {
      code: "READ_FAILED",
      message: "WorkspaceFs.readFileRange is unavailable",
      relativePath: params.file.relativePath,
    };
  }
  for (const validate of [validateRequestedTarget, validateResolvedTarget]) {
    const problem = validate(deps, params, target);
    if (problem !== undefined) {
      return problem;
    }
  }
  try {
    return await readFileRange(target.absolutePath, startByte, length);
  } catch {
    return {
      code: "READ_FAILED",
      message: "readFileRange failed for selected file",
      relativePath: params.file.relativePath,
    };
  }
}
553
/**
 * Builds a range-readable source over the resolved target for progressive
 * extraction.
 *
 * @param {object} deps - Dependencies; `fs.readFileRange` must be defined.
 * @param {object} params - Extraction params (`file.sizeBytes`, `file.relativePath`).
 * @param {object} target - Resolved target passed through to `readRange`.
 * @returns {object} `{ totalBytes, readWindow }`, or a `READ_FAILED` error
 *   object when range reads are unavailable. `readWindow` throws an `Error`
 *   carrying the failure message when a read does not yield a `Uint8Array`.
 */
function progressiveRangeSource(deps, params, target) {
  if (deps.fs.readFileRange === undefined) {
    return {
      code: "READ_FAILED",
      message: "WorkspaceFs.readFileRange is unavailable",
      relativePath: params.file.relativePath,
    };
  }
  const readWindow = async (startByte, length) => {
    const outcome = await readRange(deps, params, target, startByte, length);
    if (!(outcome instanceof Uint8Array)) {
      throw new Error(outcome.message);
    }
    return outcome;
  };
  return { totalBytes: params.file.sizeBytes, readWindow };
}
571
/**
 * Computes the SHA-256 hex digest of a progressive source by reading it in
 * bounded windows of at most `PROGRESSIVE_HASH_CHUNK_BYTES`.
 *
 * The read offset advances by the number of bytes actually returned, so a
 * reader that returns fewer bytes than requested never leaves an unhashed gap
 * in the middle of the file. An empty read ends the loop early.
 *
 * @param {object} source - Source exposing `totalBytes` and `readWindow(offset, length)`.
 * @returns {Promise<string>} Lower-case hex SHA-256 of the bytes read.
 * @throws {Error} When the source has no `readWindow`, or when `readWindow` rejects.
 */
async function hashProgressiveSource(source) {
  if (source.readWindow === undefined) {
    throw new Error("progressive source does not support bounded range reads");
  }
  const hash = createHash("sha256");
  let offset = 0;
  while (offset < source.totalBytes) {
    const wanted = Math.min(PROGRESSIVE_HASH_CHUNK_BYTES, source.totalBytes - offset);
    const bytes = await source.readWindow(offset, wanted);
    if (bytes.byteLength === 0) {
      break;
    }
    hash.update(bytes);
    // Continue right after the bytes that were delivered, not after the bytes requested.
    offset += bytes.byteLength;
  }
  return hash.digest("hex");
}
584
/**
 * Legacy binary office formats (.doc/.ppt/.xls) keep the existing unsupported
 * path and additionally get a stable CONVERTER_UNAVAILABLE diagnostic with
 * actionable guidance, leaving the job stable.
 *
 * When `legacyFormatDiagnostic` yields nothing, `result` is returned as is.
 * Otherwise the diagnostic is stored under the id `<documentId>#legacy` and a
 * copy of `result` with the diagnostic appended is returned.
 *
 * @param {object} deps - Dependencies exposing `store._internal.db` and `.now()`.
 * @param {object} params - Extraction params (`capsuleId`).
 * @param {*} documentId - Document the diagnostic belongs to.
 * @param {string} extension - File extension used to pick the diagnostic.
 * @param {object} result - Extraction result to extend.
 * @returns {object} The original or extended extraction result.
 */
function appendLegacyDiagnostic(deps, params, documentId, extension, result) {
  const diagnostic = legacyFormatDiagnostic(extension, documentId);
  if (diagnostic === undefined) {
    return result;
  }
  const row = {
    id: `${String(documentId)}#legacy`,
    capsuleId: params.capsuleId,
    diagnostic,
    createdAt: deps.store._internal.now(),
  };
  insertDiagnosticRow(deps.store._internal.db, row);
  const diagnostics = [...result.diagnostics, diagnostic];
  return { ...result, diagnostics };
}
598
+ // ─── Top-level entry point ───────────────────────────────────────────────────
599
/**
 * Builds the input object handed to parser resolution and to the parser itself.
 *
 * @param {*} documentId - Identifier of the document being parsed.
 * @param {string} relativePath - Path the extension is derived from.
 * @param {Uint8Array} bytes - Raw document bytes.
 * @returns {{ documentId: *, bytes: Uint8Array, extension: *, mediaType: * }}
 */
function selectionInput(documentId, relativePath, bytes) {
  const extension = extensionOf(relativePath);
  const mediaType = mediaTypeFor(extension);
  return { documentId, bytes, extension, mediaType };
}
608
/**
 * Tells whether a parser result represents unsupported content.
 *
 * @param {object} result - Parser result with `parser.parserId` and `units`.
 * @returns {boolean} `true` when the parser id is "unsupported", or when there
 *   is at least one unit and every unit is of kind "unsupported-media".
 */
function isUnsupportedResult(result) {
  if (result.parser.parserId === "unsupported") {
    return true;
  }
  const { units } = result;
  if (units.length === 0) {
    return false;
  }
  return units.every(({ kind }) => kind === "unsupported-media");
}
612
// Diagnostic codes that mark a parser run as failed regardless of severity.
const FAILED_PARSER_DIAGNOSTIC_CODES = new Set([
  "OVERSIZED_FILE",
  "PARSER_TIMEOUT",
  "PARSER_CANCELLED",
  "MALFORMED_INPUT",
  "OBJECT_LIMIT_REACHED",
]);

/**
 * Finds the first diagnostic that makes the parser result a failure.
 *
 * @param {object} result - Parser result exposing a `diagnostics` array.
 * @returns {object|undefined} The first diagnostic whose severity is "error"
 *   or whose code is in `FAILED_PARSER_DIAGNOSTIC_CODES`, if any.
 */
function firstParserFailureDiagnostic(result) {
  for (const diagnostic of result.diagnostics) {
    const isError = diagnostic.severity === "error";
    if (isError || FAILED_PARSER_DIAGNOSTIC_CODES.has(diagnostic.code)) {
      return diagnostic;
    }
  }
  return undefined;
}
622
/**
 * Derives the persisted document status from a parser result.
 *
 * The unsupported check takes precedence over the failure check.
 *
 * @param {object} result - Parser result.
 * @returns {"unsupported"|"failed"|"extracted"} The document status.
 */
function statusForResult(result) {
  if (isUnsupportedResult(result)) {
    return "unsupported";
  }
  const failure = firstParserFailureDiagnostic(result);
  return failure === undefined ? "extracted" : "failed";
}
629
/**
 * Maps a parser diagnostic code onto the error code reported in a failed
 * extraction outcome.
 *
 * @param {object} diagnostic - Parser diagnostic carrying a `code`.
 * @returns {string} "CANCELLED" for PARSER_CANCELLED; the same code for
 *   OVERSIZED_FILE, MALFORMED_INPUT and PARSER_TIMEOUT; otherwise "PARSER_FAILED".
 */
function discoveryErrorCodeForParserDiagnostic(diagnostic) {
  switch (diagnostic.code) {
    case "OVERSIZED_FILE":
      return "OVERSIZED_FILE";
    case "PARSER_CANCELLED":
      return "CANCELLED";
    case "MALFORMED_INPUT":
      return "MALFORMED_INPUT";
    case "PARSER_TIMEOUT":
      return "PARSER_TIMEOUT";
    default:
      return "PARSER_FAILED";
  }
}
640
/**
 * Builds the `failed` outcome reported for a document whose parser result
 * contained a failure diagnostic.
 *
 * @param {object} document - Document record attached to the outcome.
 * @param {object} diagnostic - Failure diagnostic (`code`, `message`).
 * @param {string} relativePath - Path reported on the error.
 * @returns {{ kind: "failed", document: object, error: object }}
 */
function parserFailureOutcome(document, diagnostic, relativePath) {
  const code = discoveryErrorCodeForParserDiagnostic(diagnostic);
  const error = { code, message: diagnostic.message, relativePath };
  return { kind: "failed", document, error };
}
651
// Parser ids for which the decoded source bytes serve as the normalized text.
const SOURCE_TEXT_PARSER_IDS = new Set(["text", "json", "csv", "html"]);

/**
 * Decodes bytes as UTF-8 without throwing on malformed input and drops a
 * leading U+FEFF from the decoded string.
 *
 * @param {Uint8Array} bytes - Raw document bytes.
 * @returns {string} The decoded text.
 */
function decodeUtf8ForStorage(bytes) {
  const decoder = new TextDecoder("utf-8", { fatal: false });
  const text = decoder.decode(bytes);
  if (text.startsWith("\ufeff")) {
    return text.slice(1);
  }
  return text;
}
656
/**
 * Chooses the normalized text to store alongside a parser result.
 *
 * @param {object} parserResult - Parser result (`normalizedText`, `parser.parserId`, `units`).
 * @param {Uint8Array} bytes - Raw document bytes, decoded only as a fallback.
 * @returns {string|undefined} The parser-provided text when present; otherwise
 *   the UTF-8 decoded bytes for source-text parsers that produced at least one
 *   unit; otherwise `undefined`.
 */
function normalizedTextForPersistence(parserResult, bytes) {
  const provided = parserResult.normalizedText;
  if (provided !== undefined) {
    return provided;
  }
  const isSourceText = SOURCE_TEXT_PARSER_IDS.has(parserResult.parser.parserId);
  if (!isSourceText || parserResult.units.length === 0) {
    return undefined;
  }
  return decodeUtf8ForStorage(bytes);
}
668
/**
 * Returns the parser result with `normalizedText` filled in when a text to
 * persist is available; otherwise returns the same result object untouched.
 *
 * @param {object} parserResult - Parser result to extend.
 * @param {Uint8Array} bytes - Raw document bytes.
 * @returns {object} The original result or a shallow copy with `normalizedText`.
 */
function withPersistedNormalizedText(parserResult, bytes) {
  const normalizedText = normalizedTextForPersistence(parserResult, bytes);
  if (normalizedText === undefined) {
    return parserResult;
  }
  return { ...parserResult, normalizedText };
}
672
/**
 * Tells whether a parser adapter offers an asynchronous entry point.
 *
 * @param {object} adapter - Parser adapter.
 * @returns {boolean} `true` when `adapter.parseAsync` is a function.
 */
function hasAsyncParse(adapter) {
  const { parseAsync } = adapter;
  return typeof parseAsync === "function";
}
675
/**
 * Resolves a parser adapter for the document and runs it.
 *
 * The adapter comes from `deps.parserRegistry.resolve`; when the resolution is
 * not "matched", `unsupportedParser` is used. `parseAsync` is preferred over
 * `parse` when the adapter provides it.
 *
 * @param {object} deps - Dependencies exposing `parserRegistry.resolve`.
 * @param {*} documentId - Identifier of the document being parsed.
 * @param {object} params - Extraction params (`file.relativePath`).
 * @param {Uint8Array} bytes - Raw document bytes.
 * @param {object} options - Parser options forwarded to the adapter.
 * @returns {Promise<object>} The adapter's parser result.
 */
async function runParser(deps, documentId, params, bytes, options) {
  const input = selectionInput(documentId, params.file.relativePath, bytes);
  const resolution = deps.parserRegistry.resolve(input);
  let adapter = unsupportedParser;
  if (resolution.kind === "matched") {
    adapter = resolution.adapter;
  }
  return hasAsyncParse(adapter)
    ? adapter.parseAsync(input, options)
    : adapter.parse(input, options);
}
684
/**
 * Runs the parser and attaches the normalized text that should be persisted
 * with the result, when one is available.
 *
 * @param {object} deps - Dependencies forwarded to `runParser`.
 * @param {*} documentId - Identifier of the document being parsed.
 * @param {object} params - Extraction params.
 * @param {Uint8Array} bytes - Raw document bytes.
 * @param {object} options - Parser options.
 * @returns {Promise<object>} The parser result, possibly extended with `normalizedText`.
 */
async function runParserForPersistence(deps, documentId, params, bytes, options) {
  const parsed = await runParser(deps, documentId, params, bytes, options);
  const persistable = withPersistedNormalizedText(parsed, bytes);
  return persistable;
}
688
/**
 * Persists a document together with its dependent rows, passing the store's
 * clock function along to `persistDocumentAndDependents`.
 *
 * @param {object} deps - Dependencies exposing `store._internal.now`.
 * @param {object} params - Extraction params.
 * @param {*} documentId - Identifier of the document.
 * @param {object} document - Document record to persist.
 * @param {object} parserResult - Parser result whose content is persisted.
 * @returns {void}
 */
function persistExtractedDocument(deps, params, documentId, document, parserResult) {
  const clock = deps.store._internal.now;
  persistDocumentAndDependents(deps, params, documentId, document, parserResult, clock);
}
691
/**
 * Reads the document bytes under the `options.maxBytes` limit.
 *
 * One byte more than the limit is requested so that a file larger than the
 * limit can be recognised from the size of the returned buffer.
 *
 * @param {object} deps - Dependencies forwarded to the read and failure helpers.
 * @param {object} params - Extraction params.
 * @param {*} documentId - Identifier of the document.
 * @param {object} target - Resolved read target.
 * @param {object} options - Parser options carrying `maxBytes`.
 * @returns {Promise<Uint8Array|object>} The bytes, or a failure extraction result.
 */
async function readBoundedDocumentBytes(deps, params, documentId, target, options) {
  const probeLimit = options.maxBytes + 1;
  const payload = await readBytes(deps, params, target, probeLimit);
  if (payload instanceof Uint8Array) {
    return payload.byteLength > options.maxBytes
      ? buildOversizedFailure(deps, params, documentId, options, payload.byteLength)
      : payload;
  }
  return buildFailureResult(deps, params, documentId, payload);
}
701
/**
 * Builds the extraction result returned after a document has been parsed and
 * persisted.
 *
 * @param {object} params - Extraction params (`capsuleId`, `source.id`, `file.relativePath`).
 * @param {object} document - Persisted document record.
 * @param {object} parserResult - Parser result supplying the diagnostics.
 * @param {string} status - Document status derived for the result.
 * @returns {object} Result whose outcome is `failed` when the status is
 *   "failed" and a failure diagnostic exists, otherwise `persisted`.
 */
function parserExtractionResult(params, document, parserResult, status) {
  const failureDiagnostic = firstParserFailureDiagnostic(parserResult);
  let outcome;
  if (status === "failed" && failureDiagnostic !== undefined) {
    outcome = parserFailureOutcome(document, failureDiagnostic, params.file.relativePath);
  } else {
    outcome = { kind: "persisted", document };
  }
  return {
    capsuleId: params.capsuleId,
    sourceId: params.source.id,
    relativePath: params.file.relativePath,
    outcome,
    diagnostics: parserResult.diagnostics,
  };
}
713
/**
 * Returns params whose `file.relativePath` equals `relativePath`.
 *
 * The input object is returned unchanged when the path already matches;
 * otherwise a copy with a replaced `file.relativePath` is produced and the
 * input is left unmodified.
 *
 * @param {object} params - Extraction params.
 * @param {string} relativePath - Path to apply.
 * @returns {object} The same params or an updated copy.
 */
function paramsWithRelativePath(params, relativePath) {
  const alreadyMatches = params.file.relativePath === relativePath;
  if (alreadyMatches) {
    return params;
  }
  const file = { ...params.file, relativePath };
  return { ...params, file };
}
718
/**
 * Derives the document id for the file described by `params`.
 *
 * @param {object} params - Extraction params (`capsuleId`, `source.id`, `file.relativePath`).
 * @returns {*} The id produced by `documentIdFor`.
 */
function extractionDocumentId(params) {
  const identity = {
    capsuleId: params.capsuleId,
    sourceId: params.source.id,
    relativePath: params.file.relativePath,
  };
  return documentIdFor(identity);
}
725
/**
 * Converts a failed target resolution into a failure extraction result.
 *
 * When the error names a relative path, that path replaces the requested one
 * before the document id is derived and the failure is built.
 *
 * @param {object} deps - Dependencies forwarded to `buildFailureResult`.
 * @param {object} params - Extraction params.
 * @param {object} resolved - Failed resolution (`error`, `persistFailure`).
 * @returns {*} Whatever `buildFailureResult` returns.
 */
function targetResolutionFailure(deps, params, resolved) {
  let failureParams = params;
  if (resolved.error.relativePath !== undefined) {
    failureParams = paramsWithRelativePath(params, resolved.error.relativePath);
  }
  const documentId = extractionDocumentId(failureParams);
  const failureOptions = { persist: resolved.persistFailure };
  return buildFailureResult(deps, failureParams, documentId, resolved.error, failureOptions);
}
733
/**
 * Parses the document bytes, redacts the parser result, persists the document
 * and returns the extraction result.
 *
 * Any exception thrown by the parser is reported as a `PARSER_FAILED` failure
 * result instead of propagating.
 *
 * @param {object} deps - Extraction dependencies.
 * @param {object} params - Extraction params (`source`, `file.relativePath`).
 * @param {*} documentId - Identifier of the document.
 * @param {Uint8Array} bytes - Raw document bytes.
 * @param {string} contentHash - Hash of `bytes`, stored on the document record.
 * @param {object} options - Parser options.
 * @returns {Promise<object>} The extraction result.
 */
async function parseAndPersistDocument(deps, params, documentId, bytes, contentHash, options) {
  let rawResult;
  try {
    rawResult = await runParserForPersistence(deps, documentId, params, bytes, options);
  } catch {
    const parserError = {
      code: "PARSER_FAILED",
      message: "parser adapter failed while extracting document",
      relativePath: params.file.relativePath,
    };
    return buildFailureResult(deps, params, documentId, parserError);
  }
  // Status and persistence are both derived from the redacted result.
  const redacted = redactParserResult(rawResult, params.source);
  const status = statusForResult(redacted);
  const mediaType = mediaTypeFor(extensionOf(params.file.relativePath));
  const document = buildDocumentRecord({
    documentId,
    params,
    mediaType,
    contentHash,
    parserResult: redacted,
    status,
  });
  persistExtractedDocument(deps, params, documentId, document, redacted);
  return parserExtractionResult(params, document, redacted, status);
}
758
/**
 * Attempts the large-document (progressive) extraction path.
 *
 * @param {object} deps - Extraction dependencies.
 * @param {object} params - Extraction params (`file.sizeBytes`, `file.relativePath`).
 * @param {object} resolved - Resolved read target.
 * @param {*} documentId - Identifier of the document.
 * @param {object} options - Parser options.
 * @param {string} extension - File extension.
 * @param {string} mediaType - Media type derived from the extension.
 * @returns {Promise<object|undefined>} An extraction result when this path
 *   handled the document (including oversize and read failures), or
 *   `undefined` when no progressive extractor applies.
 */
async function progressiveExtractionResult(deps, params, resolved, documentId, options, extension, mediaType) {
  const context = largeDocumentContextFor(deps, resolved, options);
  const { policy } = context;
  const preflight = classifyLargeDocument({ extension, mediaType, sizeBytes: params.file.sizeBytes }, policy);
  if (preflight.decision === "reject-oversized") {
    // The oversize report uses the raw-file limit from the large-document policy.
    const oversizeOptions = { ...options, maxBytes: context.policy.maxRawFileBytes };
    return buildOversizedFailure(deps, params, documentId, oversizeOptions);
  }

  // An extractor is only looked up for files at or above the large-file threshold.
  let extractor;
  if (params.file.sizeBytes >= context.policy.largeFileThresholdBytes) {
    extractor = selectProgressiveExtractor(context, extension, mediaType);
  }
  // NOTE(review): the second guard already covers every case of the first one.
  if (!usesProgressivePath(preflight) && extractor === undefined) {
    return undefined;
  }
  if (extractor === undefined) {
    return undefined;
  }

  const source = progressiveRangeSource(deps, params, resolved);
  const isRangeSource = "totalBytes" in source;
  if (!isRangeSource) {
    return buildFailureResult(deps, params, documentId, source);
  }

  let contentHash;
  try {
    contentHash = await hashProgressiveSource(source);
  } catch {
    const readFailure = {
      code: "READ_FAILED",
      message: "readFileRange failed for selected file",
      relativePath: params.file.relativePath,
    };
    return buildFailureResult(deps, params, documentId, readFailure);
  }

  const unchanged = readUnchangedFastPath(deps, params, documentId, contentHash);
  return (
    unchanged ??
    (await extractDocumentProgressive(deps, params, context, source, contentHash, extractor))
  );
}
792
/**
 * Runs the standard (whole-file) extraction path.
 *
 * Order of work: reject files whose declared size exceeds `options.maxBytes`,
 * read the bytes, hash them, return the unchanged fast-path result when one
 * exists, otherwise parse and persist. Legacy binary office formats get an
 * additional diagnostic appended to the result.
 *
 * @param {object} deps - Extraction dependencies.
 * @param {object} params - Extraction params (`file.sizeBytes`).
 * @param {object} resolved - Resolved read target.
 * @param {*} documentId - Identifier of the document.
 * @param {object} options - Parser options carrying `maxBytes`.
 * @param {string} extension - File extension.
 * @returns {Promise<object>} The extraction result.
 */
async function standardExtractionResult(deps, params, resolved, documentId, options, extension) {
  const declaredTooLarge = params.file.sizeBytes > options.maxBytes;
  if (declaredTooLarge) {
    return buildOversizedFailure(deps, params, documentId, options);
  }

  const payload = await readBoundedDocumentBytes(deps, params, documentId, resolved, options);
  if (!(payload instanceof Uint8Array)) {
    // Anything other than bytes is already a failure result.
    return payload;
  }

  const contentHash = hashBytes(payload);
  const unchanged = readUnchangedFastPath(deps, params, documentId, contentHash);
  if (unchanged !== undefined) {
    return unchanged;
  }

  const parsed = await parseAndPersistDocument(deps, params, documentId, payload, contentHash, options);
  if (isLegacyBinaryOfficeFormat(extension)) {
    return appendLegacyDiagnostic(deps, params, documentId, extension, parsed);
  }
  return parsed;
}
808
/**
 * Extracts a single document: resolves and validates the target path, then
 * tries the progressive path and falls back to the standard path.
 *
 * @param {object} deps - Extraction dependencies.
 * @param {object} params - Extraction params (`source`, `file`, `capsuleId`,
 *   optional `parserOptions`).
 * @returns {Promise<object>} The extraction result, including failure results
 *   for targets that could not be resolved.
 */
export async function extractDocument(deps, params) {
  const resolved = resolveTargetPath(deps, params);
  if ("error" in resolved) {
    return targetResolutionFailure(deps, params, resolved);
  }

  // From here on the sanitised relative path is the one used everywhere.
  const canonicalParams = paramsWithRelativePath(params, resolved.relativePath);
  const documentId = extractionDocumentId(canonicalParams);
  const options = canonicalParams.parserOptions ?? buildParserOptions();
  const extension = extensionOf(canonicalParams.file.relativePath);
  const mediaType = mediaTypeFor(extension);

  const progressive = await progressiveExtractionResult(
    deps,
    canonicalParams,
    resolved,
    documentId,
    options,
    extension,
    mediaType,
  );
  return progressive === undefined
    ? standardExtractionResult(deps, canonicalParams, resolved, documentId, options, extension)
    : progressive;
}
823
/**
 * Records an externally detected extraction failure for the file described
 * by `params`.
 *
 * @param {object} deps - Extraction dependencies.
 * @param {object} params - Extraction params plus the `error` to record.
 * @returns {*} Whatever `buildFailureResult` returns.
 */
export function recordExtractionFailure(deps, params) {
  // extractionDocumentId derives the id from capsuleId, source.id and file.relativePath.
  const documentId = extractionDocumentId(params);
  return buildFailureResult(deps, params, documentId, params.error);
}
831
/**
 * Builds the `failed` document record stored for an oversized file.
 *
 * @param {object} params - Extraction params (`capsuleId`, `source.id`, `file`).
 * @param {*} documentId - Identifier of the document.
 * @param {*} lastExtractedAt - Timestamp stored on the record.
 * @param {number} [observedSizeBytes] - Size seen while reading; when given,
 *   the larger of it and the declared size is stored.
 * @returns {object} Document record with an empty content hash and the
 *   placeholder parser `none`/`0`.
 */
function oversizedDocumentRecord(params, documentId, lastExtractedAt, observedSizeBytes) {
  const { file } = params;
  let sizeBytes = file.sizeBytes;
  if (observedSizeBytes !== undefined) {
    sizeBytes = Math.max(file.sizeBytes, observedSizeBytes);
  }
  const parser = { parserId: "none", parserVersion: "0" };
  return {
    id: documentId,
    capsuleId: params.capsuleId,
    sourceId: params.source.id,
    documentPath: file.relativePath,
    sizeBytes,
    mediaType: mediaTypeFor(extensionOf(file.relativePath)),
    contentHash: "",
    parser,
    lastExtractedAt,
    status: "failed",
    safeDisplayName: safeDisplay(file.relativePath),
  };
}
849
/**
 * Persists the failed document row for an oversized file, clears its
 * dependent rows and stores the oversize diagnostic, all in one transaction.
 *
 * On any failure the transaction is rolled back and the original error is
 * rethrown. A failure of the rollback itself is ignored so that it cannot
 * replace the error that caused it.
 *
 * @param {object} deps - Dependencies exposing `store._internal.db`.
 * @param {object} params - Extraction params (`capsuleId`, `source.id`).
 * @param {*} documentId - Identifier of the document row.
 * @param {object} document - Document record to store.
 * @param {object} diagnostic - Diagnostic stored under `<documentId>#d0`.
 * @param {Function} now - Clock used for the diagnostic's `createdAt`.
 * @returns {void}
 * @throws Rethrows whatever the inserts, the delete or the COMMIT threw.
 */
function persistOversizedRow(deps, params, documentId, document, diagnostic, now) {
  const db = deps.store._internal.db;
  db.exec("BEGIN");
  try {
    insertDocumentRow(db, {
      id: documentId,
      capsuleId: params.capsuleId,
      sourceId: String(params.source.id),
      documentPath: document.documentPath,
      sizeBytes: document.sizeBytes,
      mediaType: document.mediaType,
      contentHash: document.contentHash,
      parserId: document.parser.parserId,
      parserVersion: document.parser.parserVersion,
      lastExtractedAt: document.lastExtractedAt,
      status: document.status,
      safeDisplayName: document.safeDisplayName,
    });
    deleteDependentRows(db, params.capsuleId, documentId);
    insertDiagnosticRow(db, {
      id: `${String(documentId)}#d0`,
      capsuleId: params.capsuleId,
      diagnostic,
      createdAt: now(),
    });
    db.exec("COMMIT");
  } catch (cause) {
    try {
      db.exec("ROLLBACK");
    } catch {
      // The transaction may already be gone (e.g. ended by the database after
      // the failure); the original cause is the error worth reporting.
    }
    throw cause;
  }
}
881
/**
 * Records and reports an `OVERSIZED_FILE` failure for a document.
 *
 * @param {object} deps - Dependencies exposing `store._internal.now`.
 * @param {object} params - Extraction params (`capsuleId`, `source`, `file`).
 * @param {*} documentId - Identifier of the document.
 * @param {object} options - Options whose `maxBytes` is quoted in the message.
 * @param {number} [observedSizeBytes] - Size seen while reading; when given,
 *   the larger of it and the declared size is reported.
 * @returns {object} A failed extraction result carrying the oversize diagnostic.
 */
function buildOversizedFailure(deps, params, documentId, options, observedSizeBytes) {
  const now = deps.store._internal.now;
  let reportedSize = params.file.sizeBytes;
  if (observedSizeBytes !== undefined) {
    reportedSize = Math.max(params.file.sizeBytes, observedSizeBytes);
  }
  const rawMessage = `file size ${String(reportedSize)} exceeds maxBytes=${String(options.maxBytes)}`;
  const message = redactMessage(rawMessage, params.source);
  const diagnostic = {
    severity: "error",
    code: "OVERSIZED_FILE",
    message,
    documentId,
  };
  const document = oversizedDocumentRecord(params, documentId, now(), observedSizeBytes);
  persistOversizedRow(deps, params, documentId, document, diagnostic, now);
  const outcome = {
    kind: "failed",
    document,
    error: { code: "OVERSIZED_FILE", message },
  };
  return {
    capsuleId: params.capsuleId,
    sourceId: params.source.id,
    relativePath: params.file.relativePath,
    outcome,
    diagnostics: [diagnostic],
  };
}