@oscharko-dev/keiko-local-knowledge 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (290) hide show
  1. package/dist/.tsbuildinfo +1 -0
  2. package/dist/bounded-document-extraction.d.ts +27 -0
  3. package/dist/bounded-document-extraction.d.ts.map +1 -0
  4. package/dist/bounded-document-extraction.js +214 -0
  5. package/dist/capsule-lifecycle.d.ts +33 -0
  6. package/dist/capsule-lifecycle.d.ts.map +1 -0
  7. package/dist/capsule-lifecycle.js +292 -0
  8. package/dist/capsule-set-lifecycle.d.ts +15 -0
  9. package/dist/capsule-set-lifecycle.d.ts.map +1 -0
  10. package/dist/capsule-set-lifecycle.js +158 -0
  11. package/dist/chunking/chunker-persist.d.ts +36 -0
  12. package/dist/chunking/chunker-persist.d.ts.map +1 -0
  13. package/dist/chunking/chunker-persist.js +74 -0
  14. package/dist/chunking/chunker-runner.d.ts +9 -0
  15. package/dist/chunking/chunker-runner.d.ts.map +1 -0
  16. package/dist/chunking/chunker-runner.js +218 -0
  17. package/dist/chunking/chunker.d.ts +7 -0
  18. package/dist/chunking/chunker.d.ts.map +1 -0
  19. package/dist/chunking/chunker.js +139 -0
  20. package/dist/chunking/citation-mapper.d.ts +4 -0
  21. package/dist/chunking/citation-mapper.d.ts.map +1 -0
  22. package/dist/chunking/citation-mapper.js +180 -0
  23. package/dist/chunking/index.d.ts +6 -0
  24. package/dist/chunking/index.d.ts.map +1 -0
  25. package/dist/chunking/index.js +8 -0
  26. package/dist/chunking/token-estimator.d.ts +3 -0
  27. package/dist/chunking/token-estimator.d.ts.map +1 -0
  28. package/dist/chunking/token-estimator.js +26 -0
  29. package/dist/chunking/types.d.ts +49 -0
  30. package/dist/chunking/types.d.ts.map +1 -0
  31. package/dist/chunking/types.js +26 -0
  32. package/dist/composition.d.ts +57 -0
  33. package/dist/composition.d.ts.map +1 -0
  34. package/dist/composition.js +310 -0
  35. package/dist/conversation/citation-attacher.d.ts +8 -0
  36. package/dist/conversation/citation-attacher.d.ts.map +1 -0
  37. package/dist/conversation/citation-attacher.js +55 -0
  38. package/dist/conversation/citation-excerpts.d.ts +4 -0
  39. package/dist/conversation/citation-excerpts.d.ts.map +1 -0
  40. package/dist/conversation/citation-excerpts.js +41 -0
  41. package/dist/conversation/grounded-answer-runner.d.ts +9 -0
  42. package/dist/conversation/grounded-answer-runner.d.ts.map +1 -0
  43. package/dist/conversation/grounded-answer-runner.js +61 -0
  44. package/dist/conversation/index.d.ts +5 -0
  45. package/dist/conversation/index.d.ts.map +1 -0
  46. package/dist/conversation/index.js +7 -0
  47. package/dist/conversation/model-gateway-answer-generator.d.ts +28 -0
  48. package/dist/conversation/model-gateway-answer-generator.d.ts.map +1 -0
  49. package/dist/conversation/model-gateway-answer-generator.js +105 -0
  50. package/dist/conversation/types.d.ts +35 -0
  51. package/dist/conversation/types.d.ts.map +1 -0
  52. package/dist/conversation/types.js +24 -0
  53. package/dist/discovery/discovery-runner.d.ts +23 -0
  54. package/dist/discovery/discovery-runner.d.ts.map +1 -0
  55. package/dist/discovery/discovery-runner.js +109 -0
  56. package/dist/discovery/extract-progressive.d.ts +17 -0
  57. package/dist/discovery/extract-progressive.d.ts.map +1 -0
  58. package/dist/discovery/extract-progressive.js +522 -0
  59. package/dist/discovery/extract.d.ts +26 -0
  60. package/dist/discovery/extract.d.ts.map +1 -0
  61. package/dist/discovery/extract.js +906 -0
  62. package/dist/discovery/glob.d.ts +10 -0
  63. package/dist/discovery/glob.d.ts.map +1 -0
  64. package/dist/discovery/glob.js +72 -0
  65. package/dist/discovery/index.d.ts +6 -0
  66. package/dist/discovery/index.d.ts.map +1 -0
  67. package/dist/discovery/index.js +8 -0
  68. package/dist/discovery/media-type.d.ts +4 -0
  69. package/dist/discovery/media-type.d.ts.map +1 -0
  70. package/dist/discovery/media-type.js +62 -0
  71. package/dist/discovery/persist.d.ts +63 -0
  72. package/dist/discovery/persist.d.ts.map +1 -0
  73. package/dist/discovery/persist.js +345 -0
  74. package/dist/discovery/test-support.d.ts +16 -0
  75. package/dist/discovery/test-support.d.ts.map +1 -0
  76. package/dist/discovery/test-support.js +127 -0
  77. package/dist/discovery/types.d.ts +63 -0
  78. package/dist/discovery/types.d.ts.map +1 -0
  79. package/dist/discovery/types.js +28 -0
  80. package/dist/discovery/walk.d.ts +12 -0
  81. package/dist/discovery/walk.d.ts.map +1 -0
  82. package/dist/discovery/walk.js +302 -0
  83. package/dist/errors.d.ts +13 -0
  84. package/dist/errors.d.ts.map +1 -0
  85. package/dist/errors.js +22 -0
  86. package/dist/evaluations/dimensions.d.ts +14 -0
  87. package/dist/evaluations/dimensions.d.ts.map +1 -0
  88. package/dist/evaluations/dimensions.js +191 -0
  89. package/dist/evaluations/fixtures.d.ts +18 -0
  90. package/dist/evaluations/fixtures.d.ts.map +1 -0
  91. package/dist/evaluations/fixtures.js +858 -0
  92. package/dist/evaluations/index.d.ts +7 -0
  93. package/dist/evaluations/index.d.ts.map +1 -0
  94. package/dist/evaluations/index.js +10 -0
  95. package/dist/evaluations/report.d.ts +3 -0
  96. package/dist/evaluations/report.d.ts.map +1 -0
  97. package/dist/evaluations/report.js +31 -0
  98. package/dist/evaluations/runner-seed.d.ts +12 -0
  99. package/dist/evaluations/runner-seed.d.ts.map +1 -0
  100. package/dist/evaluations/runner-seed.js +175 -0
  101. package/dist/evaluations/runner.d.ts +8 -0
  102. package/dist/evaluations/runner.d.ts.map +1 -0
  103. package/dist/evaluations/runner.js +205 -0
  104. package/dist/evaluations/scripted-embedding-adapter.d.ts +13 -0
  105. package/dist/evaluations/scripted-embedding-adapter.d.ts.map +1 -0
  106. package/dist/evaluations/scripted-embedding-adapter.js +163 -0
  107. package/dist/evaluations/types.d.ts +116 -0
  108. package/dist/evaluations/types.d.ts.map +1 -0
  109. package/dist/evaluations/types.js +27 -0
  110. package/dist/index.d.ts +23 -0
  111. package/dist/index.d.ts.map +1 -0
  112. package/dist/index.js +41 -0
  113. package/dist/indexing/bounded-indexing.d.ts +41 -0
  114. package/dist/indexing/bounded-indexing.d.ts.map +1 -0
  115. package/dist/indexing/bounded-indexing.js +240 -0
  116. package/dist/indexing/checkpoint-persist.d.ts +8 -0
  117. package/dist/indexing/checkpoint-persist.d.ts.map +1 -0
  118. package/dist/indexing/checkpoint-persist.js +135 -0
  119. package/dist/indexing/checkpoint-resume.d.ts +20 -0
  120. package/dist/indexing/checkpoint-resume.d.ts.map +1 -0
  121. package/dist/indexing/checkpoint-resume.js +50 -0
  122. package/dist/indexing/embedding-batcher.d.ts +3 -0
  123. package/dist/indexing/embedding-batcher.d.ts.map +1 -0
  124. package/dist/indexing/embedding-batcher.js +390 -0
  125. package/dist/indexing/index.d.ts +7 -0
  126. package/dist/indexing/index.d.ts.map +1 -0
  127. package/dist/indexing/index.js +11 -0
  128. package/dist/indexing/job-persist.d.ts +46 -0
  129. package/dist/indexing/job-persist.d.ts.map +1 -0
  130. package/dist/indexing/job-persist.js +157 -0
  131. package/dist/indexing/job-resume.d.ts +4 -0
  132. package/dist/indexing/job-resume.d.ts.map +1 -0
  133. package/dist/indexing/job-resume.js +14 -0
  134. package/dist/indexing/orchestrator.d.ts +3 -0
  135. package/dist/indexing/orchestrator.d.ts.map +1 -0
  136. package/dist/indexing/orchestrator.js +1151 -0
  137. package/dist/indexing/types.d.ts +156 -0
  138. package/dist/indexing/types.d.ts.map +1 -0
  139. package/dist/indexing/types.js +30 -0
  140. package/dist/indexing/vector-persist.d.ts +32 -0
  141. package/dist/indexing/vector-persist.d.ts.map +1 -0
  142. package/dist/indexing/vector-persist.js +105 -0
  143. package/dist/parsers/_internal.d.ts +20 -0
  144. package/dist/parsers/_internal.d.ts.map +1 -0
  145. package/dist/parsers/_internal.js +122 -0
  146. package/dist/parsers/csv-parser.d.ts +3 -0
  147. package/dist/parsers/csv-parser.d.ts.map +1 -0
  148. package/dist/parsers/csv-parser.js +202 -0
  149. package/dist/parsers/docx-parser.d.ts +3 -0
  150. package/dist/parsers/docx-parser.d.ts.map +1 -0
  151. package/dist/parsers/docx-parser.js +390 -0
  152. package/dist/parsers/html-parser.d.ts +3 -0
  153. package/dist/parsers/html-parser.d.ts.map +1 -0
  154. package/dist/parsers/html-parser.js +310 -0
  155. package/dist/parsers/index.d.ts +15 -0
  156. package/dist/parsers/index.d.ts.map +1 -0
  157. package/dist/parsers/index.js +41 -0
  158. package/dist/parsers/json-parser.d.ts +3 -0
  159. package/dist/parsers/json-parser.d.ts.map +1 -0
  160. package/dist/parsers/json-parser.js +192 -0
  161. package/dist/parsers/large-document/capability-discovery.d.ts +27 -0
  162. package/dist/parsers/large-document/capability-discovery.d.ts.map +1 -0
  163. package/dist/parsers/large-document/capability-discovery.js +76 -0
  164. package/dist/parsers/large-document/diagnostics.d.ts +3 -0
  165. package/dist/parsers/large-document/diagnostics.d.ts.map +1 -0
  166. package/dist/parsers/large-document/diagnostics.js +11 -0
  167. package/dist/parsers/large-document/index.d.ts +15 -0
  168. package/dist/parsers/large-document/index.d.ts.map +1 -0
  169. package/dist/parsers/large-document/index.js +10 -0
  170. package/dist/parsers/large-document/legacy-format.d.ts +5 -0
  171. package/dist/parsers/large-document/legacy-format.d.ts.map +1 -0
  172. package/dist/parsers/large-document/legacy-format.js +25 -0
  173. package/dist/parsers/large-document/preflight.d.ts +9 -0
  174. package/dist/parsers/large-document/preflight.d.ts.map +1 -0
  175. package/dist/parsers/large-document/preflight.js +43 -0
  176. package/dist/parsers/large-document/progressive-extraction.d.ts +55 -0
  177. package/dist/parsers/large-document/progressive-extraction.d.ts.map +1 -0
  178. package/dist/parsers/large-document/progressive-extraction.js +123 -0
  179. package/dist/parsers/large-document/progressive-pdf.d.ts +20 -0
  180. package/dist/parsers/large-document/progressive-pdf.d.ts.map +1 -0
  181. package/dist/parsers/large-document/progressive-pdf.js +145 -0
  182. package/dist/parsers/large-document/synthetic-source.d.ts +9 -0
  183. package/dist/parsers/large-document/synthetic-source.d.ts.map +1 -0
  184. package/dist/parsers/large-document/synthetic-source.js +101 -0
  185. package/dist/parsers/large-document/window-builder.d.ts +24 -0
  186. package/dist/parsers/large-document/window-builder.d.ts.map +1 -0
  187. package/dist/parsers/large-document/window-builder.js +75 -0
  188. package/dist/parsers/ocr/index.d.ts +4 -0
  189. package/dist/parsers/ocr/index.d.ts.map +1 -0
  190. package/dist/parsers/ocr/index.js +4 -0
  191. package/dist/parsers/ocr/null-ocr-adapter.d.ts +3 -0
  192. package/dist/parsers/ocr/null-ocr-adapter.d.ts.map +1 -0
  193. package/dist/parsers/ocr/null-ocr-adapter.js +14 -0
  194. package/dist/parsers/ocr/ocr-pipeline-parser.d.ts +8 -0
  195. package/dist/parsers/ocr/ocr-pipeline-parser.d.ts.map +1 -0
  196. package/dist/parsers/ocr/ocr-pipeline-parser.js +147 -0
  197. package/dist/parsers/ocr/types.d.ts +16 -0
  198. package/dist/parsers/ocr/types.d.ts.map +1 -0
  199. package/dist/parsers/ocr/types.js +4 -0
  200. package/dist/parsers/parser-test-fixtures.d.ts +28 -0
  201. package/dist/parsers/parser-test-fixtures.d.ts.map +1 -0
  202. package/dist/parsers/parser-test-fixtures.js +139 -0
  203. package/dist/parsers/pdf-parser.d.ts +43 -0
  204. package/dist/parsers/pdf-parser.d.ts.map +1 -0
  205. package/dist/parsers/pdf-parser.js +388 -0
  206. package/dist/parsers/registry.d.ts +8 -0
  207. package/dist/parsers/registry.d.ts.map +1 -0
  208. package/dist/parsers/registry.js +57 -0
  209. package/dist/parsers/text-parser.d.ts +3 -0
  210. package/dist/parsers/text-parser.d.ts.map +1 -0
  211. package/dist/parsers/text-parser.js +214 -0
  212. package/dist/parsers/types.d.ts +53 -0
  213. package/dist/parsers/types.d.ts.map +1 -0
  214. package/dist/parsers/types.js +21 -0
  215. package/dist/parsers/unsupported-parser.d.ts +4 -0
  216. package/dist/parsers/unsupported-parser.d.ts.map +1 -0
  217. package/dist/parsers/unsupported-parser.js +97 -0
  218. package/dist/parsers/xlsx-parser.d.ts +3 -0
  219. package/dist/parsers/xlsx-parser.d.ts.map +1 -0
  220. package/dist/parsers/xlsx-parser.js +425 -0
  221. package/dist/privacy/audit-emitter.d.ts +5 -0
  222. package/dist/privacy/audit-emitter.d.ts.map +1 -0
  223. package/dist/privacy/audit-emitter.js +93 -0
  224. package/dist/privacy/diagnostic-redactor.d.ts +2 -0
  225. package/dist/privacy/diagnostic-redactor.d.ts.map +1 -0
  226. package/dist/privacy/diagnostic-redactor.js +153 -0
  227. package/dist/privacy/index.d.ts +5 -0
  228. package/dist/privacy/index.d.ts.map +1 -0
  229. package/dist/privacy/index.js +6 -0
  230. package/dist/privacy/retention-applier.d.ts +5 -0
  231. package/dist/privacy/retention-applier.d.ts.map +1 -0
  232. package/dist/privacy/retention-applier.js +88 -0
  233. package/dist/privacy/types.d.ts +98 -0
  234. package/dist/privacy/types.d.ts.map +1 -0
  235. package/dist/privacy/types.js +12 -0
  236. package/dist/qualityIntelligence/capsuleCorpus.d.ts +27 -0
  237. package/dist/qualityIntelligence/capsuleCorpus.d.ts.map +1 -0
  238. package/dist/qualityIntelligence/capsuleCorpus.js +58 -0
  239. package/dist/qualityIntelligence/index.d.ts +3 -0
  240. package/dist/qualityIntelligence/index.d.ts.map +1 -0
  241. package/dist/qualityIntelligence/index.js +5 -0
  242. package/dist/qualityIntelligence/qiHandoff.d.ts +36 -0
  243. package/dist/qualityIntelligence/qiHandoff.d.ts.map +1 -0
  244. package/dist/qualityIntelligence/qiHandoff.js +82 -0
  245. package/dist/retrieval/answer-grounding.d.ts +9 -0
  246. package/dist/retrieval/answer-grounding.d.ts.map +1 -0
  247. package/dist/retrieval/answer-grounding.js +31 -0
  248. package/dist/retrieval/context-pack-assembler.d.ts +24 -0
  249. package/dist/retrieval/context-pack-assembler.d.ts.map +1 -0
  250. package/dist/retrieval/context-pack-assembler.js +50 -0
  251. package/dist/retrieval/index.d.ts +6 -0
  252. package/dist/retrieval/index.d.ts.map +1 -0
  253. package/dist/retrieval/index.js +9 -0
  254. package/dist/retrieval/retrieval-runner.d.ts +10 -0
  255. package/dist/retrieval/retrieval-runner.d.ts.map +1 -0
  256. package/dist/retrieval/retrieval-runner.js +163 -0
  257. package/dist/retrieval/scoped-vector-search.d.ts +24 -0
  258. package/dist/retrieval/scoped-vector-search.d.ts.map +1 -0
  259. package/dist/retrieval/scoped-vector-search.js +864 -0
  260. package/dist/retrieval/types.d.ts +28 -0
  261. package/dist/retrieval/types.d.ts.map +1 -0
  262. package/dist/retrieval/types.js +33 -0
  263. package/dist/section-path-hash.d.ts +3 -0
  264. package/dist/section-path-hash.d.ts.map +1 -0
  265. package/dist/section-path-hash.js +9 -0
  266. package/dist/source-lifecycle.d.ts +14 -0
  267. package/dist/source-lifecycle.d.ts.map +1 -0
  268. package/dist/source-lifecycle.js +155 -0
  269. package/dist/source-routing-validation.d.ts +11 -0
  270. package/dist/source-routing-validation.d.ts.map +1 -0
  271. package/dist/source-routing-validation.js +140 -0
  272. package/dist/store-content-cipher.d.ts +11 -0
  273. package/dist/store-content-cipher.d.ts.map +1 -0
  274. package/dist/store-content-cipher.js +67 -0
  275. package/dist/store-content-encryption.d.ts +12 -0
  276. package/dist/store-content-encryption.d.ts.map +1 -0
  277. package/dist/store-content-encryption.js +275 -0
  278. package/dist/store-paths.d.ts +6 -0
  279. package/dist/store-paths.d.ts.map +1 -0
  280. package/dist/store-paths.js +61 -0
  281. package/dist/store.d.ts +30 -0
  282. package/dist/store.d.ts.map +1 -0
  283. package/dist/store.js +219 -0
  284. package/dist/testing.d.ts +47 -0
  285. package/dist/testing.d.ts.map +1 -0
  286. package/dist/testing.js +170 -0
  287. package/dist/version.d.ts +2 -0
  288. package/dist/version.d.ts.map +1 -0
  289. package/dist/version.js +4 -0
  290. package/package.json +43 -0
@@ -0,0 +1,135 @@
1
+ // Prepared-statement helpers for the `extraction_checkpoints` table (Epic #1160, Issue #1286).
2
+ //
3
+ // One row per (capsule_id, document_id). The progressive large-document path upserts the row as a
4
+ // document advances through extraction → chunking → embedding so an interrupted job resumes from
5
+ // durable progress. The row is content-free: hashes, cursors, counts, and redacted diagnostics —
6
+ // never raw extracted text. The compatibility fingerprint columns let a resumed run refuse a
7
+ // checkpoint produced under an incompatible source, parser, policy, chunking strategy, or
8
+ // embedding identity.
9
// SQL text for the `extraction_checkpoints` helpers. Every statement uses named parameters
// (`:name`), so values are bound by the driver and never spliced into the SQL text.
//
// The upsert is assembled from three fragment lists (column list, value placeholders, conflict
// handler) that are concatenated in order and joined with a single space.
const UPSERT_COLUMN_FRAGMENTS = [
    "INSERT INTO extraction_checkpoints (",
    " capsule_id, document_id, job_id, strategy, phase, page_cursor, section_cursor,",
    " object_cursor, extracted_text_bytes, chunk_cursor, embedded_chunk_cursor,",
    " last_embedded_chunk_id, retry_count, coverage, source_content_hash, parser_version,",
    " policy_fingerprint, chunking_strategy_version, embedding_identity_json,",
    " terminal_diagnostics_json, created_at, updated_at",
];
const UPSERT_VALUE_FRAGMENTS = [
    ") VALUES (",
    " :capsule_id, :document_id, :job_id, :strategy, :phase, :page_cursor, :section_cursor,",
    " :object_cursor, :extracted_text_bytes, :chunk_cursor, :embedded_chunk_cursor,",
    " :last_embedded_chunk_id, :retry_count, :coverage, :source_content_hash, :parser_version,",
    " :policy_fingerprint, :chunking_strategy_version, :embedding_identity_json,",
    " :terminal_diagnostics_json, :created_at, :updated_at",
];
// On a (capsule_id, document_id) conflict every column except the key pair and `created_at`
// is overwritten from the incoming row (`excluded.*`).
const UPSERT_CONFLICT_FRAGMENTS = [
    ") ON CONFLICT(capsule_id, document_id) DO UPDATE SET",
    " job_id = excluded.job_id,",
    " strategy = excluded.strategy,",
    " phase = excluded.phase,",
    " page_cursor = excluded.page_cursor,",
    " section_cursor = excluded.section_cursor,",
    " object_cursor = excluded.object_cursor,",
    " extracted_text_bytes = excluded.extracted_text_bytes,",
    " chunk_cursor = excluded.chunk_cursor,",
    " embedded_chunk_cursor = excluded.embedded_chunk_cursor,",
    " last_embedded_chunk_id = excluded.last_embedded_chunk_id,",
    " retry_count = excluded.retry_count,",
    " coverage = excluded.coverage,",
    " source_content_hash = excluded.source_content_hash,",
    " parser_version = excluded.parser_version,",
    " policy_fingerprint = excluded.policy_fingerprint,",
    " chunking_strategy_version = excluded.chunking_strategy_version,",
    " embedding_identity_json = excluded.embedding_identity_json,",
    " terminal_diagnostics_json = excluded.terminal_diagnostics_json,",
    " updated_at = excluded.updated_at",
];
const UPSERT_SQL = [
    ...UPSERT_COLUMN_FRAGMENTS,
    ...UPSERT_VALUE_FRAGMENTS,
    ...UPSERT_CONFLICT_FRAGMENTS,
].join(" ");
// Reads the single checkpoint row of one (capsule, document) pair.
const SELECT_ONE_SQL = "SELECT * FROM extraction_checkpoints WHERE capsule_id = :c AND document_id = :d";
// Reads every checkpoint of a capsule, most recently updated first; ties break on document_id.
const SELECT_BY_CAPSULE_SQL = "SELECT * FROM extraction_checkpoints WHERE capsule_id = :c ORDER BY updated_at DESC, document_id ASC";
// Deletes the checkpoint row of one (capsule, document) pair.
const DELETE_ONE_SQL = [
    "DELETE FROM extraction_checkpoints",
    "WHERE capsule_id = :c AND document_id = :d",
].join(" ");
53
// Encodes a checkpoint's terminal diagnostics as the JSON text stored in the
// `terminal_diagnostics_json` column. The value is passed to `JSON.stringify` as-is.
function diagnosticsToJson(diagnostics) {
    const encoded = JSON.stringify(diagnostics);
    return encoded;
}
56
// Inserts or updates the durable checkpoint row for one (capsule, document) pair.
//
// Parameters:
//   db         - database handle exposing `prepare(sql)`; the prepared statement must accept an
//                object of named parameters through `run(params)`.
//   checkpoint - checkpoint record in camelCase form. Capsule and document ids are stringified;
//                the fingerprint's embedding identity and the terminal diagnostics are stored
//                as JSON text.
//
// Returns nothing. Errors raised by the driver propagate unchanged.
export function upsertExtractionCheckpoint(db, checkpoint) {
    const { fingerprint } = checkpoint;
    // `== null` matches both `undefined` and `null`. A strict `=== undefined` check would let a
    // `null` id fall through to `String(null)` and persist the literal text "null", which a
    // later read would then report as a real chunk id.
    const lastEmbeddedChunkId = checkpoint.lastEmbeddedChunkId == null
        ? null
        : String(checkpoint.lastEmbeddedChunkId);
    db.prepare(UPSERT_SQL).run({
        capsule_id: String(checkpoint.capsuleId),
        document_id: String(checkpoint.documentId),
        job_id: checkpoint.jobId,
        strategy: checkpoint.strategy,
        phase: checkpoint.phase,
        page_cursor: checkpoint.pageCursor,
        section_cursor: checkpoint.sectionCursor,
        object_cursor: checkpoint.objectCursor,
        extracted_text_bytes: checkpoint.extractedTextBytes,
        chunk_cursor: checkpoint.chunkCursor,
        embedded_chunk_cursor: checkpoint.embeddedChunkCursor,
        last_embedded_chunk_id: lastEmbeddedChunkId,
        retry_count: checkpoint.retryCount,
        coverage: checkpoint.coverage,
        source_content_hash: fingerprint.sourceContentHash,
        parser_version: fingerprint.parserVersion,
        policy_fingerprint: fingerprint.policyFingerprint,
        chunking_strategy_version: fingerprint.chunkingStrategyVersion,
        embedding_identity_json: JSON.stringify(fingerprint.embeddingIdentity),
        terminal_diagnostics_json: diagnosticsToJson(checkpoint.terminalDiagnostics),
        created_at: checkpoint.createdAt,
        updated_at: checkpoint.updatedAt,
    });
}
82
// Decodes the `terminal_diagnostics_json` column. Best-effort by design: text that is not valid
// JSON, or that decodes to anything other than an array, yields an empty list instead of an error.
function parseDiagnostics(json) {
    let decoded;
    try {
        decoded = JSON.parse(json);
    }
    catch {
        return [];
    }
    if (!Array.isArray(decoded)) {
        return [];
    }
    return decoded;
}
91
// Decodes the `embedding_identity_json` column. Unlike `parseDiagnostics`, malformed JSON is not
// swallowed here: the `SyntaxError` thrown by `JSON.parse` propagates to the caller.
function parseEmbeddingIdentity(json) {
    return JSON.parse(json);
}
95
// Maps one snake_case `extraction_checkpoints` row onto the camelCase checkpoint record.
// The four scalar fingerprint columns plus the decoded embedding identity are grouped under
// `fingerprint`; the diagnostics column is decoded into `terminalDiagnostics`.
function rowToCheckpoint(row) {
    // A NULL `last_embedded_chunk_id` is represented by omitting the key, not by a null value.
    const optionalLastEmbeddedChunkId = row.last_embedded_chunk_id === null
        ? {}
        : { lastEmbeddedChunkId: row.last_embedded_chunk_id };
    const fingerprint = {
        sourceContentHash: row.source_content_hash,
        parserVersion: row.parser_version,
        policyFingerprint: row.policy_fingerprint,
        chunkingStrategyVersion: row.chunking_strategy_version,
        embeddingIdentity: parseEmbeddingIdentity(row.embedding_identity_json),
    };
    const terminalDiagnostics = parseDiagnostics(row.terminal_diagnostics_json);
    return {
        capsuleId: row.capsule_id,
        documentId: row.document_id,
        jobId: row.job_id,
        strategy: row.strategy,
        phase: row.phase,
        pageCursor: row.page_cursor,
        sectionCursor: row.section_cursor,
        objectCursor: row.object_cursor,
        extractedTextBytes: row.extracted_text_bytes,
        chunkCursor: row.chunk_cursor,
        embeddedChunkCursor: row.embedded_chunk_cursor,
        // Spread at this position so the key order of the result stays unchanged.
        ...optionalLastEmbeddedChunkId,
        retryCount: row.retry_count,
        coverage: row.coverage,
        fingerprint,
        terminalDiagnostics,
        createdAt: row.created_at,
        updatedAt: row.updated_at,
    };
}
125
// Loads the checkpoint of one (capsule, document) pair. Both ids are stringified before being
// bound. Returns the mapped checkpoint record, or `undefined` when the statement's `get`
// yields `undefined`.
export function selectExtractionCheckpoint(db, capsuleId, documentId) {
    const statement = db.prepare(SELECT_ONE_SQL);
    const row = statement.get({ c: String(capsuleId), d: String(documentId) });
    if (row === undefined) {
        return undefined;
    }
    return rowToCheckpoint(row);
}
129
// Loads every checkpoint of a capsule, in the order the query returns the rows, and maps each
// row to a checkpoint record. The capsule id is stringified before being bound.
export function listExtractionCheckpoints(db, capsuleId) {
    const statement = db.prepare(SELECT_BY_CAPSULE_SQL);
    const checkpoints = [];
    for (const row of statement.all({ c: String(capsuleId) })) {
        checkpoints.push(rowToCheckpoint(row));
    }
    return checkpoints;
}
133
// Deletes the checkpoint row of one (capsule, document) pair. Both ids are stringified before
// being bound. Returns nothing.
export function deleteExtractionCheckpoint(db, capsuleId, documentId) {
    const statement = db.prepare(DELETE_ONE_SQL);
    statement.run({ c: String(capsuleId), d: String(documentId) });
}
@@ -0,0 +1,20 @@
1
+ import type { CheckpointFingerprint, CheckpointIncompatibilityReason, DocumentId, ExtractionCheckpointRecord, KnowledgeCapsuleId, LargeDocumentJobProgress } from "@oscharko-dev/keiko-contracts";
2
+ import type { KnowledgeStore } from "../store.js";
3
/**
 * Result of a resume decision, discriminated by `kind`:
 * - `"no-checkpoint"`: carries no checkpoint.
 * - `"resume"` and `"complete"`: carry the stored checkpoint.
 * - `"incompatible"`: carries the stored checkpoint plus the incompatibility reasons.
 */
export type CheckpointResumeDecision =
    | {
        readonly kind: "no-checkpoint";
    }
    | {
        readonly kind: "resume";
        readonly checkpoint: ExtractionCheckpointRecord;
    }
    | {
        readonly kind: "complete";
        readonly checkpoint: ExtractionCheckpointRecord;
    }
    | {
        readonly kind: "incompatible";
        readonly checkpoint: ExtractionCheckpointRecord;
        readonly reasons: readonly CheckpointIncompatibilityReason[];
    };
16
/** Decides how a document proceeds, given its stored checkpoint (if any) and the current fingerprint. */
export declare function resolveExtractionResume(
    store: KnowledgeStore,
    capsuleId: KnowledgeCapsuleId,
    documentId: DocumentId,
    currentFingerprint: CheckpointFingerprint
): CheckpointResumeDecision;
/** Reports whether a checkpoint counts as resumable. */
export declare function isResumableCheckpoint(
    checkpoint: ExtractionCheckpointRecord
): boolean;
/** Projects a checkpoint, together with a display name, onto a job-progress record. */
export declare function checkpointToProgress(
    checkpoint: ExtractionCheckpointRecord,
    safeDisplayName: string
): LargeDocumentJobProgress;
/** Lists the ids of the capsule's documents that hold a resumable checkpoint. */
export declare function listResumableDocuments(
    store: KnowledgeStore,
    capsuleId: KnowledgeCapsuleId
): readonly DocumentId[];
20
+ //# sourceMappingURL=checkpoint-resume.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"checkpoint-resume.d.ts","sourceRoot":"","sources":["../../src/indexing/checkpoint-resume.ts"],"names":[],"mappings":"AAOA,OAAO,KAAK,EACV,qBAAqB,EACrB,+BAA+B,EAC/B,UAAU,EACV,0BAA0B,EAC1B,kBAAkB,EAClB,wBAAwB,EACzB,MAAM,+BAA+B,CAAC;AAIvC,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAElD,MAAM,MAAM,wBAAwB,GAChC;IAAE,QAAQ,CAAC,IAAI,EAAE,eAAe,CAAA;CAAE,GAClC;IAAE,QAAQ,CAAC,IAAI,EAAE,QAAQ,CAAC;IAAC,QAAQ,CAAC,UAAU,EAAE,0BAA0B,CAAA;CAAE,GAC5E;IAAE,QAAQ,CAAC,IAAI,EAAE,UAAU,CAAC;IAAC,QAAQ,CAAC,UAAU,EAAE,0BAA0B,CAAA;CAAE,GAC9E;IACE,QAAQ,CAAC,IAAI,EAAE,cAAc,CAAC;IAC9B,QAAQ,CAAC,UAAU,EAAE,0BAA0B,CAAC;IAChD,QAAQ,CAAC,OAAO,EAAE,SAAS,+BAA+B,EAAE,CAAC;CAC9D,CAAC;AAEN,wBAAgB,uBAAuB,CACrC,KAAK,EAAE,cAAc,EACrB,SAAS,EAAE,kBAAkB,EAC7B,UAAU,EAAE,UAAU,EACtB,kBAAkB,EAAE,qBAAqB,GACxC,wBAAwB,CAa1B;AAID,wBAAgB,qBAAqB,CAAC,UAAU,EAAE,0BAA0B,GAAG,OAAO,CAGrF;AAED,wBAAgB,oBAAoB,CAClC,UAAU,EAAE,0BAA0B,EACtC,eAAe,EAAE,MAAM,GACtB,wBAAwB,CAc1B;AAGD,wBAAgB,sBAAsB,CACpC,KAAK,EAAE,cAAc,EACrB,SAAS,EAAE,kBAAkB,GAC5B,SAAS,UAAU,EAAE,CAIvB"}
@@ -0,0 +1,50 @@
1
+ // Resume decision for interrupted large-document jobs (Epic #1160, Issue #1286).
2
+ //
3
+ // Given a durable checkpoint and the current fingerprint (source hash, parser version, policy
4
+ // fingerprint, chunking strategy, embedding identity), decide whether a document can continue from
5
+ // where it stopped. A compatible non-terminal checkpoint resumes; an incompatible one restarts
6
+ // cleanly with a precise reason; a completed checkpoint is skipped.
7
+ import { checkpointCompatibility } from "@oscharko-dev/keiko-contracts";
8
+ import { listExtractionCheckpoints, selectExtractionCheckpoint } from "./checkpoint-persist.js";
9
// Decides what to do with a document, based on its stored checkpoint and the current fingerprint.
//
// Returns one of:
//   { kind: "no-checkpoint" }                     - no checkpoint is stored for the pair
//   { kind: "incompatible", checkpoint, reasons } - the stored fingerprint fails the
//                                                   compatibility check against the current one
//   { kind: "complete", checkpoint }              - compatible and phase is "complete"
//   { kind: "resume", checkpoint }                - compatible and any other phase
//
// Compatibility is evaluated before the phase, so an incompatible checkpoint is reported as
// "incompatible" even when its phase is "complete".
export function resolveExtractionResume(store, capsuleId, documentId, currentFingerprint) {
    const checkpoint = selectExtractionCheckpoint(store._internal.db, capsuleId, documentId);
    if (checkpoint === undefined) {
        return { kind: "no-checkpoint" };
    }
    const verdict = checkpointCompatibility(checkpoint.fingerprint, currentFingerprint);
    if (verdict.compatible) {
        const kind = checkpoint.phase === "complete" ? "complete" : "resume";
        return { kind, checkpoint };
    }
    return { kind: "incompatible", checkpoint, reasons: verdict.reasons };
}
23
// Whether a checkpoint should be offered as resumable for UI/BFF purposes: every phase other
// than "complete" and "failed" qualifies (i.e. the document stopped mid-flight or was cancelled).
export function isResumableCheckpoint(checkpoint) {
    const { phase } = checkpoint;
    return phase !== "complete" && phase !== "failed";
}
30
// Projects a checkpoint onto a job-progress record. Cursors are reported as counts:
// `pageCursor` -> `processedPages`, `chunkCursor` -> `chunkCount`,
// `embeddedChunkCursor` -> `embeddedChunkCount`. `safeDisplayName` is passed through untouched
// and `resumable` comes from `isResumableCheckpoint`.
export function checkpointToProgress(checkpoint, safeDisplayName) {
    const {
        documentId,
        strategy,
        phase,
        pageCursor,
        extractedTextBytes,
        chunkCursor,
        embeddedChunkCursor,
        retryCount,
        coverage,
    } = checkpoint;
    const resumable = isResumableCheckpoint(checkpoint);
    return {
        documentId,
        safeDisplayName,
        strategy,
        phase,
        processedPages: pageCursor,
        extractedTextBytes,
        chunkCount: chunkCursor,
        embeddedChunkCount: embeddedChunkCursor,
        retryCount,
        coverage,
        resumable,
    };
}
45
// Collects the document ids of all checkpoints in the capsule that `isResumableCheckpoint`
// accepts (mid-flight or cancelled), keeping the order in which the checkpoints are listed.
export function listResumableDocuments(store, capsuleId) {
    const documentIds = [];
    for (const checkpoint of listExtractionCheckpoints(store._internal.db, capsuleId)) {
        if (isResumableCheckpoint(checkpoint)) {
            documentIds.push(checkpoint.documentId);
        }
    }
    return documentIds;
}
@@ -0,0 +1,3 @@
1
+ import { type ChunkToEmbed, type EmbedBatchOptions, type EmbedBatchResult } from "./types.js";
2
/**
 * Processes a batch of chunks under the given options and resolves with the batch result.
 * NOTE(review): the implementation is not visible from this declaration; confirm details
 * against embedding-batcher.js.
 */
export declare function embedChunkBatch(
    chunks: readonly ChunkToEmbed[],
    options: EmbedBatchOptions
): Promise<EmbedBatchResult>;
3
+ //# sourceMappingURL=embedding-batcher.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"embedding-batcher.d.ts","sourceRoot":"","sources":["../../src/indexing/embedding-batcher.ts"],"names":[],"mappings":"AA8BA,OAAO,EAEL,KAAK,YAAY,EACjB,KAAK,iBAAiB,EACtB,KAAK,gBAAgB,EACtB,MAAM,YAAY,CAAC;AAsdpB,wBAAsB,eAAe,CACnC,MAAM,EAAE,SAAS,YAAY,EAAE,EAC/B,OAAO,EAAE,iBAAiB,GACzB,OAAO,CAAC,gBAAgB,CAAC,CA0B3B"}
@@ -0,0 +1,390 @@
1
+ // Embedding batcher (Epic #189, Issue #196). Given a batch of chunks already projected to
2
+ // their excerpt text, this module:
3
+ //
4
+ // 1. Issues N concurrent requests through the injected `OpenAIEmbeddingAdapter`, bounded
5
+ // by `EmbedBatchOptions.concurrency` (hard-capped to 4 by the orchestrator).
6
+ // 2. For EACH successful response, computes the adapter's reported identity and runs
7
+ // `assertCompatibleEmbeddingIdentity` against the capsule's pinned identity. The first
8
+ // structural mismatch aborts the batch with `INCOMPATIBLE_EMBEDDING_IDENTITY` and the
9
+ // orchestrator marks the job as failed — NO vectors from the batch are persisted.
10
+ // 3. Persists the surviving chunks' embeddings inside a single transaction so a partial
11
+ // batch failure cannot leave vectors and chunks out of sync.
12
+ //
13
+ // The identity check is the load-bearing invariant from #192. Removing it would let a
14
+ // capsule pinned to dim=1536 silently accept dim=768 rows — see test #5.
15
+ import { assertCompatibleEmbeddingIdentity } from "@oscharko-dev/keiko-model-gateway";
16
+ import { composeVectorRecord, insertVectorRow } from "./vector-persist.js";
17
+ import { IndexingError, } from "./types.js";
18
+ import { chunkDedupeKey } from "../chunking/chunker.js";
19
+ // ─── Concurrency primitive ───────────────────────────────────────────────────
20
+ // Hand-rolled bounded-concurrency runner. Avoids pulling in `p-limit` (the local-knowledge
21
+ // package's runtime-deps surface stays narrow per ADR-0019-3e). Order of `inputs` is
22
+ // preserved in `outputs` even though completion order may differ.
23
// Runs `work(input, index)` over `inputs` with at most `concurrency` calls in flight and
// resolves to the results in input order (completion order may differ).
//
// `concurrency` is floored and clamped to at least 1. A value that is not a number
// (undefined, NaN, a non-numeric string) also falls back to 1: `Math.max(1, NaN)` is NaN,
// which would otherwise start zero workers and silently resolve to an array of holes
// without ever calling `work`.
//
// If any `work` call rejects, the returned promise rejects with that error.
async function runBounded(inputs, concurrency, work) {
    const requested = Math.floor(concurrency);
    const limit = Number.isNaN(requested) ? 1 : Math.max(1, requested);
    const outputs = new Array(inputs.length);
    let nextIndex = 0;
    // Each worker claims the next unclaimed index synchronously (no await between the read
    // and the increment), so no index is processed twice.
    async function worker() {
        while (nextIndex < inputs.length) {
            const i = nextIndex;
            nextIndex += 1;
            outputs[i] = await work(inputs[i], i);
        }
    }
    const workerCount = Math.min(limit, inputs.length);
    const workers = Array.from({ length: workerCount }, () => worker());
    await Promise.all(workers);
    return outputs;
}
42
+ // ─── Adapter → outcome plumbing ──────────────────────────────────────────────
43
+ // `modelId` is required by the OpenAIEmbeddingRequest contract; the batcher fills it from
44
+ // the capsule's pinned identity so a single batch never queries multiple models. The
45
+ // adapter's `signal`, `apiKeyHeaderName`, and timeout defaults are honoured via the
46
+ // optional-spread pattern (the strict `exactOptionalPropertyTypes` mode forbids passing
47
+ // `undefined` for an optional property).
48
// Sends one chunk's text to the adapter as a single embedding request and returns whatever
// `adapter.request` resolves to. `modelId` always comes from the pinned identity.
// `apiKeyHeaderName` and `signal` are only attached when defined, so the request never
// carries an explicit `undefined` for an optional property.
async function embedSingleChunkWithModel(adapter, chunk, pinnedIdentity, signal) {
    const request = {
        endpoint: adapter.endpoint,
        apiKey: adapter.apiKey,
    };
    if (adapter.apiKeyHeaderName !== undefined) {
        request.apiKeyHeaderName = adapter.apiKeyHeaderName;
    }
    request.modelId = pinnedIdentity.modelId;
    request.input = chunk.text;
    if (signal !== undefined) {
        request.signal = signal;
    }
    return adapter.request(request);
}
60
+ // ─── Transient-failure retry ─────────────────────────────────────────────────
61
+ // Only network-flavoured failures are worth retrying. Auth (`wrong-header`),
62
+ // `unsupported-model`, and `invalid-response` are deterministic — retrying them burns
63
+ // the budget without any chance of recovery. `cancelled` is the caller's own abort.
64
// Failure kinds that are retried; every other kind is returned to the caller immediately.
const TRANSIENT_EMBED_KINDS = new Set(["rate-limited", "timeout", "transport"]);
// Retry defaults used when the caller supplies no override.
const DEFAULT_EMBED_MAX_RETRIES = 6;
const DEFAULT_EMBED_BASE_DELAY_MS = 500;
// Upper bound applied to every computed backoff delay.
const MAX_EMBED_BACKOFF_MS = 30_000;
// True only for a failed outcome whose `kind` is one of the transient kinds above.
function isTransientOutcome(outcome) {
    if (outcome.ok) {
        return false;
    }
    return TRANSIENT_EMBED_KINDS.has(outcome.kind);
}
71
// Exponential backoff: `base` for attempt 1, doubling on each later attempt, capped at
// MAX_EMBED_BACKOFF_MS.
function backoffMs(attempt, base) {
    const uncapped = base * Math.pow(2, attempt - 1);
    return Math.min(uncapped, MAX_EMBED_BACKOFF_MS);
}
74
+ // Cancellable default sleep. Rejects on abort so the retry loop can bail out of its backoff
75
+ // the moment the caller cancels rather than waiting out the full delay.
76
// Resolves after `ms` milliseconds. If `signal` is already aborted the promise rejects
// immediately; if it aborts while waiting, the timer is cleared and the promise rejects.
// Both rejections use a DOMException named "AbortError". The abort listener is removed
// when the timer fires, so a completed sleep leaves nothing attached to the signal.
function defaultSleep(ms, signal) {
    const abortError = () => new DOMException("aborted", "AbortError");
    if (signal?.aborted === true) {
        return Promise.reject(abortError());
    }
    return new Promise((resolve, reject) => {
        let timer;
        const onAbort = () => {
            clearTimeout(timer);
            reject(abortError());
        };
        timer = setTimeout(() => {
            signal?.removeEventListener("abort", onAbort);
            resolve();
        }, ms);
        signal?.addEventListener("abort", onAbort, { once: true });
    });
}
92
// Fills in retry settings: any field that is null or undefined on `retry` (or `retry`
// itself being absent) falls back to the module default.
function resolveRetry(retry) {
    const maxRetries = retry?.maxRetries ?? DEFAULT_EMBED_MAX_RETRIES;
    const baseDelayMs = retry?.baseDelayMs ?? DEFAULT_EMBED_BASE_DELAY_MS;
    const sleep = retry?.sleep ?? defaultSleep;
    return { maxRetries, baseDelayMs, sleep };
}
99
// Embeds one chunk, retrying while the outcome is transient. At most `maxRetries` extra
// requests are made after the first one. Retrying stops early, returning the latest
// outcome, when the outcome is not transient, when `options.signal` is already aborted,
// or when the backoff sleep rejects (e.g. it was aborted mid-wait).
async function embedChunkWithRetry(options, chunk) {
    const retry = resolveRetry(options.retry);
    const requestOnce = () => embedSingleChunkWithModel(options.adapter, chunk, options.pinnedIdentity, options.signal);
    let outcome = await requestOnce();
    let attempt = 0;
    while (attempt < retry.maxRetries &&
        isTransientOutcome(outcome) &&
        options.signal?.aborted !== true) {
        attempt += 1;
        try {
            await retry.sleep(backoffMs(attempt, retry.baseDelayMs), options.signal);
        }
        catch {
            // Aborted mid-backoff: hand back the last outcome; the caller's abort gate
            // converts this to CANCELLED.
            break;
        }
        outcome = await requestOnce();
    }
    return outcome;
}
116
+ // ─── Array-batch embedding (#189 GRD-004) ────────────────────────────────────
117
+ // When the adapter exposes `requestBatch`, embed many unique chunks per HTTP round-trip.
118
+ // Items per request are bounded so a single response stays well under the gateway's 10 MB
119
+ // JSON cap (96 × 3072 float32 ≈ 4.4 MB) and inside provider per-request token limits.
120
// Per-request limits for the array-batch path: maximum number of inputs, and maximum
// combined character count of those inputs.
const BATCH_ITEM_CAP = 96;
const BATCH_CHAR_CAP = 120_000;
// Splits `requests` into consecutive batches, preserving order. A batch is closed before
// adding a request when it already holds BATCH_ITEM_CAP items or when adding the request's
// text would push the batch past BATCH_CHAR_CAP characters. A batch is never closed while
// empty, so a single text longer than the character cap still gets a batch of its own.
function groupIntoBatches(requests) {
    const batches = [];
    let open = [];
    let openChars = 0;
    const closeOpenBatch = () => {
        if (open.length === 0) {
            return;
        }
        batches.push(open);
        open = [];
        openChars = 0;
    };
    for (const request of requests) {
        const size = request.representative.text.length;
        const wouldOverflow = open.length >= BATCH_ITEM_CAP || openChars + size > BATCH_CHAR_CAP;
        if (wouldOverflow) {
            closeOpenBatch();
        }
        open.push(request);
        openChars += size;
    }
    closeOpenBatch();
    return batches;
}
141
// Batch counterpart of the scalar transient check: true only for a failed batch outcome
// whose `kind` is in TRANSIENT_EMBED_KINDS.
function isTransientBatchOutcome(outcome) {
    if (outcome.ok) {
        return false;
    }
    return TRANSIENT_EMBED_KINDS.has(outcome.kind);
}
144
// Builds the EMBEDDING_ADAPTER_FAILED error record for an adapter failure kind.
function errorFromKind(kind) {
    const message = `embedding adapter returned ${kind}`;
    return { code: "EMBEDDING_ADAPTER_FAILED", message };
}
147
// Embeds `inputs` with a single array request via `adapter.requestBatch`, retrying
// transient failures with the same backoff policy as the scalar path.
//
// Returns the adapter's batch outcome. If the adapter has no `requestBatch`, a
// `{ ok: false, kind: "transport" }` outcome is returned without any request being made.
//
// `requestBatch` is invoked as a method on the adapter (not through a detached reference)
// so adapters whose `requestBatch` relies on `this` — e.g. class instances — keep working,
// matching how the scalar path calls `adapter.request(...)`.
async function embedArrayBatchWithRetry(options, inputs) {
    const adapter = options.adapter;
    if (adapter.requestBatch === undefined) {
        return { ok: false, kind: "transport" };
    }
    // Optional properties are only attached when defined, so the request never carries an
    // explicit `undefined`.
    const base = {
        endpoint: adapter.endpoint,
        apiKey: adapter.apiKey,
        ...(adapter.apiKeyHeaderName !== undefined
            ? { apiKeyHeaderName: adapter.apiKeyHeaderName }
            : {}),
        modelId: options.pinnedIdentity.modelId,
        inputs,
        ...(options.signal !== undefined ? { signal: options.signal } : {}),
    };
    const send = () => adapter.requestBatch(base);
    const retry = resolveRetry(options.retry);
    let outcome = await send();
    for (let attempt = 1; attempt <= retry.maxRetries; attempt += 1) {
        if (!isTransientBatchOutcome(outcome) || options.signal?.aborted === true) {
            return outcome;
        }
        try {
            await retry.sleep(backoffMs(attempt, retry.baseDelayMs), options.signal);
        }
        catch {
            return outcome; // aborted mid-backoff; the abort gate converts this to CANCELLED
        }
        outcome = await send();
    }
    return outcome;
}
179
+ // Apply the per-vector identity gate exactly as the scalar path does. Order-independent:
180
+ // once `state.identityFailure` is set, embedChunkBatch persists nothing, so which concurrent
181
+ // batch first observes the drift is irrelevant to the outcome.
182
// Identity gate for one embedded vector. If an identity failure is already recorded on
// `state`, that failure is returned for this chunk without re-checking. Otherwise the
// identity observed in `success` is compared with the pinned one; an incompatible result
// is recorded on `state.identityFailure` (code INCOMPATIBLE_EMBEDDING_IDENTITY) and
// returned as this chunk's error. A compatible result yields a success outcome.
function gateVectorOutcome(representative, success, options, state) {
    const rejectWith = (error) => ({ ok: false, chunk: representative, error });
    if (state.identityFailure !== undefined) {
        return rejectWith(state.identityFailure);
    }
    const { pinnedIdentity } = options;
    const observed = identityFromAdapter(pinnedIdentity, success);
    const compat = assertCompatibleEmbeddingIdentity(pinnedIdentity, observed);
    if (compat.ok) {
        return { ok: true, chunk: representative, success };
    }
    state.identityFailure = {
        code: "INCOMPATIBLE_EMBEDDING_IDENTITY",
        message: compat.safeMessage,
    };
    return rejectWith(state.identityFailure);
}
197
// Embeds one batch of unique requests with a single array call and returns one outcome per
// request, in batch order. The whole batch fails with a shared error when an identity
// failure is already recorded, when the signal is aborted, or when the adapter call fails.
// On success each returned vector passes through the identity gate; a missing vector at a
// request's position becomes an "invalid-response" failure for that request only.
async function embedUniqueBatch(batch, options, state) {
    const failAll = (error) => batch.map((r) => ({ ok: false, chunk: r.representative, error }));
    if (state.identityFailure !== undefined) {
        return failAll(state.identityFailure);
    }
    const abortError = checkAbort(options.signal);
    if (abortError !== undefined) {
        return failAll(abortError);
    }
    const texts = batch.map((r) => r.representative.text);
    const outcome = await embedArrayBatchWithRetry(options, texts);
    if (!outcome.ok) {
        return failAll(errorFromKind(outcome.kind));
    }
    return batch.map((request, position) => {
        const success = outcome.value[position];
        if (success !== undefined) {
            return gateVectorOutcome(request.representative, success, options, state);
        }
        return {
            ok: false,
            chunk: request.representative,
            error: errorFromKind("invalid-response"),
        };
    });
}
223
+ // ─── Identity verification ───────────────────────────────────────────────────
224
// Builds the identity used for compatibility checks from one adapter success.
// `provider` and `vectorMetric` are copied from the pinned identity (per the original
// author's note, the OpenAI API response does not echo them). `modelId` and
// `modelRevision` come from the adapter's outcome and `vectorDimensions` is the length of
// the returned vector. The original note also states that the compatibility check compares
// provider + modelId + dimensions + metric and treats `modelRevision` as a warning only.
// `modelRevision` is included only when the adapter reported one.
function identityFromAdapter(pinned, success) {
    const observed = {
        provider: pinned.provider,
        modelId: success.modelId,
        vectorDimensions: success.vector.length,
        vectorMetric: pinned.vectorMetric,
    };
    if (success.modelRevision !== undefined) {
        observed.modelRevision = success.modelRevision;
    }
    return observed;
}
238
+ // ─── Float32 → byte serialisation ────────────────────────────────────────────
239
+ // The schema column is BLOB; SQLite expects a Uint8Array. We copy the underlying
240
+ // ArrayBuffer rather than aliasing it (Float32Array views can share buffers) so the
241
+ // persisted row is a stable copy not affected by any later vector reuse.
242
// Returns a Uint8Array holding a copy of exactly the bytes this view covers. The view's
// byteOffset and byteLength are honoured, so a vector that is a window onto a larger
// buffer is serialised correctly, and later writes to the source do not affect the result.
function floatToBytes(vector) {
    const { buffer, byteOffset, byteLength } = vector;
    const copied = buffer.slice(byteOffset, byteOffset + byteLength);
    return new Uint8Array(copied);
}
245
// Groups chunks that share a dedupe key so each distinct text is embedded once. The key is
// `chunkDedupeKey(chunk.text)`; when that yields null/undefined the chunk gets a key
// derived from its own id. The first chunk seen for a key becomes the group's
// `representative`; groups are returned in first-seen order as
// `{ key, representative, chunks }`.
function dedupeEmbeddingRequests(chunks) {
    const groups = new Map();
    for (const chunk of chunks) {
        const key = chunkDedupeKey(chunk.text) ?? `chunk:${String(chunk.id)}`;
        const group = groups.get(key);
        if (group !== undefined) {
            group.chunks.push(chunk);
            continue;
        }
        groups.set(key, { key, representative: chunk, chunks: [chunk] });
    }
    return Array.from(groups.values());
}
259
// Re-targets a representative's outcome at another chunk from the same dedupe group: the
// success payload or the error is carried over and only `chunk` is swapped.
function outcomeForChunk(outcome, chunk) {
    return outcome.ok
        ? { ok: true, chunk, success: outcome.success }
        : { ok: false, chunk, error: outcome.error };
}
265
// Builds the EMBEDDING_ADAPTER_FAILED error record from a failed adapter outcome, naming
// the outcome's `kind` in the message.
function errorFromOutcome(outcome) {
    const { kind } = outcome;
    const message = `embedding adapter returned ${kind}`;
    return { code: "EMBEDDING_ADAPTER_FAILED", message };
}
271
// Returns a CANCELLED error record when `signal` is aborted, otherwise undefined. A
// missing signal counts as not aborted.
function checkAbort(signal) {
    const isAborted = signal?.aborted === true;
    return isAborted
        ? { code: "CANCELLED", message: "indexing aborted via AbortSignal" }
        : undefined;
}
277
// Scalar path for one unique request: embeds the representative chunk (with retry) and
// applies the identity check to the result. Fails without calling the adapter when an
// identity failure is already recorded on `state` or the signal is aborted. An adapter
// failure becomes EMBEDDING_ADAPTER_FAILED; an incompatible identity is recorded on
// `state.identityFailure` and returned as this chunk's error.
async function buildUniqueChunkOutcome(request, options, state) {
    const chunk = request.representative;
    const failWith = (error) => ({ ok: false, chunk, error });
    if (state.identityFailure !== undefined) {
        return failWith(state.identityFailure);
    }
    const abortError = checkAbort(options.signal);
    if (abortError !== undefined) {
        return failWith(abortError);
    }
    const outcome = await embedChunkWithRetry(options, chunk);
    if (!outcome.ok) {
        return failWith(errorFromOutcome(outcome));
    }
    const { pinnedIdentity } = options;
    const observed = identityFromAdapter(pinnedIdentity, outcome.value);
    const compat = assertCompatibleEmbeddingIdentity(pinnedIdentity, observed);
    if (compat.ok) {
        return { ok: true, chunk, success: outcome.value };
    }
    state.identityFailure = {
        code: "INCOMPATIBLE_EMBEDDING_IDENTITY",
        message: compat.safeMessage,
    };
    return failWith(state.identityFailure);
}
300
// Fans each unique request's outcome back out to every chunk in its dedupe group, keeping
// request order and, within a request, chunk order. Positions where either the request or
// its outcome is undefined contribute nothing.
function expandUniqueOutcomes(uniqueRequests, uniqueOutcomes) {
    return uniqueRequests.flatMap((request, position) => {
        const outcome = uniqueOutcomes[position];
        if (request === undefined || outcome === undefined) {
            return [];
        }
        return request.chunks.map((chunk) => outcomeForChunk(outcome, chunk));
    });
}
313
+ // Build all per-chunk outcomes BEFORE we open a write transaction. The identity gate runs
314
+ // after every successful response so we fail fast on dimension mismatch.
315
// Produces one outcome per input chunk. Chunks are first collapsed to unique requests,
// which are embedded with bounded concurrency: through array batches when the adapter
// exposes a `requestBatch` function, otherwise one request per unique chunk. The unique
// outcomes are then expanded back to every chunk in each group. `identityFailure` is
// present on the result only when an identity mismatch was recorded during the run.
async function buildChunkOutcomes(chunks, options) {
    const state = { identityFailure: undefined };
    const uniqueRequests = dedupeEmbeddingRequests(chunks);
    const supportsArrayBatch = typeof options.adapter.requestBatch === "function";
    let uniqueOutcomes;
    if (!supportsArrayBatch) {
        // Scalar fallback (adapters/stubs without `requestBatch`).
        uniqueOutcomes = await runBounded(uniqueRequests, options.concurrency, async (request) => buildUniqueChunkOutcome(request, options, state));
    }
    else {
        // Array-batch path: each batch returns its outcomes in batch order, so flattening
        // the per-batch results restores unique-request order.
        const batches = groupIntoBatches(uniqueRequests);
        const perBatch = await runBounded(batches, options.concurrency, async (batch) => embedUniqueBatch(batch, options, state));
        uniqueOutcomes = perBatch.flat();
    }
    const result = { outcomes: expandUniqueOutcomes(uniqueRequests, uniqueOutcomes) };
    if (state.identityFailure !== undefined) {
        result.identityFailure = state.identityFailure;
    }
    return result;
}
337
+ // ─── Persistence boundary ─────────────────────────────────────────────────────
338
+ // Wraps the row inserts in a single transaction so a partial INSERT failure rolls back the
339
+ // whole batch. The orchestrator marks the document as failed; subsequent runs can retry.
340
// Persists the vector row for every successful outcome inside one BEGIN/COMMIT
// transaction and returns the composed vector records in outcome order. Failed outcomes
// are skipped.
//
// Throws IndexingError("PERSISTENCE_FAILED") when any insert or the COMMIT throws; the
// transaction is rolled back first and the triggering error is attached as `cause`.
//
// A failure of the ROLLBACK statement itself is deliberately ignored: it must not replace
// the original error, which is the one the caller needs in order to diagnose the batch.
// (ROLLBACK can fail when the engine has already ended the transaction on its own.)
function persistOutcomes(store, outcomes, pinnedIdentity, idSource, now) {
    const db = store._internal.db;
    const persisted = [];
    // BEGIN stays outside the try block: if it throws, no transaction of ours is open and
    // issuing ROLLBACK could undo a transaction this function does not own.
    db.exec("BEGIN");
    try {
        for (const out of outcomes) {
            if (!out.ok)
                continue;
            const observed = identityFromAdapter(pinnedIdentity, out.success);
            const row = {
                id: `vec:${String(out.chunk.id)}`,
                capsuleId: out.chunk.capsuleId,
                sourceId: out.chunk.sourceId,
                documentId: out.chunk.documentId,
                chunkId: out.chunk.id,
                embedding: floatToBytes(out.success.vector),
                identity: observed,
                storageReference: idSource(),
                createdAt: now(),
            };
            insertVectorRow(db, store._internal.contentCipher, row);
            persisted.push(composeVectorRecord(row));
        }
        db.exec("COMMIT");
    }
    catch (cause) {
        try {
            db.exec("ROLLBACK");
        }
        catch {
            // Intentionally ignored — see the function comment; `cause` below is reported.
        }
        throw new IndexingError("PERSISTENCE_FAILED", "vector persistence failed mid-batch", cause === undefined ? undefined : { cause });
    }
    return persisted;
}
371
+ // ─── Public entrypoint ───────────────────────────────────────────────────────
372
// Public entrypoint: embeds `chunks` and persists the resulting vectors. Resolves to
// `{ vectors, errors }`, where `errors` collects the error of every failed outcome.
// Nothing is persisted (vectors is empty) when the batch recorded an identity failure or
// when the signal is aborted by the time embedding finishes; in the aborted case a
// CANCELLED error is appended to `errors`. An empty input resolves to empty lists without
// touching the adapter or the store.
export async function embedChunkBatch(chunks, options) {
    if (chunks.length === 0) {
        return { vectors: [], errors: [] };
    }
    const built = await buildChunkOutcomes(chunks, options);
    const errors = [];
    for (const outcome of built.outcomes) {
        if (!outcome.ok) {
            errors.push(outcome.error);
        }
    }
    // Identity drift: refuse to persist ANY row from this batch.
    if (built.identityFailure !== undefined) {
        return { vectors: [], errors };
    }
    // Cancellation: likewise persist nothing, and report the abort.
    const abortError = checkAbort(options.signal);
    if (abortError !== undefined) {
        errors.push(abortError);
        return { vectors: [], errors };
    }
    const vectors = persistOutcomes(options.store, built.outcomes, options.pinnedIdentity, options.idSource, options.now);
    return { vectors, errors };
}
@@ -0,0 +1,7 @@
1
+ export { runIndexingJob } from "./orchestrator.js";
2
+ export { embedChunkBatch } from "./embedding-batcher.js";
3
+ export { findResumableJob } from "./job-resume.js";
4
+ export { upsertExtractionCheckpoint, selectExtractionCheckpoint, listExtractionCheckpoints, deleteExtractionCheckpoint, } from "./checkpoint-persist.js";
5
+ export { resolveExtractionResume, isResumableCheckpoint, checkpointToProgress, listResumableDocuments, type CheckpointResumeDecision, } from "./checkpoint-resume.js";
6
+ export { DEFAULT_INDEXING_BATCH_SIZE, DEFAULT_INDEXING_CONCURRENCY, IndexingError, type ChunkToEmbed, type EmbedBatchOptions, type EmbedBatchResult, type EmbedRetryOptions, type IndexingDocumentChunkedEvent, type IndexingDocumentDiscoveredEvent, type IndexingDocumentEmbeddedEvent, type IndexingDocumentExtractedEvent, type IndexingDocumentFailedEvent, type IndexingDocumentSkippedEvent, type IndexingErrorCode, type IndexingEvent, type IndexingJobCancelledEvent, type IndexingJobCompletedEvent, type IndexingJobFailedEvent, type IndexingJobStartedEvent, type IndexingOptions, type IndexingResult, } from "./types.js";
7
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/indexing/index.ts"],"names":[],"mappings":"AAKA,OAAO,EAAE,cAAc,EAAE,MAAM,mBAAmB,CAAC;AAEnD,OAAO,EAAE,eAAe,EAAE,MAAM,wBAAwB,CAAC;AAEzD,OAAO,EAAE,gBAAgB,EAAE,MAAM,iBAAiB,CAAC;AAGnD,OAAO,EACL,0BAA0B,EAC1B,0BAA0B,EAC1B,yBAAyB,EACzB,0BAA0B,GAC3B,MAAM,yBAAyB,CAAC;AACjC,OAAO,EACL,uBAAuB,EACvB,qBAAqB,EACrB,oBAAoB,EACpB,sBAAsB,EACtB,KAAK,wBAAwB,GAC9B,MAAM,wBAAwB,CAAC;AAEhC,OAAO,EACL,2BAA2B,EAC3B,4BAA4B,EAC5B,aAAa,EACb,KAAK,YAAY,EACjB,KAAK,iBAAiB,EACtB,KAAK,gBAAgB,EACrB,KAAK,iBAAiB,EACtB,KAAK,4BAA4B,EACjC,KAAK,+BAA+B,EACpC,KAAK,6BAA6B,EAClC,KAAK,8BAA8B,EACnC,KAAK,2BAA2B,EAChC,KAAK,4BAA4B,EACjC,KAAK,iBAAiB,EACtB,KAAK,aAAa,EAClB,KAAK,yBAAyB,EAC9B,KAAK,yBAAyB,EAC9B,KAAK,sBAAsB,EAC3B,KAAK,uBAAuB,EAC5B,KAAK,eAAe,EACpB,KAAK,cAAc,GACpB,MAAM,YAAY,CAAC"}
@@ -0,0 +1,11 @@
1
+ // Public surface of the indexing layer (Epic #189, Issue #196). Composed by the package
2
+ // barrel in ../index.ts; consumers outside this package never import from this
3
+ // subdirectory directly (ADR-0019 direction rule 3e + the trust-8 test-support naming
4
+ // convention).
5
+ export { runIndexingJob } from "./orchestrator.js";
6
+ export { embedChunkBatch } from "./embedding-batcher.js";
7
+ export { findResumableJob } from "./job-resume.js";
8
+ // Bounded large-document ingestion checkpoints + resume (Epic #1160, Issue #1286).
9
+ export { upsertExtractionCheckpoint, selectExtractionCheckpoint, listExtractionCheckpoints, deleteExtractionCheckpoint, } from "./checkpoint-persist.js";
10
+ export { resolveExtractionResume, isResumableCheckpoint, checkpointToProgress, listResumableDocuments, } from "./checkpoint-resume.js";
11
+ export { DEFAULT_INDEXING_BATCH_SIZE, DEFAULT_INDEXING_CONCURRENCY, IndexingError, } from "./types.js";