@oscharko-dev/keiko-local-knowledge 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (290) hide show
  1. package/dist/.tsbuildinfo +1 -0
  2. package/dist/bounded-document-extraction.d.ts +27 -0
  3. package/dist/bounded-document-extraction.d.ts.map +1 -0
  4. package/dist/bounded-document-extraction.js +214 -0
  5. package/dist/capsule-lifecycle.d.ts +33 -0
  6. package/dist/capsule-lifecycle.d.ts.map +1 -0
  7. package/dist/capsule-lifecycle.js +292 -0
  8. package/dist/capsule-set-lifecycle.d.ts +15 -0
  9. package/dist/capsule-set-lifecycle.d.ts.map +1 -0
  10. package/dist/capsule-set-lifecycle.js +158 -0
  11. package/dist/chunking/chunker-persist.d.ts +36 -0
  12. package/dist/chunking/chunker-persist.d.ts.map +1 -0
  13. package/dist/chunking/chunker-persist.js +74 -0
  14. package/dist/chunking/chunker-runner.d.ts +9 -0
  15. package/dist/chunking/chunker-runner.d.ts.map +1 -0
  16. package/dist/chunking/chunker-runner.js +218 -0
  17. package/dist/chunking/chunker.d.ts +7 -0
  18. package/dist/chunking/chunker.d.ts.map +1 -0
  19. package/dist/chunking/chunker.js +139 -0
  20. package/dist/chunking/citation-mapper.d.ts +4 -0
  21. package/dist/chunking/citation-mapper.d.ts.map +1 -0
  22. package/dist/chunking/citation-mapper.js +180 -0
  23. package/dist/chunking/index.d.ts +6 -0
  24. package/dist/chunking/index.d.ts.map +1 -0
  25. package/dist/chunking/index.js +8 -0
  26. package/dist/chunking/token-estimator.d.ts +3 -0
  27. package/dist/chunking/token-estimator.d.ts.map +1 -0
  28. package/dist/chunking/token-estimator.js +26 -0
  29. package/dist/chunking/types.d.ts +49 -0
  30. package/dist/chunking/types.d.ts.map +1 -0
  31. package/dist/chunking/types.js +26 -0
  32. package/dist/composition.d.ts +57 -0
  33. package/dist/composition.d.ts.map +1 -0
  34. package/dist/composition.js +310 -0
  35. package/dist/conversation/citation-attacher.d.ts +8 -0
  36. package/dist/conversation/citation-attacher.d.ts.map +1 -0
  37. package/dist/conversation/citation-attacher.js +55 -0
  38. package/dist/conversation/citation-excerpts.d.ts +4 -0
  39. package/dist/conversation/citation-excerpts.d.ts.map +1 -0
  40. package/dist/conversation/citation-excerpts.js +41 -0
  41. package/dist/conversation/grounded-answer-runner.d.ts +9 -0
  42. package/dist/conversation/grounded-answer-runner.d.ts.map +1 -0
  43. package/dist/conversation/grounded-answer-runner.js +61 -0
  44. package/dist/conversation/index.d.ts +5 -0
  45. package/dist/conversation/index.d.ts.map +1 -0
  46. package/dist/conversation/index.js +7 -0
  47. package/dist/conversation/model-gateway-answer-generator.d.ts +28 -0
  48. package/dist/conversation/model-gateway-answer-generator.d.ts.map +1 -0
  49. package/dist/conversation/model-gateway-answer-generator.js +105 -0
  50. package/dist/conversation/types.d.ts +35 -0
  51. package/dist/conversation/types.d.ts.map +1 -0
  52. package/dist/conversation/types.js +24 -0
  53. package/dist/discovery/discovery-runner.d.ts +23 -0
  54. package/dist/discovery/discovery-runner.d.ts.map +1 -0
  55. package/dist/discovery/discovery-runner.js +109 -0
  56. package/dist/discovery/extract-progressive.d.ts +17 -0
  57. package/dist/discovery/extract-progressive.d.ts.map +1 -0
  58. package/dist/discovery/extract-progressive.js +522 -0
  59. package/dist/discovery/extract.d.ts +26 -0
  60. package/dist/discovery/extract.d.ts.map +1 -0
  61. package/dist/discovery/extract.js +906 -0
  62. package/dist/discovery/glob.d.ts +10 -0
  63. package/dist/discovery/glob.d.ts.map +1 -0
  64. package/dist/discovery/glob.js +72 -0
  65. package/dist/discovery/index.d.ts +6 -0
  66. package/dist/discovery/index.d.ts.map +1 -0
  67. package/dist/discovery/index.js +8 -0
  68. package/dist/discovery/media-type.d.ts +4 -0
  69. package/dist/discovery/media-type.d.ts.map +1 -0
  70. package/dist/discovery/media-type.js +62 -0
  71. package/dist/discovery/persist.d.ts +63 -0
  72. package/dist/discovery/persist.d.ts.map +1 -0
  73. package/dist/discovery/persist.js +345 -0
  74. package/dist/discovery/test-support.d.ts +16 -0
  75. package/dist/discovery/test-support.d.ts.map +1 -0
  76. package/dist/discovery/test-support.js +127 -0
  77. package/dist/discovery/types.d.ts +63 -0
  78. package/dist/discovery/types.d.ts.map +1 -0
  79. package/dist/discovery/types.js +28 -0
  80. package/dist/discovery/walk.d.ts +12 -0
  81. package/dist/discovery/walk.d.ts.map +1 -0
  82. package/dist/discovery/walk.js +302 -0
  83. package/dist/errors.d.ts +13 -0
  84. package/dist/errors.d.ts.map +1 -0
  85. package/dist/errors.js +22 -0
  86. package/dist/evaluations/dimensions.d.ts +14 -0
  87. package/dist/evaluations/dimensions.d.ts.map +1 -0
  88. package/dist/evaluations/dimensions.js +191 -0
  89. package/dist/evaluations/fixtures.d.ts +18 -0
  90. package/dist/evaluations/fixtures.d.ts.map +1 -0
  91. package/dist/evaluations/fixtures.js +858 -0
  92. package/dist/evaluations/index.d.ts +7 -0
  93. package/dist/evaluations/index.d.ts.map +1 -0
  94. package/dist/evaluations/index.js +10 -0
  95. package/dist/evaluations/report.d.ts +3 -0
  96. package/dist/evaluations/report.d.ts.map +1 -0
  97. package/dist/evaluations/report.js +31 -0
  98. package/dist/evaluations/runner-seed.d.ts +12 -0
  99. package/dist/evaluations/runner-seed.d.ts.map +1 -0
  100. package/dist/evaluations/runner-seed.js +175 -0
  101. package/dist/evaluations/runner.d.ts +8 -0
  102. package/dist/evaluations/runner.d.ts.map +1 -0
  103. package/dist/evaluations/runner.js +205 -0
  104. package/dist/evaluations/scripted-embedding-adapter.d.ts +13 -0
  105. package/dist/evaluations/scripted-embedding-adapter.d.ts.map +1 -0
  106. package/dist/evaluations/scripted-embedding-adapter.js +163 -0
  107. package/dist/evaluations/types.d.ts +116 -0
  108. package/dist/evaluations/types.d.ts.map +1 -0
  109. package/dist/evaluations/types.js +27 -0
  110. package/dist/index.d.ts +23 -0
  111. package/dist/index.d.ts.map +1 -0
  112. package/dist/index.js +41 -0
  113. package/dist/indexing/bounded-indexing.d.ts +41 -0
  114. package/dist/indexing/bounded-indexing.d.ts.map +1 -0
  115. package/dist/indexing/bounded-indexing.js +240 -0
  116. package/dist/indexing/checkpoint-persist.d.ts +8 -0
  117. package/dist/indexing/checkpoint-persist.d.ts.map +1 -0
  118. package/dist/indexing/checkpoint-persist.js +135 -0
  119. package/dist/indexing/checkpoint-resume.d.ts +20 -0
  120. package/dist/indexing/checkpoint-resume.d.ts.map +1 -0
  121. package/dist/indexing/checkpoint-resume.js +50 -0
  122. package/dist/indexing/embedding-batcher.d.ts +3 -0
  123. package/dist/indexing/embedding-batcher.d.ts.map +1 -0
  124. package/dist/indexing/embedding-batcher.js +390 -0
  125. package/dist/indexing/index.d.ts +7 -0
  126. package/dist/indexing/index.d.ts.map +1 -0
  127. package/dist/indexing/index.js +11 -0
  128. package/dist/indexing/job-persist.d.ts +46 -0
  129. package/dist/indexing/job-persist.d.ts.map +1 -0
  130. package/dist/indexing/job-persist.js +157 -0
  131. package/dist/indexing/job-resume.d.ts +4 -0
  132. package/dist/indexing/job-resume.d.ts.map +1 -0
  133. package/dist/indexing/job-resume.js +14 -0
  134. package/dist/indexing/orchestrator.d.ts +3 -0
  135. package/dist/indexing/orchestrator.d.ts.map +1 -0
  136. package/dist/indexing/orchestrator.js +1151 -0
  137. package/dist/indexing/types.d.ts +156 -0
  138. package/dist/indexing/types.d.ts.map +1 -0
  139. package/dist/indexing/types.js +30 -0
  140. package/dist/indexing/vector-persist.d.ts +32 -0
  141. package/dist/indexing/vector-persist.d.ts.map +1 -0
  142. package/dist/indexing/vector-persist.js +105 -0
  143. package/dist/parsers/_internal.d.ts +20 -0
  144. package/dist/parsers/_internal.d.ts.map +1 -0
  145. package/dist/parsers/_internal.js +122 -0
  146. package/dist/parsers/csv-parser.d.ts +3 -0
  147. package/dist/parsers/csv-parser.d.ts.map +1 -0
  148. package/dist/parsers/csv-parser.js +202 -0
  149. package/dist/parsers/docx-parser.d.ts +3 -0
  150. package/dist/parsers/docx-parser.d.ts.map +1 -0
  151. package/dist/parsers/docx-parser.js +390 -0
  152. package/dist/parsers/html-parser.d.ts +3 -0
  153. package/dist/parsers/html-parser.d.ts.map +1 -0
  154. package/dist/parsers/html-parser.js +310 -0
  155. package/dist/parsers/index.d.ts +15 -0
  156. package/dist/parsers/index.d.ts.map +1 -0
  157. package/dist/parsers/index.js +41 -0
  158. package/dist/parsers/json-parser.d.ts +3 -0
  159. package/dist/parsers/json-parser.d.ts.map +1 -0
  160. package/dist/parsers/json-parser.js +192 -0
  161. package/dist/parsers/large-document/capability-discovery.d.ts +27 -0
  162. package/dist/parsers/large-document/capability-discovery.d.ts.map +1 -0
  163. package/dist/parsers/large-document/capability-discovery.js +76 -0
  164. package/dist/parsers/large-document/diagnostics.d.ts +3 -0
  165. package/dist/parsers/large-document/diagnostics.d.ts.map +1 -0
  166. package/dist/parsers/large-document/diagnostics.js +11 -0
  167. package/dist/parsers/large-document/index.d.ts +15 -0
  168. package/dist/parsers/large-document/index.d.ts.map +1 -0
  169. package/dist/parsers/large-document/index.js +10 -0
  170. package/dist/parsers/large-document/legacy-format.d.ts +5 -0
  171. package/dist/parsers/large-document/legacy-format.d.ts.map +1 -0
  172. package/dist/parsers/large-document/legacy-format.js +25 -0
  173. package/dist/parsers/large-document/preflight.d.ts +9 -0
  174. package/dist/parsers/large-document/preflight.d.ts.map +1 -0
  175. package/dist/parsers/large-document/preflight.js +43 -0
  176. package/dist/parsers/large-document/progressive-extraction.d.ts +55 -0
  177. package/dist/parsers/large-document/progressive-extraction.d.ts.map +1 -0
  178. package/dist/parsers/large-document/progressive-extraction.js +123 -0
  179. package/dist/parsers/large-document/progressive-pdf.d.ts +20 -0
  180. package/dist/parsers/large-document/progressive-pdf.d.ts.map +1 -0
  181. package/dist/parsers/large-document/progressive-pdf.js +145 -0
  182. package/dist/parsers/large-document/synthetic-source.d.ts +9 -0
  183. package/dist/parsers/large-document/synthetic-source.d.ts.map +1 -0
  184. package/dist/parsers/large-document/synthetic-source.js +101 -0
  185. package/dist/parsers/large-document/window-builder.d.ts +24 -0
  186. package/dist/parsers/large-document/window-builder.d.ts.map +1 -0
  187. package/dist/parsers/large-document/window-builder.js +75 -0
  188. package/dist/parsers/ocr/index.d.ts +4 -0
  189. package/dist/parsers/ocr/index.d.ts.map +1 -0
  190. package/dist/parsers/ocr/index.js +4 -0
  191. package/dist/parsers/ocr/null-ocr-adapter.d.ts +3 -0
  192. package/dist/parsers/ocr/null-ocr-adapter.d.ts.map +1 -0
  193. package/dist/parsers/ocr/null-ocr-adapter.js +14 -0
  194. package/dist/parsers/ocr/ocr-pipeline-parser.d.ts +8 -0
  195. package/dist/parsers/ocr/ocr-pipeline-parser.d.ts.map +1 -0
  196. package/dist/parsers/ocr/ocr-pipeline-parser.js +147 -0
  197. package/dist/parsers/ocr/types.d.ts +16 -0
  198. package/dist/parsers/ocr/types.d.ts.map +1 -0
  199. package/dist/parsers/ocr/types.js +4 -0
  200. package/dist/parsers/parser-test-fixtures.d.ts +28 -0
  201. package/dist/parsers/parser-test-fixtures.d.ts.map +1 -0
  202. package/dist/parsers/parser-test-fixtures.js +139 -0
  203. package/dist/parsers/pdf-parser.d.ts +43 -0
  204. package/dist/parsers/pdf-parser.d.ts.map +1 -0
  205. package/dist/parsers/pdf-parser.js +388 -0
  206. package/dist/parsers/registry.d.ts +8 -0
  207. package/dist/parsers/registry.d.ts.map +1 -0
  208. package/dist/parsers/registry.js +57 -0
  209. package/dist/parsers/text-parser.d.ts +3 -0
  210. package/dist/parsers/text-parser.d.ts.map +1 -0
  211. package/dist/parsers/text-parser.js +214 -0
  212. package/dist/parsers/types.d.ts +53 -0
  213. package/dist/parsers/types.d.ts.map +1 -0
  214. package/dist/parsers/types.js +21 -0
  215. package/dist/parsers/unsupported-parser.d.ts +4 -0
  216. package/dist/parsers/unsupported-parser.d.ts.map +1 -0
  217. package/dist/parsers/unsupported-parser.js +97 -0
  218. package/dist/parsers/xlsx-parser.d.ts +3 -0
  219. package/dist/parsers/xlsx-parser.d.ts.map +1 -0
  220. package/dist/parsers/xlsx-parser.js +425 -0
  221. package/dist/privacy/audit-emitter.d.ts +5 -0
  222. package/dist/privacy/audit-emitter.d.ts.map +1 -0
  223. package/dist/privacy/audit-emitter.js +93 -0
  224. package/dist/privacy/diagnostic-redactor.d.ts +2 -0
  225. package/dist/privacy/diagnostic-redactor.d.ts.map +1 -0
  226. package/dist/privacy/diagnostic-redactor.js +153 -0
  227. package/dist/privacy/index.d.ts +5 -0
  228. package/dist/privacy/index.d.ts.map +1 -0
  229. package/dist/privacy/index.js +6 -0
  230. package/dist/privacy/retention-applier.d.ts +5 -0
  231. package/dist/privacy/retention-applier.d.ts.map +1 -0
  232. package/dist/privacy/retention-applier.js +88 -0
  233. package/dist/privacy/types.d.ts +98 -0
  234. package/dist/privacy/types.d.ts.map +1 -0
  235. package/dist/privacy/types.js +12 -0
  236. package/dist/qualityIntelligence/capsuleCorpus.d.ts +27 -0
  237. package/dist/qualityIntelligence/capsuleCorpus.d.ts.map +1 -0
  238. package/dist/qualityIntelligence/capsuleCorpus.js +58 -0
  239. package/dist/qualityIntelligence/index.d.ts +3 -0
  240. package/dist/qualityIntelligence/index.d.ts.map +1 -0
  241. package/dist/qualityIntelligence/index.js +5 -0
  242. package/dist/qualityIntelligence/qiHandoff.d.ts +36 -0
  243. package/dist/qualityIntelligence/qiHandoff.d.ts.map +1 -0
  244. package/dist/qualityIntelligence/qiHandoff.js +82 -0
  245. package/dist/retrieval/answer-grounding.d.ts +9 -0
  246. package/dist/retrieval/answer-grounding.d.ts.map +1 -0
  247. package/dist/retrieval/answer-grounding.js +31 -0
  248. package/dist/retrieval/context-pack-assembler.d.ts +24 -0
  249. package/dist/retrieval/context-pack-assembler.d.ts.map +1 -0
  250. package/dist/retrieval/context-pack-assembler.js +50 -0
  251. package/dist/retrieval/index.d.ts +6 -0
  252. package/dist/retrieval/index.d.ts.map +1 -0
  253. package/dist/retrieval/index.js +9 -0
  254. package/dist/retrieval/retrieval-runner.d.ts +10 -0
  255. package/dist/retrieval/retrieval-runner.d.ts.map +1 -0
  256. package/dist/retrieval/retrieval-runner.js +163 -0
  257. package/dist/retrieval/scoped-vector-search.d.ts +24 -0
  258. package/dist/retrieval/scoped-vector-search.d.ts.map +1 -0
  259. package/dist/retrieval/scoped-vector-search.js +864 -0
  260. package/dist/retrieval/types.d.ts +28 -0
  261. package/dist/retrieval/types.d.ts.map +1 -0
  262. package/dist/retrieval/types.js +33 -0
  263. package/dist/section-path-hash.d.ts +3 -0
  264. package/dist/section-path-hash.d.ts.map +1 -0
  265. package/dist/section-path-hash.js +9 -0
  266. package/dist/source-lifecycle.d.ts +14 -0
  267. package/dist/source-lifecycle.d.ts.map +1 -0
  268. package/dist/source-lifecycle.js +155 -0
  269. package/dist/source-routing-validation.d.ts +11 -0
  270. package/dist/source-routing-validation.d.ts.map +1 -0
  271. package/dist/source-routing-validation.js +140 -0
  272. package/dist/store-content-cipher.d.ts +11 -0
  273. package/dist/store-content-cipher.d.ts.map +1 -0
  274. package/dist/store-content-cipher.js +67 -0
  275. package/dist/store-content-encryption.d.ts +12 -0
  276. package/dist/store-content-encryption.d.ts.map +1 -0
  277. package/dist/store-content-encryption.js +275 -0
  278. package/dist/store-paths.d.ts +6 -0
  279. package/dist/store-paths.d.ts.map +1 -0
  280. package/dist/store-paths.js +61 -0
  281. package/dist/store.d.ts +30 -0
  282. package/dist/store.d.ts.map +1 -0
  283. package/dist/store.js +219 -0
  284. package/dist/testing.d.ts +47 -0
  285. package/dist/testing.d.ts.map +1 -0
  286. package/dist/testing.js +170 -0
  287. package/dist/version.d.ts +2 -0
  288. package/dist/version.d.ts.map +1 -0
  289. package/dist/version.js +4 -0
  290. package/package.json +43 -0
@@ -0,0 +1,218 @@
1
+ // Per-document chunker orchestrator (Epic #189, Issue #195).
2
+ //
3
+ // Reads parsed_units for the document, runs the pure `chunkParsedUnit` per unit, and
4
+ // persists chunks inside a single transaction so a mid-document failure (or AbortSignal
5
+ // cancellation) rolls back ALL chunks for the document — never half-chunked state.
6
+ //
7
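+ // Idempotency: with `force: false` (default) and existing chunks already in the table,
8
+ // the runner is a no-op and returns `skippedExisting: true`, unless those chunks are flagged stale for the current chunking strategy key, in which case they are deleted and rebuilt. With `force: true`, prior
9
+ // chunks are deleted at the start of the transaction.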
10
+ import { chunkParsedUnit, chunkingStrategyKey, resolveChunkingOptions } from "./chunker.js";
11
+ import { countChunksForDocument, deleteChunksForDocument, hasStaleChunksForDocument, insertChunkRow, selectDocumentSourceId, selectParsedUnitsForDocument, } from "./chunker-persist.js";
12
+ import { ChunkingError } from "./types.js";
13
+ // ─── Row → ParsedUnit reconstitution ──────────────────────────────────────────
14
+ // The parsed_units table is the canonical write surface for #194. We re-hydrate the
15
+ // discriminant union here so the pure chunker stays unaware of SQLite. Defensive: any
16
+ // row with a missing field for its kind raises a ChunkingError rather than producing a
17
+ // partially-typed value that crashes the slicer later.
18
// Returns `value` unchanged when the row actually carries it; otherwise raises a
// ChunkingError naming the parsed_unit (`unitId`) and the missing column (`field`).
// Both `null` and `undefined` count as missing: an `undefined` that slipped through
// would later turn into NaN offsets and silently produce no chunks instead of a
// diagnosable error.
function expectNumber(value, field, unitId) {
    if (value === null || value === undefined) {
        throw new ChunkingError(`parsed_unit ${unitId} is missing required field ${field}`);
    }
    return value;
}
24
// Decodes a stored JSON column into a string[].
//   raw    — the stored column value; `null` means the column is absent.
//   field  — column name, used only in error messages.
//   unitId — parsed_unit row id, used only in error messages.
//   cipher — object whose `openText(raw)` returns the JSON text to parse.
// Throws ChunkingError when the column is null, when the text is not valid JSON, or
// when the decoded value is not an array consisting solely of strings.
function parseStringArrayField(raw, field, unitId, cipher) {
    if (raw === null) {
        throw new ChunkingError(`parsed_unit ${unitId} is missing required field ${field}`);
    }
    const plaintext = cipher.openText(raw);
    let parsed;
    try {
        parsed = JSON.parse(plaintext);
    }
    catch (cause) {
        // Malformed JSON is a row-shape problem like any other: report it with the same
        // error type and message as a wrongly-typed payload, keeping the parser error
        // attached as the cause.
        throw new ChunkingError(`parsed_unit ${unitId} field ${field} did not deserialise to string[]`, { cause });
    }
    if (!Array.isArray(parsed) || !parsed.every((entry) => typeof entry === "string")) {
        throw new ChunkingError(`parsed_unit ${unitId} field ${field} did not deserialise to string[]`);
    }
    return parsed;
}
34
// Rebuilds a `page` unit from a parsed_units row. The page number and both character
// offsets are required (validated through expectNumber); `pageLabel` is only present
// on the result when the row's page_label column is not null.
function rowToPageUnit(row, documentId) {
    const pageNumber = expectNumber(row.page_number, "page_number", row.id);
    const labelPart = row.page_label !== null ? { pageLabel: row.page_label } : {};
    const characterStart = expectNumber(row.character_start, "character_start", row.id);
    const characterEnd = expectNumber(row.character_end, "character_end", row.id);
    return { kind: "page", documentId, pageNumber, ...labelPart, characterStart, characterEnd };
}
44
// Rebuilds a `section` unit from a parsed_units row. The section path is decoded from
// the section_path_json column via parseStringArrayField (using `cipher`); both
// character offsets are required.
function rowToSectionUnit(row, documentId, cipher) {
    const sectionPath = parseStringArrayField(row.section_path_json, "section_path_json", row.id, cipher);
    const characterStart = expectNumber(row.character_start, "character_start", row.id);
    const characterEnd = expectNumber(row.character_end, "character_end", row.id);
    return { kind: "section", documentId, sectionPath, characterStart, characterEnd };
}
53
// Rebuilds a `json-path` unit from a parsed_units row. A null json_pointer column is
// rejected with a ChunkingError; both character offsets are required.
function rowToJsonPathUnit(row, documentId) {
    const pointerMissing = row.json_pointer === null;
    if (pointerMissing) {
        throw new ChunkingError(`parsed_unit ${row.id} missing json_pointer`);
    }
    const jsonPointer = row.json_pointer;
    const characterStart = expectNumber(row.character_start, "character_start", row.id);
    const characterEnd = expectNumber(row.character_end, "character_end", row.id);
    return { kind: "json-path", documentId, jsonPointer, characterStart, characterEnd };
}
65
// Rebuilds a `csv-row` unit from a parsed_units row. A null table_name column is
// rejected with a ChunkingError; the row index and both character offsets are required.
function rowToCsvRowUnit(row, documentId) {
    const tableMissing = row.table_name === null;
    if (tableMissing) {
        throw new ChunkingError(`parsed_unit ${row.id} missing table_name`);
    }
    const tableName = row.table_name;
    const rowIndex = expectNumber(row.row_index, "row_index", row.id);
    const characterStart = expectNumber(row.character_start, "character_start", row.id);
    const characterEnd = expectNumber(row.character_end, "character_end", row.id);
    return { kind: "csv-row", documentId, tableName, rowIndex, characterStart, characterEnd };
}
78
// Rebuilds an `html-block` unit from a parsed_units row. The heading path is optional:
// it is decoded (via parseStringArrayField and `cipher`) only when heading_path_json
// is not null, and `headingPath` is omitted from the result otherwise. Both character
// offsets are required.
function rowToHtmlBlockUnit(row, documentId, cipher) {
    let headingPath;
    if (row.heading_path_json !== null) {
        headingPath = parseStringArrayField(row.heading_path_json, "heading_path_json", row.id, cipher);
    }
    const headingPart = headingPath === undefined ? {} : { headingPath };
    const characterStart = expectNumber(row.character_start, "character_start", row.id);
    const characterEnd = expectNumber(row.character_end, "character_end", row.id);
    return { kind: "html-block", documentId, ...headingPart, characterStart, characterEnd };
}
90
// Converts one parsed_units row back into the matching parsed-unit variant, selected
// by the row's `kind`. `cipher` is forwarded only to the variants that decode a JSON
// column (section, html-block). An `unsupported-media` row carries no offsets; its
// reason falls back to "unknown" when the column is null/undefined. Any other kind
// raises a ChunkingError.
export function rowToParsedUnit(row, documentId, cipher) {
    const kind = row.kind;
    if (kind === "page") {
        return rowToPageUnit(row, documentId);
    }
    if (kind === "section") {
        return rowToSectionUnit(row, documentId, cipher);
    }
    if (kind === "json-path") {
        return rowToJsonPathUnit(row, documentId);
    }
    if (kind === "csv-row") {
        return rowToCsvRowUnit(row, documentId);
    }
    if (kind === "html-block") {
        return rowToHtmlBlockUnit(row, documentId, cipher);
    }
    if (kind === "unsupported-media") {
        const reason = row.unsupported_reason ?? "unknown";
        return { kind: "unsupported-media", documentId, reason };
    }
    throw new ChunkingError(`parsed_unit ${row.id} has unknown kind ${row.kind}`);
}
112
+ // ─── Cancellation helper ─────────────────────────────────────────────────────
113
// Raises a ChunkingError when `signal` exists and its `aborted` flag is exactly true;
// a missing signal or any other flag value is a no-op.
function throwIfAborted(signal) {
    const aborted = signal?.aborted === true;
    if (!aborted) {
        return;
    }
    throw new ChunkingError("chunkDocument aborted via AbortSignal");
}
118
+ // ─── ID composition ──────────────────────────────────────────────────────────
119
+ // Chunk IDs are deterministic on (documentId, parsedUnitRowId, orderIndex). Using a
120
+ // composite scheme — rather than UUIDs — keeps the chunks table re-runnable: a
121
+ // re-chunk with force=true reproduces byte-identical row IDs, which makes the audit /
122
+ // evidence-manifest layer's row-equality assertions hold across runs.
123
// Builds the chunk id as `<documentId>#<parsedUnitRowId>#c<orderIndex>`; the same three
// inputs always yield the same string.
export function composeChunkId(documentId, parsedUnitRowId, orderIndex) {
    const documentPart = String(documentId);
    const unitPart = `${parsedUnitRowId}`;
    const orderPart = `c${String(orderIndex)}`;
    return [documentPart, unitPart, orderPart].join("#");
}
126
// Per-document chunk ceiling: the `maxChunks` value produced by resolveChunkingOptions
// for the caller's options.
function documentMaxChunks(options) {
    const { maxChunks } = resolveChunkingOptions(options);
    return maxChunks;
}
129
// Returns a fresh options object whose `maxChunks` is `remaining`, carrying over every
// other caller-supplied option. The input object is not mutated.
function optionsWithRemainingChunkBudget(options, remaining) {
    if (options === undefined) {
        return { maxChunks: remaining };
    }
    return { ...options, maxChunks: remaining };
}
132
// Chunks every parsed_units row of one document and inserts the resulting chunk rows.
//   store   — provides `_internal.db` (insert target) and `_internal.contentCipher`.
//   ctx     — { capsuleId, sourceId, documentId, sourceText } for the document.
//   rows    — parsed_units rows, processed in the given order.
//   options — caller chunking options; also determines the strategy key stored per row.
//   signal  — optional abort signal, checked before each row.
// Returns the inserted chunk ids in insertion order. Throws ChunkingError when the
// signal is aborted, or when a further row is reached after the document-level chunk
// budget has already been used up. This function does not manage a transaction itself.
function persistAllChunks(store, ctx, rows, options, signal) {
    const database = store._internal.db;
    const persistedIds = [];
    const chunkLimit = documentMaxChunks(options);
    const strategyVersion = chunkingStrategyKey(options);
    for (const unitRow of rows) {
        throwIfAborted(signal);
        const budgetLeft = chunkLimit - persistedIds.length;
        if (budgetLeft <= 0) {
            throw new ChunkingError(`chunkDocument exceeded maxChunks ${String(chunkLimit)}`);
        }
        const parsedUnit = rowToParsedUnit(unitRow, ctx.documentId, store._internal.contentCipher);
        // Each unit is chunked with only the budget that is still unspent.
        const budgetedOptions = optionsWithRemainingChunkBudget(options, budgetLeft);
        const slices = chunkParsedUnit(parsedUnit, ctx.sourceText, budgetedOptions);
        for (const slice of slices) {
            // The order index is document-wide: it equals the number of chunks persisted so far.
            const orderIndex = persistedIds.length;
            const id = composeChunkId(ctx.documentId, unitRow.id, orderIndex);
            insertChunkRow(database, {
                id,
                capsuleId: ctx.capsuleId,
                sourceId: ctx.sourceId,
                documentId: ctx.documentId,
                parsedUnitId: unitRow.id,
                orderIndex,
                tokenCount: slice.tokenCount,
                safeExcerptHash: slice.safeExcerptHash,
                chunkingStrategyVersion: strategyVersion,
                characterStart: slice.characterStart,
                characterEnd: slice.characterEnd,
            });
            persistedIds.push(id);
        }
    }
    return persistedIds;
}
167
// Gathers what chunkDocument needs to know before it writes: how many chunks already
// exist for the document, and whether hasStaleChunksForDocument reports them stale for
// the strategy key derived from `options`. The staleness query is skipped entirely
// when there are no existing chunks.
function loadChunkingPreflight(store, capsuleId, documentId, options) {
    const database = store._internal.db;
    const existingCount = countChunksForDocument(database, capsuleId, documentId);
    let staleChunks = existingCount > 0;
    if (staleChunks) {
        staleChunks = hasStaleChunksForDocument(database, capsuleId, documentId, chunkingStrategyKey(options));
    }
    return { existingCount, staleChunks };
}
176
// Guards against chunking a document under the wrong source: looks up the source id
// via selectDocumentSourceId and throws a ChunkingError when one is returned and its
// string form differs from the caller's `sourceId`. An `undefined` lookup result
// passes without complaint.
function assertDocumentSourceMatches(store, capsuleId, documentId, sourceId) {
    const recordedSourceId = selectDocumentSourceId(store._internal.db, capsuleId, documentId);
    if (recordedSourceId === undefined) {
        return;
    }
    if (String(recordedSourceId) === String(sourceId)) {
        return;
    }
    throw new ChunkingError(`chunkDocument sourceId ${String(sourceId)} does not match document ${String(documentId)} source ${String(recordedSourceId)}`);
}
182
// Existing chunks are reused only when there are some, the caller did not pass
// `force === true`, and the preflight did not flag them as stale.
function shouldReuseExistingChunks(preflight, force) {
    if (!(preflight.existingCount > 0)) {
        return false;
    }
    if (force === true) {
        return false;
    }
    return !preflight.staleChunks;
}
185
// Existing chunks must be deleted before re-chunking when a rebuild is wanted (forced
// by the caller, or flagged stale by the preflight) and there is at least one chunk
// to delete.
function shouldDeleteExistingChunks(preflight, force) {
    const rebuildWanted = force === true || preflight.staleChunks;
    return rebuildWanted && preflight.existingCount > 0;
}
188
// Chunks one document and persists the chunks atomically.
//   store   — knowledge store; `store._internal.db` must expose a synchronous `exec(sql)`.
//   params  — { capsuleId, sourceId, documentId, sourceText, force?, signal? }.
//   options — optional chunking options (forwarded to the chunker and strategy key).
// Returns { capsuleId, documentId, chunkIds, skippedExisting }:
//   - skippedExisting: true with no ids when existing chunks are reused;
//   - skippedExisting: false with no ids when the document has no parsed_units rows
//     (this path returns before any transaction is opened, so existing chunks are
//     left untouched);
//   - skippedExisting: false with the new ids after a successful commit.
// Throws ChunkingError on abort, on a source-id mismatch, and on any failure while
// chunking or persisting; a failure that is not already a ChunkingError is wrapped,
// with the original attached as `cause`.
export function chunkDocument(store, params, options) {
    const { capsuleId, sourceId, documentId, sourceText, force, signal } = params;
    throwIfAborted(signal);
    const db = store._internal.db;
    const preflight = loadChunkingPreflight(store, capsuleId, documentId, options);
    assertDocumentSourceMatches(store, capsuleId, documentId, sourceId);
    if (shouldReuseExistingChunks(preflight, force)) {
        return { capsuleId, documentId, chunkIds: [], skippedExisting: true };
    }
    const rows = selectParsedUnitsForDocument(db, capsuleId, documentId);
    if (rows.length === 0) {
        return { capsuleId, documentId, chunkIds: [], skippedExisting: false };
    }
    db.exec("BEGIN");
    try {
        if (shouldDeleteExistingChunks(preflight, force)) {
            deleteChunksForDocument(db, capsuleId, documentId);
        }
        const ctx = { capsuleId, sourceId, documentId, sourceText };
        const chunkIds = persistAllChunks(store, ctx, rows, options, signal);
        // Last cancellation check before the work becomes durable.
        throwIfAborted(signal);
        db.exec("COMMIT");
        return { capsuleId, documentId, chunkIds, skippedExisting: false };
    }
    catch (cause) {
        try {
            db.exec("ROLLBACK");
        }
        catch {
            // The rollback can itself fail (for example when the engine has already ended
            // the transaction). The original failure is the one worth reporting, so the
            // rollback error is intentionally dropped rather than allowed to mask it.
        }
        if (cause instanceof ChunkingError)
            throw cause;
        throw new ChunkingError(`chunkDocument failed for document ${String(documentId)}`, cause === undefined ? undefined : { cause });
    }
}
@@ -0,0 +1,7 @@
1
+ import type { ParsedUnit } from "@oscharko-dev/keiko-contracts";
2
+ import type { ChunkingOptions, ChunkingResult, ResolvedChunkingOptions } from "./types.js";
3
+ export declare function resolveChunkingOptions(options: ChunkingOptions | undefined): ResolvedChunkingOptions;
4
+ export declare function chunkingStrategyKey(options: ChunkingOptions | undefined): string;
5
+ export declare function chunkDedupeKey(text: string): string | undefined;
6
+ export declare function chunkParsedUnit(unit: ParsedUnit, sourceText: string, options?: ChunkingOptions): readonly ChunkingResult[];
7
+ //# sourceMappingURL=chunker.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"chunker.d.ts","sourceRoot":"","sources":["../../src/chunking/chunker.ts"],"names":[],"mappings":"AAuBA,OAAO,KAAK,EAAE,UAAU,EAAE,MAAM,+BAA+B,CAAC;AAGhE,OAAO,KAAK,EACV,eAAe,EACf,cAAc,EACd,uBAAuB,EAExB,MAAM,YAAY,CAAC;AAiCpB,wBAAgB,sBAAsB,CACpC,OAAO,EAAE,eAAe,GAAG,SAAS,GACnC,uBAAuB,CAgBzB;AAED,wBAAgB,mBAAmB,CAAC,OAAO,EAAE,eAAe,GAAG,SAAS,GAAG,MAAM,CAWhF;AAuBD,wBAAgB,cAAc,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,GAAG,SAAS,CAK/D;AAuDD,wBAAgB,eAAe,CAC7B,IAAI,EAAE,UAAU,EAChB,UAAU,EAAE,MAAM,EAClB,OAAO,CAAC,EAAE,eAAe,GACxB,SAAS,cAAc,EAAE,CAyB3B"}
@@ -0,0 +1,139 @@
1
+ // Pure chunking function (Epic #189, Issue #195).
2
+ //
3
+ // Given a parsed unit + the document's full source text + chunking options, produce one
4
+ // or more `ChunkingResult` slices. The function is deliberately pure (no IO, no clock, no
5
+ // hashing of external state) so it can be unit-tested without a SQLite store.
6
+ //
7
+ // Algorithm:
8
+ // 1. Resolve the unit's character span. For unit kinds that carry `characterStart/end`
9
+ // (page/section/json-path/csv-row/html-block), slice the source text by those
10
+ // offsets. For unsupported-media units (no offsets), emit nothing — these units
11
+ // are tracked for diagnostics, not for retrieval.
12
+ // 2. If the slice's estimated tokens <= maxTokens, emit a single chunk over the entire
13
+ // slice. minTokens is not a gate, so a tiny unit is never dropped for being short.
14
+ // 3. Otherwise walk forward by `maxChars - overlapChars` per step, emitting chunks of
15
+ // length `maxChars`. The last chunk includes whatever trailing text remains, even
16
+ // if it is shorter than minTokens — never drop content.
17
+ // 4. Hostile fallback: when no whitespace appears inside the maxChars window (single
18
+ // very long line), the algorithm still produces chunks because slicing is purely
19
+ // character-bounded. Token boundaries become advisory, not authoritative; that
20
+ // tradeoff is intentional and documented.
21
+ import { createHash } from "node:crypto";
22
+ import { charsForTokenBudget } from "./token-estimator.js";
23
+ import { ChunkingError, DEFAULT_CHUNKING_STRATEGY_KEY, DEFAULT_MAX_CHUNKS, DEFAULT_MAX_TOKENS, DEFAULT_MIN_TOKENS, DEFAULT_OVERLAP_TOKENS, MAX_CHUNK_TOKENS, MAX_OVERLAP_TOKENS, CHUNKING_STRATEGY_VERSION, } from "./types.js";
24
+ import { defaultTokenEstimator } from "./token-estimator.js";
25
+ const WHITESPACE_PATTERN = /\s+/gu;
26
+ const INFORMATIVE_CHARACTER_PATTERN = /[\p{L}\p{N}]/u;
27
// Resolves an option that must be >= 1: uses `raw` unless it is null/undefined, in
// which case `fallback` applies. Throws ChunkingError (naming `field`) when the chosen
// value is not a finite number or is below 1; otherwise returns it rounded down.
function positiveInteger(raw, fallback, field) {
    const candidate = raw ?? fallback;
    const invalid = !Number.isFinite(candidate) || candidate < 1;
    if (invalid) {
        throw new ChunkingError(`${field} must be a positive finite integer`);
    }
    return Math.floor(candidate);
}
34
// Resolves an option that must be >= 0: uses `raw` unless it is null/undefined, in
// which case `fallback` applies. Throws ChunkingError (naming `field`) when the chosen
// value is not a finite number or is negative; otherwise returns it rounded down.
function nonNegativeInteger(raw, fallback, field) {
    const candidate = raw ?? fallback;
    const invalid = !Number.isFinite(candidate) || candidate < 0;
    if (invalid) {
        throw new ChunkingError(`${field} must be a non-negative finite integer`);
    }
    return Math.floor(candidate);
}
41
// Fills in defaults and enforces limits for the caller's chunking options.
// Validation runs in the order maxTokens, minTokens, overlapTokens, maxChunks and
// throws ChunkingError on the first invalid value. After validation, maxTokens is
// capped at MAX_CHUNK_TOKENS, overlapTokens at MAX_OVERLAP_TOKENS and maxChunks at
// DEFAULT_MAX_CHUNKS; the token estimator defaults to defaultTokenEstimator.
export function resolveChunkingOptions(options) {
    const requestedMaxTokens = positiveInteger(options?.maxTokens, DEFAULT_MAX_TOKENS, "maxTokens");
    const minTokens = nonNegativeInteger(options?.minTokens, DEFAULT_MIN_TOKENS, "minTokens");
    const requestedOverlap = nonNegativeInteger(options?.overlapTokens, DEFAULT_OVERLAP_TOKENS, "overlapTokens");
    const requestedMaxChunks = positiveInteger(options?.maxChunks, DEFAULT_MAX_CHUNKS, "maxChunks");
    return {
        maxTokens: Math.min(requestedMaxTokens, MAX_CHUNK_TOKENS),
        minTokens,
        overlapTokens: Math.min(requestedOverlap, MAX_OVERLAP_TOKENS),
        maxChunks: Math.min(requestedMaxChunks, DEFAULT_MAX_CHUNKS),
        tokenEstimator: options?.tokenEstimator ?? defaultTokenEstimator,
    };
}
49
// Produces the string that identifies the chunking configuration in effect.
// With no options at all it is DEFAULT_CHUNKING_STRATEGY_KEY. Otherwise it is the
// strategy version followed by the resolved max/min/overlap/limit values and an
// estimator tag ("custom" when the caller supplied a tokenEstimator), joined by "|".
export function chunkingStrategyKey(options) {
    if (options === undefined) {
        return DEFAULT_CHUNKING_STRATEGY_KEY;
    }
    const { maxTokens, minTokens, overlapTokens, maxChunks } = resolveChunkingOptions(options);
    const estimatorTag = options.tokenEstimator === undefined ? "estimator=default" : "estimator=custom";
    const parts = [
        CHUNKING_STRATEGY_VERSION,
        `max=${String(maxTokens)}`,
        `min=${String(minTokens)}`,
        `overlap=${String(overlapTokens)}`,
        `limit=${String(maxChunks)}`,
    ];
    parts.push(estimatorTag);
    return parts.join("|");
}
62
// Works out which part of the source text a unit covers. Offsets are clamped into
// [0, sourceLength], and the end is never allowed to precede the start. Returns
// `undefined` for unsupported-media units and for spans that end up empty.
function spanForUnit(unit, sourceLength) {
    if (unit.kind === "unsupported-media") {
        return undefined;
    }
    const clampFrom = (lowerBound, offset) => Math.max(lowerBound, Math.min(offset, sourceLength));
    const start = clampFrom(0, unit.characterStart);
    const end = clampFrom(start, unit.characterEnd);
    return end <= start ? undefined : { start, end };
}
71
// Lower-case hex SHA-256 digest of `text`, hashed as UTF-8.
function hashExcerpt(text) {
    const hasher = createHash("sha256");
    hasher.update(text, "utf8");
    return hasher.digest("hex");
}
74
// Canonical text form used for de-duplication: NFKC-normalised, every match of
// WHITESPACE_PATTERN replaced by a single space, and outer whitespace trimmed.
function normaliseChunkText(text) {
    const compatibilityForm = text.normalize("NFKC");
    const collapsed = compatibilityForm.replace(WHITESPACE_PATTERN, " ");
    return collapsed.trim();
}
77
// De-duplication key for a piece of text: the hash of its normalised form. Returns
// `undefined` when the normalised text is empty or does not match
// INFORMATIVE_CHARACTER_PATTERN (i.e. it carries nothing worth keying on).
export function chunkDedupeKey(text) {
    const canonical = normaliseChunkText(text);
    const informative = canonical.length > 0 && INFORMATIVE_CHARACTER_PATTERN.test(canonical);
    return informative ? hashExcerpt(canonical) : undefined;
}
85
// Builds the chunk record for sourceText[start, end). Returns `undefined` when the
// excerpt has no de-duplication key (chunkDedupeKey yields undefined), so such
// excerpts are never emitted. The token count comes from `estimator` and the hash is
// taken over the raw, un-normalised excerpt.
function buildChunk(sourceText, start, end, estimator) {
    const excerpt = sourceText.slice(start, end);
    const informative = chunkDedupeKey(excerpt) !== undefined;
    if (!informative) {
        return undefined;
    }
    const tokenCount = estimator(excerpt);
    const safeExcerptHash = hashExcerpt(excerpt);
    return { characterStart: start, characterEnd: end, tokenCount, safeExcerptHash };
}
96
// Converts the resolved token budgets into character sizes for the sliding window:
// `maxChars` (at least 1), `overlapChars`, and `stride` = maxChars - overlapChars.
function computeStepSizes(resolved) {
    const maxChars = Math.max(1, charsForTokenBudget(resolved.maxTokens));
    const requestedOverlap = charsForTokenBudget(resolved.overlapTokens);
    // The overlap is held within [0, maxChars - 1] so the stride is always >= 1;
    // a stride of zero would keep the chunking loop from ever advancing.
    const overlapCeiling = maxChars - 1;
    const overlapChars = Math.max(0, Math.min(requestedOverlap, overlapCeiling));
    return { maxChars, overlapChars, stride: maxChars - overlapChars };
}
104
// True when the whole excerpt fits in one chunk, i.e. its estimated token count does
// not exceed maxTokens. minTokens plays no part here, so a very small unit still
// yields a single chunk rather than being skipped for its size.
function shouldEmitSingleChunk(excerpt, resolved) {
    const estimatedTokens = resolved.tokenEstimator(excerpt);
    const fitsInOneChunk = estimatedTokens <= resolved.maxTokens;
    return fitsInOneChunk;
}
110
// Appends `chunk` to `chunks` (mutating the array). An `undefined` chunk is ignored.
// Throws ChunkingError when the array already holds `maxChunks` entries or more.
function pushChunk(chunks, chunk, maxChunks) {
    if (chunk !== undefined) {
        const budgetExhausted = chunks.length >= maxChunks;
        if (budgetExhausted) {
            throw new ChunkingError(`chunkParsedUnit exceeded maxChunks ${String(maxChunks)}`);
        }
        chunks.push(chunk);
    }
}
118
// Splits one parsed unit into chunk records over `sourceText`.
//   - No usable span (see spanForUnit)      -> [].
//   - Span fits within maxTokens            -> at most one chunk covering the span.
//   - Otherwise                             -> overlapping windows of `maxChars`
//     characters, advancing by `stride`; the final window is cut at the span end.
// Windows for which buildChunk returns undefined are skipped. pushChunk throws a
// ChunkingError once the resolved `maxChunks` would be exceeded.
export function chunkParsedUnit(unit, sourceText, options) {
    const resolved = resolveChunkingOptions(options);
    const span = spanForUnit(unit, sourceText.length);
    if (span === undefined) {
        return [];
    }
    const { start: spanStart, end: spanEnd } = span;
    if (shouldEmitSingleChunk(sourceText.slice(spanStart, spanEnd), resolved)) {
        const whole = buildChunk(sourceText, spanStart, spanEnd, resolved.tokenEstimator);
        return whole === undefined ? [] : [whole];
    }
    const { maxChars, stride } = computeStepSizes(resolved);
    const collected = [];
    for (let cursor = spanStart; cursor < spanEnd; cursor += stride) {
        const windowEnd = Math.min(cursor + maxChars, spanEnd);
        pushChunk(collected, buildChunk(sourceText, cursor, windowEnd, resolved.tokenEstimator), resolved.maxChunks);
        // Once a window reaches the span end there is nothing left to cover.
        if (windowEnd >= spanEnd) {
            break;
        }
    }
    return collected;
}
@@ -0,0 +1,4 @@
1
+ import type { CapsuleSetId, ChunkId, CitationReference, KnowledgeCapsuleId } from "@oscharko-dev/keiko-contracts";
2
+ import type { KnowledgeStore } from "../store.js";
3
/**
 * Resolves a chunk, identified by capsule and chunk id, to a citation reference.
 *
 * @param store - Knowledge store the lookup reads from.
 * @param capsuleId - Capsule that scopes the lookup.
 * @param chunkId - Chunk to resolve.
 * @param _capsuleSetId - Optional and currently unused by the implementation.
 * @returns The citation, or `null` when no citation can be produced.
 */
export declare function mapChunkToCitation(
    store: KnowledgeStore,
    capsuleId: KnowledgeCapsuleId,
    chunkId: ChunkId,
    _capsuleSetId?: CapsuleSetId
): CitationReference | null;
4
+ //# sourceMappingURL=citation-mapper.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"citation-mapper.d.ts","sourceRoot":"","sources":["../../src/chunking/citation-mapper.ts"],"names":[],"mappings":"AAuBA,OAAO,KAAK,EACV,YAAY,EACZ,OAAO,EACP,iBAAiB,EAEjB,kBAAkB,EAEnB,MAAM,+BAA+B,CAAC;AAGvC,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAiPlD,wBAAgB,kBAAkB,CAChC,KAAK,EAAE,cAAc,EACrB,SAAS,EAAE,kBAAkB,EAC7B,OAAO,EAAE,OAAO,EAChB,aAAa,CAAC,EAAE,YAAY,GAC3B,iBAAiB,GAAG,IAAI,CAc1B"}
@@ -0,0 +1,180 @@
1
+ // Citation hop (Epic #189, Issue #195).
2
+ //
3
+ // Given a (capsuleId, chunkId), produce a `CitationReference` by walking
4
+ // chunk → parsed_unit → document → page/section. The function is read-only and pure
5
+ // with respect to the store: it never mutates rows.
6
+ //
7
+ // Hop strategy:
8
+ // 1. Look up the chunk row. Returns null when the chunk is absent — distinct from
9
+ // throwing, because retrieval callers (#199) treat missing-chunk as "stale index
10
+ // pointer" and recover by re-running chunking, not by surfacing an error.
11
+ // 2. Look up its parsed_unit row in the same capsule scope.
12
+ // 3. From the parsed_unit's kind, hop:
13
+ // - kind=page: copy pageNumber/pageLabel + characterStart/End directly.
14
+ // - kind=section: copy sectionPath + characterStart/End directly.
15
+ // - kind=html-block: copy headingPath + characterStart/End.
16
+ // - kind=json-path: copy jsonPointer + characterStart/End.
17
+ // - kind=csv-row: copy tableName/rowIndex + characterStart/End.
18
+ // - other kinds (unsupported-media): characterStart/End only.
19
+ // THEN: if no page number is set yet and a persisted `pages` row fully contains the
20
+ // citation span (the chunk's span when present, else the parsed_unit's), attach that
21
+ // page's pageNumber/pageLabel — section units inside a paged document still surface one.
22
+ // 4. Document row provides safeDisplayName + sourceId.
23
// Chunk lookup by (capsule, chunk id). All statements below use named parameters.
const SELECT_CHUNK_SQL = [
    "SELECT id, capsule_id, source_id, document_id, parsed_unit_id, character_start, character_end",
    "FROM chunks",
    "WHERE capsule_id = :c AND id = :id",
].join(" ");
// Parsed-unit lookup by (capsule, parsed unit id); selects every column the per-kind hop
// projections read.
const SELECT_PARSED_UNIT_SQL =
    "SELECT kind, page_number, page_label, section_path_json, " +
    " heading_path_json, json_pointer, table_name, row_index, character_start, character_end " +
    "FROM parsed_units " +
    "WHERE capsule_id = :c AND id = :id";
// Document lookup by (capsule, document id).
const SELECT_DOCUMENT_SQL = [
    "SELECT source_id, safe_display_name",
    "FROM documents",
    "WHERE capsule_id = :c AND id = :id",
].join(" ");
// Page-hop query: selects a page of the document whose character range fully contains
// the span [:s, :e]. Used to attach a page number to non-page units (e.g. sections or
// html-blocks inside a paged document). When several pages qualify, the one with the
// lowest page_number wins (ORDER BY ... LIMIT 1).
const SELECT_PAGE_FOR_RANGE_SQL =
    "SELECT page_number, page_label FROM pages " +
    "WHERE capsule_id = :c AND document_id = :d " +
    " AND character_start <= :s AND character_end >= :e " +
    "ORDER BY page_number ASC LIMIT 1";
40
// Runs SELECT_CHUNK_SQL for (capsuleId, chunkId) and returns the statement's `.get`
// result as-is; callers treat `undefined` as "no such chunk". The chunk id is
// stringified before binding.
function fetchChunkRow(db, capsuleId, chunkId) {
    const statement = db.prepare(SELECT_CHUNK_SQL);
    const params = { c: capsuleId, id: String(chunkId) };
    return statement.get(params);
}
44
// Runs SELECT_PARSED_UNIT_SQL for (capsuleId, parsedUnitId) and returns the statement's
// `.get` result as-is; callers treat `undefined` as "no such parsed unit". The id is
// bound without conversion.
function fetchParsedUnitRow(db, capsuleId, parsedUnitId) {
    const statement = db.prepare(SELECT_PARSED_UNIT_SQL);
    const params = { c: capsuleId, id: parsedUnitId };
    return statement.get(params);
}
48
// Runs SELECT_DOCUMENT_SQL for (capsuleId, documentId) and returns the statement's `.get`
// result as-is; callers treat `undefined` as "no such document". The document id is
// stringified before binding.
function fetchDocumentRow(db, capsuleId, documentId) {
    const statement = db.prepare(SELECT_DOCUMENT_SQL);
    const params = { c: capsuleId, id: String(documentId) };
    return statement.get(params);
}
52
// Runs SELECT_PAGE_FOR_RANGE_SQL to find a page of `documentId` whose character range
// contains [characterStart, characterEnd]. Returns the statement's `.get` result as-is;
// callers treat `undefined` as "no containing page".
function fetchPageForRange(db, capsuleId, documentId, characterStart, characterEnd) {
    const statement = db.prepare(SELECT_PAGE_FOR_RANGE_SQL);
    const params = {
        c: capsuleId,
        d: String(documentId),
        s: characterStart,
        e: characterEnd,
    };
    return statement.get(params);
}
58
// Decodes a stored JSON column that is expected to hold an array of strings.
//
// `raw` is passed through `cipher.openText` (presumably decryption — confirm against the
// cipher implementation) and the result is parsed as JSON.
//
// Returns the parsed array, or `undefined` when:
//   - `raw` is null/undefined (no value recorded),
//   - the opened text is not valid JSON, or
//   - the JSON value is not an array consisting solely of strings.
// Errors thrown by `cipher.openText` itself are NOT swallowed and propagate to the caller.
function parseStringArray(raw, cipher) {
    if (raw === null || raw === undefined)
        return undefined;
    const plaintext = cipher.openText(raw);
    let parsed;
    try {
        parsed = JSON.parse(plaintext);
    }
    catch (error) {
        // Malformed JSON is handled like a wrongly-shaped value: the optional path is
        // omitted instead of failing the whole citation lookup. Anything that is not a
        // JSON syntax error is unexpected and is rethrown.
        if (error instanceof SyntaxError)
            return undefined;
        throw error;
    }
    if (!Array.isArray(parsed) || !parsed.every((entry) => typeof entry === "string")) {
        return undefined;
    }
    return parsed;
}
67
// Produces the default hop record for a parsed unit: every locator field is present but
// undefined, and the character span is copied from the unit row with null normalised
// to undefined.
function baseHopFields(unit) {
    const { character_start: spanStart, character_end: spanEnd } = unit;
    return {
        pageNumber: undefined,
        pageLabel: undefined,
        sectionPath: undefined,
        jsonPointer: undefined,
        tableName: undefined,
        rowIndex: undefined,
        characterStart: spanStart === null || spanStart === undefined ? undefined : spanStart,
        characterEnd: spanEnd === null || spanEnd === undefined ? undefined : spanEnd,
    };
}
79
// Per-kind projections from a parsed_unit row onto hop fields. Each entry receives the
// unit row, the base hop record and the content cipher, and returns a copy of the base
// with the locator fields for that kind filled in (null column values become undefined).
// Kinds without an entry fall back to the base record in `hopFieldsForUnit`.
const HOP_FIELDS_BY_KIND = new Map();
// page → page number and label straight from the row.
HOP_FIELDS_BY_KIND.set("page", (unit, base) => {
    return {
        ...base,
        pageNumber: unit.page_number ?? undefined,
        pageLabel: unit.page_label ?? undefined,
    };
});
// section → section path decoded from its stored JSON column.
HOP_FIELDS_BY_KIND.set("section", (unit, base, cipher) => {
    return {
        ...base,
        sectionPath: parseStringArray(unit.section_path_json, cipher),
    };
});
// html-block → heading path, surfaced through the same `sectionPath` field.
HOP_FIELDS_BY_KIND.set("html-block", (unit, base, cipher) => {
    return {
        ...base,
        sectionPath: parseStringArray(unit.heading_path_json, cipher),
    };
});
// json-path → JSON pointer.
HOP_FIELDS_BY_KIND.set("json-path", (unit, base) => {
    return {
        ...base,
        jsonPointer: unit.json_pointer ?? undefined,
    };
});
// csv-row → table name and row index.
HOP_FIELDS_BY_KIND.set("csv-row", (unit, base) => {
    return {
        ...base,
        tableName: unit.table_name ?? undefined,
        rowIndex: unit.row_index ?? undefined,
    };
});
118
// Computes the hop fields for a parsed unit: starts from the base record and, when a
// projection is registered for the unit's kind, applies it. Unregistered kinds (and a
// projection that yields null/undefined) fall back to the base record.
function hopFieldsForUnit(unit, cipher) {
    const fallback = baseHopFields(unit);
    const project = HOP_FIELDS_BY_KIND.get(unit.kind);
    if (project === undefined || project === null) {
        return fallback;
    }
    return project(unit, fallback, cipher) ?? fallback;
}
122
// Returns a copy of `hop` whose character span is narrowed to the chunk's own span.
// A null/undefined bound on the chunk row keeps the corresponding bound already in `hop`.
function applyChunkSpan(hop, chunk) {
    const narrowed = { ...hop };
    narrowed.characterStart = chunk.character_start ?? hop.characterStart;
    narrowed.characterEnd = chunk.character_end ?? hop.characterEnd;
    return narrowed;
}
129
// Adds page information to a hop that lacks it.
// The hop is returned unchanged (same object) when it already carries a page number,
// when either end of its character span is unknown, or when no page row contains the
// span. Otherwise a copy is returned with the page's number and label (a null label
// becomes undefined).
function attachPageHop(db, capsuleId, documentId, hop) {
    const alreadyPaged = hop.pageNumber !== undefined;
    const spanUnknown = hop.characterStart === undefined || hop.characterEnd === undefined;
    if (alreadyPaged || spanUnknown) {
        return hop;
    }
    const containingPage = fetchPageForRange(db, capsuleId, documentId, hop.characterStart, hop.characterEnd);
    if (containingPage === undefined) {
        return hop;
    }
    const pageFields = {
        pageNumber: containingPage.page_number,
        pageLabel: containingPage.page_label ?? undefined,
    };
    return { ...hop, ...pageFields };
}
143
// Assembles the CitationReference from the chunk row, the document row and the hop.
// The five identifying fields are always present; each optional locator field is added
// only when its hop value is defined, so the result never carries a key whose value is
// `undefined` (keeps the shape compatible with `exactOptionalPropertyTypes`).
function buildCitation(chunk, document, hop, chunkId, capsuleId) {
    const citation = {
        chunkId,
        capsuleId,
        sourceId: document.source_id,
        documentId: chunk.document_id,
        safeDisplayName: document.safe_display_name,
    };
    // Listed in the order the keys should appear on the resulting object.
    const optionalKeys = [
        "pageNumber",
        "pageLabel",
        "sectionPath",
        "jsonPointer",
        "tableName",
        "rowIndex",
        "characterStart",
        "characterEnd",
    ];
    for (const key of optionalKeys) {
        const value = hop[key];
        if (value !== undefined) {
            citation[key] = value;
        }
    }
    return citation;
}
163
// Resolves (capsuleId, chunkId) to a CitationReference, or null when the chunk row, its
// parsed_unit row, or its document row cannot be found in the capsule.
//
// Steps: load the three rows, derive the kind-specific hop fields from the parsed unit,
// narrow the character span to the chunk's own span, attach a containing page when one
// exists, then assemble the citation.
//
// `_capsuleSetId` is reserved for the future capsule-set-scoped lookup that retrieval
// (#199) will need. The hop is strictly capsule-scoped today, so the parameter is
// accepted for API stability and ignored.
export function mapChunkToCitation(store, capsuleId, chunkId, _capsuleSetId) {
    const { db } = store._internal;
    const chunkRow = fetchChunkRow(db, capsuleId, chunkId);
    if (chunkRow === undefined) {
        return null;
    }
    const unitRow = fetchParsedUnitRow(db, capsuleId, chunkRow.parsed_unit_id);
    if (unitRow === undefined) {
        return null;
    }
    const documentRow = fetchDocumentRow(db, capsuleId, chunkRow.document_id);
    if (documentRow === undefined) {
        return null;
    }
    const unitHop = hopFieldsForUnit(unitRow, store._internal.contentCipher);
    const chunkHop = applyChunkSpan(unitHop, chunkRow);
    const pagedHop = attachPageHop(db, capsuleId, chunkRow.document_id, chunkHop);
    return buildCitation(chunkRow, documentRow, pagedHop, chunkId, capsuleId);
}
@@ -0,0 +1,6 @@
1
+ export { chunkDedupeKey, chunkParsedUnit, chunkingStrategyKey, resolveChunkingOptions, } from "./chunker.js";
2
+ export { chunkDocument } from "./chunker-runner.js";
3
+ export { mapChunkToCitation } from "./citation-mapper.js";
4
+ export { defaultTokenEstimator, charsForTokenBudget } from "./token-estimator.js";
5
+ export { CHUNKING_STRATEGY_VERSION, ChunkingError, DEFAULT_CHUNKING_STRATEGY_KEY, DEFAULT_MAX_CHUNKS, DEFAULT_MAX_TOKENS, DEFAULT_MIN_TOKENS, DEFAULT_OVERLAP_TOKENS, MAX_CHUNK_TOKENS, MAX_OVERLAP_TOKENS, type ChunkDocumentParams, type ChunkDocumentResult, type ChunkingOptions, type ChunkingResult, type ResolvedChunkingOptions, type TokenEstimator, } from "./types.js";
6
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/chunking/index.ts"],"names":[],"mappings":"AAIA,OAAO,EACL,cAAc,EACd,eAAe,EACf,mBAAmB,EACnB,sBAAsB,GACvB,MAAM,cAAc,CAAC;AACtB,OAAO,EAAE,aAAa,EAAE,MAAM,qBAAqB,CAAC;AACpD,OAAO,EAAE,kBAAkB,EAAE,MAAM,sBAAsB,CAAC;AAC1D,OAAO,EAAE,qBAAqB,EAAE,mBAAmB,EAAE,MAAM,sBAAsB,CAAC;AAClF,OAAO,EACL,yBAAyB,EACzB,aAAa,EACb,6BAA6B,EAC7B,kBAAkB,EAClB,kBAAkB,EAClB,kBAAkB,EAClB,sBAAsB,EACtB,gBAAgB,EAChB,kBAAkB,EAClB,KAAK,mBAAmB,EACxB,KAAK,mBAAmB,EACxB,KAAK,eAAe,EACpB,KAAK,cAAc,EACnB,KAAK,uBAAuB,EAC5B,KAAK,cAAc,GACpB,MAAM,YAAY,CAAC"}
@@ -0,0 +1,8 @@
1
+ // Barrel for the chunking layer (Epic #189, Issue #195). Composed by the package barrel
2
+ // in ../index.ts; consumers outside the package never import from this subdirectory
3
+ // directly (ADR-0019 direction rule 3e + the trust-8 test-support naming convention).
4
+ export { chunkDedupeKey, chunkParsedUnit, chunkingStrategyKey, resolveChunkingOptions, } from "./chunker.js";
5
+ export { chunkDocument } from "./chunker-runner.js";
6
+ export { mapChunkToCitation } from "./citation-mapper.js";
7
+ export { defaultTokenEstimator, charsForTokenBudget } from "./token-estimator.js";
8
+ export { CHUNKING_STRATEGY_VERSION, ChunkingError, DEFAULT_CHUNKING_STRATEGY_KEY, DEFAULT_MAX_CHUNKS, DEFAULT_MAX_TOKENS, DEFAULT_MIN_TOKENS, DEFAULT_OVERLAP_TOKENS, MAX_CHUNK_TOKENS, MAX_OVERLAP_TOKENS, } from "./types.js";
@@ -0,0 +1,3 @@
1
/**
 * Estimates the token count of `text`.
 *
 * @param text - Text to measure.
 * @returns The estimated number of tokens.
 */
export declare function defaultTokenEstimator(text: string): number;
/**
 * Translates a token budget into a character budget.
 *
 * @param tokenBudget - Budget expressed in tokens.
 * @returns The corresponding budget expressed in characters.
 */
export declare function charsForTokenBudget(tokenBudget: number): number;
3
+ //# sourceMappingURL=token-estimator.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"token-estimator.d.ts","sourceRoot":"","sources":["../../src/chunking/token-estimator.ts"],"names":[],"mappings":"AAgBA,wBAAgB,qBAAqB,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAG1D;AAID,wBAAgB,mBAAmB,CAAC,WAAW,EAAE,MAAM,GAAG,MAAM,CAG/D"}
@@ -0,0 +1,26 @@
1
+ // Deterministic fallback token estimator (Epic #189, Issue #195).
2
+ //
3
+ // LIMITATION: this is a crude ~4-chars-per-token heuristic that mirrors what the OpenAI
4
+ // cookbook documents as a rough rule of thumb for English text using cl100k_base. It is
5
+ // NOT a real tokenizer — it over-estimates for CJK / code, under-estimates for languages
6
+ // with long words, and ignores subword boundaries entirely. The point of the seam is so a
7
+ // downstream consumer (#196 indexing orchestrator, #199 retrieval) can inject a real
8
+ // tokenizer (e.g. `js-tiktoken`) without forcing this package to ship the dependency.
9
+ //
10
+ // Why not zero or one-char-per-token? Zero would let `maxTokens` produce empty chunks;
11
+ // one-char-per-token would force absurdly small chunks (every page splits into 400-char
12
+ // fragments). Four matches the order-of-magnitude expectation that callers seeing a
13
+ // `maxTokens: 400` chunk get a ~1.5 KB excerpt rather than a 400-byte one.
14
// Characters assumed per token by the fallback heuristic.
const CHARS_PER_TOKEN = 4;
// Estimates tokens as ceil(length / CHARS_PER_TOKEN), where length is the string's
// `.length`. The empty string is zero tokens.
export function defaultTokenEstimator(text) {
    const charCount = text.length;
    return charCount === 0 ? 0 : Math.ceil(charCount / CHARS_PER_TOKEN);
}
// Inverse helper used by the chunker to translate a token budget into a character budget.
// Budgets of zero or less map to zero characters. Kept next to the estimator so a future
// tokenizer swap can override both consistently.
export function charsForTokenBudget(tokenBudget) {
    return tokenBudget <= 0 ? 0 : tokenBudget * CHARS_PER_TOKEN;
}
@@ -0,0 +1,49 @@
1
+ import type { ChunkId, DocumentId, KnowledgeCapsuleId, KnowledgeSourceId } from "@oscharko-dev/keiko-contracts";
2
+ import { KnowledgeStoreError } from "../errors.js";
3
/** Function that maps a piece of text to a token count. */
export type TokenEstimator = (text: string) => number;
4
/**
 * Caller-supplied chunking settings. Every field is optional; see
 * {@link ResolvedChunkingOptions} for the fully populated counterpart.
 */
export interface ChunkingOptions {
    /** Upper bound on the token count of a chunk. */
    readonly maxTokens?: number;
    /** Lower bound (floor) on chunk size, in tokens. */
    readonly minTokens?: number;
    /** Tokens shared between consecutive chunks. */
    readonly overlapTokens?: number;
    /** Ceiling on the number of chunks that may be produced. */
    readonly maxChunks?: number;
    /** Custom token estimator. */
    readonly tokenEstimator?: TokenEstimator;
}
11
/** Default value for `maxTokens`. */
export declare const DEFAULT_MAX_TOKENS = 400;
/** Default value for `minTokens`. */
export declare const DEFAULT_MIN_TOKENS = 64;
/** Default value for `overlapTokens`. */
export declare const DEFAULT_OVERLAP_TOKENS = 32;
/** Default value for `maxChunks`. */
export declare const DEFAULT_MAX_CHUNKS = 50000;
/** NOTE(review): presumably the largest accepted `maxTokens` — confirm in the resolver. */
export declare const MAX_CHUNK_TOKENS = 2048;
/** NOTE(review): presumably the largest accepted `overlapTokens` — confirm in the resolver. */
export declare const MAX_OVERLAP_TOKENS = 1024;
/** Version tag of the chunking strategy. */
export declare const CHUNKING_STRATEGY_VERSION: "issue-195-v2";
/**
 * Strategy key for the default options: the version tag followed by the max, min,
 * overlap and limit settings and the `estimator=default` marker.
 */
export declare const DEFAULT_CHUNKING_STRATEGY_KEY: `issue-195-v2|max=${string}|min=${string}|overlap=${string}|limit=${string}|estimator=default`;
19
/** Chunking settings with every field populated. */
export interface ResolvedChunkingOptions {
    /** Upper bound on the token count of a chunk. */
    readonly maxTokens: number;
    /** Lower bound (floor) on chunk size, in tokens. */
    readonly minTokens: number;
    /** Tokens shared between consecutive chunks. */
    readonly overlapTokens: number;
    /** Ceiling on the number of chunks that may be produced. */
    readonly maxChunks: number;
    /** Estimator used to count tokens. */
    readonly tokenEstimator: TokenEstimator;
}
26
/** Input for chunking a single document. */
export interface ChunkDocumentParams {
    /** Capsule the document belongs to. */
    readonly capsuleId: KnowledgeCapsuleId;
    /** Source the document belongs to. */
    readonly sourceId: KnowledgeSourceId;
    /** Document to chunk. */
    readonly documentId: DocumentId;
    /** Text of the document. */
    readonly sourceText: string;
    /** NOTE(review): presumably re-chunks even when chunks already exist — confirm in chunker-runner. */
    readonly force?: boolean;
    /** Optional abort signal. */
    readonly signal?: AbortSignal;
}
34
/** Outcome of chunking a single document. */
export interface ChunkDocumentResult {
    /** Capsule the document belongs to. */
    readonly capsuleId: KnowledgeCapsuleId;
    /** Document that was processed. */
    readonly documentId: DocumentId;
    /** Ids of the document's chunks. */
    readonly chunkIds: readonly ChunkId[];
    /** NOTE(review): presumably true when existing chunks were reused — confirm in chunker-runner. */
    readonly skippedExisting: boolean;
}
40
/**
 * Error type of the chunking layer; a specialisation of `KnowledgeStoreError`.
 * Raised, for example, when chunking a unit would exceed the `maxChunks` limit.
 */
export declare class ChunkingError extends KnowledgeStoreError {
    /** Error name. */
    readonly name: string;
}
43
/** Description of one chunk as a span of the source text. */
export interface ChunkingResult {
    /** Start offset of the chunk in the source text. */
    readonly characterStart: number;
    /** End offset of the chunk in the source text. */
    readonly characterEnd: number;
    /** Token count of the chunk. */
    readonly tokenCount: number;
    /** NOTE(review): presumably a hash of the chunk's excerpt — confirm how it is derived. */
    readonly safeExcerptHash: string;
}
49
+ //# sourceMappingURL=types.d.ts.map