@oscharko-dev/keiko-local-knowledge 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (290) hide show
  1. package/dist/.tsbuildinfo +1 -0
  2. package/dist/bounded-document-extraction.d.ts +27 -0
  3. package/dist/bounded-document-extraction.d.ts.map +1 -0
  4. package/dist/bounded-document-extraction.js +214 -0
  5. package/dist/capsule-lifecycle.d.ts +33 -0
  6. package/dist/capsule-lifecycle.d.ts.map +1 -0
  7. package/dist/capsule-lifecycle.js +292 -0
  8. package/dist/capsule-set-lifecycle.d.ts +15 -0
  9. package/dist/capsule-set-lifecycle.d.ts.map +1 -0
  10. package/dist/capsule-set-lifecycle.js +158 -0
  11. package/dist/chunking/chunker-persist.d.ts +36 -0
  12. package/dist/chunking/chunker-persist.d.ts.map +1 -0
  13. package/dist/chunking/chunker-persist.js +74 -0
  14. package/dist/chunking/chunker-runner.d.ts +9 -0
  15. package/dist/chunking/chunker-runner.d.ts.map +1 -0
  16. package/dist/chunking/chunker-runner.js +218 -0
  17. package/dist/chunking/chunker.d.ts +7 -0
  18. package/dist/chunking/chunker.d.ts.map +1 -0
  19. package/dist/chunking/chunker.js +139 -0
  20. package/dist/chunking/citation-mapper.d.ts +4 -0
  21. package/dist/chunking/citation-mapper.d.ts.map +1 -0
  22. package/dist/chunking/citation-mapper.js +180 -0
  23. package/dist/chunking/index.d.ts +6 -0
  24. package/dist/chunking/index.d.ts.map +1 -0
  25. package/dist/chunking/index.js +8 -0
  26. package/dist/chunking/token-estimator.d.ts +3 -0
  27. package/dist/chunking/token-estimator.d.ts.map +1 -0
  28. package/dist/chunking/token-estimator.js +26 -0
  29. package/dist/chunking/types.d.ts +49 -0
  30. package/dist/chunking/types.d.ts.map +1 -0
  31. package/dist/chunking/types.js +26 -0
  32. package/dist/composition.d.ts +57 -0
  33. package/dist/composition.d.ts.map +1 -0
  34. package/dist/composition.js +310 -0
  35. package/dist/conversation/citation-attacher.d.ts +8 -0
  36. package/dist/conversation/citation-attacher.d.ts.map +1 -0
  37. package/dist/conversation/citation-attacher.js +55 -0
  38. package/dist/conversation/citation-excerpts.d.ts +4 -0
  39. package/dist/conversation/citation-excerpts.d.ts.map +1 -0
  40. package/dist/conversation/citation-excerpts.js +41 -0
  41. package/dist/conversation/grounded-answer-runner.d.ts +9 -0
  42. package/dist/conversation/grounded-answer-runner.d.ts.map +1 -0
  43. package/dist/conversation/grounded-answer-runner.js +61 -0
  44. package/dist/conversation/index.d.ts +5 -0
  45. package/dist/conversation/index.d.ts.map +1 -0
  46. package/dist/conversation/index.js +7 -0
  47. package/dist/conversation/model-gateway-answer-generator.d.ts +28 -0
  48. package/dist/conversation/model-gateway-answer-generator.d.ts.map +1 -0
  49. package/dist/conversation/model-gateway-answer-generator.js +105 -0
  50. package/dist/conversation/types.d.ts +35 -0
  51. package/dist/conversation/types.d.ts.map +1 -0
  52. package/dist/conversation/types.js +24 -0
  53. package/dist/discovery/discovery-runner.d.ts +23 -0
  54. package/dist/discovery/discovery-runner.d.ts.map +1 -0
  55. package/dist/discovery/discovery-runner.js +109 -0
  56. package/dist/discovery/extract-progressive.d.ts +17 -0
  57. package/dist/discovery/extract-progressive.d.ts.map +1 -0
  58. package/dist/discovery/extract-progressive.js +522 -0
  59. package/dist/discovery/extract.d.ts +26 -0
  60. package/dist/discovery/extract.d.ts.map +1 -0
  61. package/dist/discovery/extract.js +906 -0
  62. package/dist/discovery/glob.d.ts +10 -0
  63. package/dist/discovery/glob.d.ts.map +1 -0
  64. package/dist/discovery/glob.js +72 -0
  65. package/dist/discovery/index.d.ts +6 -0
  66. package/dist/discovery/index.d.ts.map +1 -0
  67. package/dist/discovery/index.js +8 -0
  68. package/dist/discovery/media-type.d.ts +4 -0
  69. package/dist/discovery/media-type.d.ts.map +1 -0
  70. package/dist/discovery/media-type.js +62 -0
  71. package/dist/discovery/persist.d.ts +63 -0
  72. package/dist/discovery/persist.d.ts.map +1 -0
  73. package/dist/discovery/persist.js +345 -0
  74. package/dist/discovery/test-support.d.ts +16 -0
  75. package/dist/discovery/test-support.d.ts.map +1 -0
  76. package/dist/discovery/test-support.js +127 -0
  77. package/dist/discovery/types.d.ts +63 -0
  78. package/dist/discovery/types.d.ts.map +1 -0
  79. package/dist/discovery/types.js +28 -0
  80. package/dist/discovery/walk.d.ts +12 -0
  81. package/dist/discovery/walk.d.ts.map +1 -0
  82. package/dist/discovery/walk.js +302 -0
  83. package/dist/errors.d.ts +13 -0
  84. package/dist/errors.d.ts.map +1 -0
  85. package/dist/errors.js +22 -0
  86. package/dist/evaluations/dimensions.d.ts +14 -0
  87. package/dist/evaluations/dimensions.d.ts.map +1 -0
  88. package/dist/evaluations/dimensions.js +191 -0
  89. package/dist/evaluations/fixtures.d.ts +18 -0
  90. package/dist/evaluations/fixtures.d.ts.map +1 -0
  91. package/dist/evaluations/fixtures.js +858 -0
  92. package/dist/evaluations/index.d.ts +7 -0
  93. package/dist/evaluations/index.d.ts.map +1 -0
  94. package/dist/evaluations/index.js +10 -0
  95. package/dist/evaluations/report.d.ts +3 -0
  96. package/dist/evaluations/report.d.ts.map +1 -0
  97. package/dist/evaluations/report.js +31 -0
  98. package/dist/evaluations/runner-seed.d.ts +12 -0
  99. package/dist/evaluations/runner-seed.d.ts.map +1 -0
  100. package/dist/evaluations/runner-seed.js +175 -0
  101. package/dist/evaluations/runner.d.ts +8 -0
  102. package/dist/evaluations/runner.d.ts.map +1 -0
  103. package/dist/evaluations/runner.js +205 -0
  104. package/dist/evaluations/scripted-embedding-adapter.d.ts +13 -0
  105. package/dist/evaluations/scripted-embedding-adapter.d.ts.map +1 -0
  106. package/dist/evaluations/scripted-embedding-adapter.js +163 -0
  107. package/dist/evaluations/types.d.ts +116 -0
  108. package/dist/evaluations/types.d.ts.map +1 -0
  109. package/dist/evaluations/types.js +27 -0
  110. package/dist/index.d.ts +23 -0
  111. package/dist/index.d.ts.map +1 -0
  112. package/dist/index.js +41 -0
  113. package/dist/indexing/bounded-indexing.d.ts +41 -0
  114. package/dist/indexing/bounded-indexing.d.ts.map +1 -0
  115. package/dist/indexing/bounded-indexing.js +240 -0
  116. package/dist/indexing/checkpoint-persist.d.ts +8 -0
  117. package/dist/indexing/checkpoint-persist.d.ts.map +1 -0
  118. package/dist/indexing/checkpoint-persist.js +135 -0
  119. package/dist/indexing/checkpoint-resume.d.ts +20 -0
  120. package/dist/indexing/checkpoint-resume.d.ts.map +1 -0
  121. package/dist/indexing/checkpoint-resume.js +50 -0
  122. package/dist/indexing/embedding-batcher.d.ts +3 -0
  123. package/dist/indexing/embedding-batcher.d.ts.map +1 -0
  124. package/dist/indexing/embedding-batcher.js +390 -0
  125. package/dist/indexing/index.d.ts +7 -0
  126. package/dist/indexing/index.d.ts.map +1 -0
  127. package/dist/indexing/index.js +11 -0
  128. package/dist/indexing/job-persist.d.ts +46 -0
  129. package/dist/indexing/job-persist.d.ts.map +1 -0
  130. package/dist/indexing/job-persist.js +157 -0
  131. package/dist/indexing/job-resume.d.ts +4 -0
  132. package/dist/indexing/job-resume.d.ts.map +1 -0
  133. package/dist/indexing/job-resume.js +14 -0
  134. package/dist/indexing/orchestrator.d.ts +3 -0
  135. package/dist/indexing/orchestrator.d.ts.map +1 -0
  136. package/dist/indexing/orchestrator.js +1151 -0
  137. package/dist/indexing/types.d.ts +156 -0
  138. package/dist/indexing/types.d.ts.map +1 -0
  139. package/dist/indexing/types.js +30 -0
  140. package/dist/indexing/vector-persist.d.ts +32 -0
  141. package/dist/indexing/vector-persist.d.ts.map +1 -0
  142. package/dist/indexing/vector-persist.js +105 -0
  143. package/dist/parsers/_internal.d.ts +20 -0
  144. package/dist/parsers/_internal.d.ts.map +1 -0
  145. package/dist/parsers/_internal.js +122 -0
  146. package/dist/parsers/csv-parser.d.ts +3 -0
  147. package/dist/parsers/csv-parser.d.ts.map +1 -0
  148. package/dist/parsers/csv-parser.js +202 -0
  149. package/dist/parsers/docx-parser.d.ts +3 -0
  150. package/dist/parsers/docx-parser.d.ts.map +1 -0
  151. package/dist/parsers/docx-parser.js +390 -0
  152. package/dist/parsers/html-parser.d.ts +3 -0
  153. package/dist/parsers/html-parser.d.ts.map +1 -0
  154. package/dist/parsers/html-parser.js +310 -0
  155. package/dist/parsers/index.d.ts +15 -0
  156. package/dist/parsers/index.d.ts.map +1 -0
  157. package/dist/parsers/index.js +41 -0
  158. package/dist/parsers/json-parser.d.ts +3 -0
  159. package/dist/parsers/json-parser.d.ts.map +1 -0
  160. package/dist/parsers/json-parser.js +192 -0
  161. package/dist/parsers/large-document/capability-discovery.d.ts +27 -0
  162. package/dist/parsers/large-document/capability-discovery.d.ts.map +1 -0
  163. package/dist/parsers/large-document/capability-discovery.js +76 -0
  164. package/dist/parsers/large-document/diagnostics.d.ts +3 -0
  165. package/dist/parsers/large-document/diagnostics.d.ts.map +1 -0
  166. package/dist/parsers/large-document/diagnostics.js +11 -0
  167. package/dist/parsers/large-document/index.d.ts +15 -0
  168. package/dist/parsers/large-document/index.d.ts.map +1 -0
  169. package/dist/parsers/large-document/index.js +10 -0
  170. package/dist/parsers/large-document/legacy-format.d.ts +5 -0
  171. package/dist/parsers/large-document/legacy-format.d.ts.map +1 -0
  172. package/dist/parsers/large-document/legacy-format.js +25 -0
  173. package/dist/parsers/large-document/preflight.d.ts +9 -0
  174. package/dist/parsers/large-document/preflight.d.ts.map +1 -0
  175. package/dist/parsers/large-document/preflight.js +43 -0
  176. package/dist/parsers/large-document/progressive-extraction.d.ts +55 -0
  177. package/dist/parsers/large-document/progressive-extraction.d.ts.map +1 -0
  178. package/dist/parsers/large-document/progressive-extraction.js +123 -0
  179. package/dist/parsers/large-document/progressive-pdf.d.ts +20 -0
  180. package/dist/parsers/large-document/progressive-pdf.d.ts.map +1 -0
  181. package/dist/parsers/large-document/progressive-pdf.js +145 -0
  182. package/dist/parsers/large-document/synthetic-source.d.ts +9 -0
  183. package/dist/parsers/large-document/synthetic-source.d.ts.map +1 -0
  184. package/dist/parsers/large-document/synthetic-source.js +101 -0
  185. package/dist/parsers/large-document/window-builder.d.ts +24 -0
  186. package/dist/parsers/large-document/window-builder.d.ts.map +1 -0
  187. package/dist/parsers/large-document/window-builder.js +75 -0
  188. package/dist/parsers/ocr/index.d.ts +4 -0
  189. package/dist/parsers/ocr/index.d.ts.map +1 -0
  190. package/dist/parsers/ocr/index.js +4 -0
  191. package/dist/parsers/ocr/null-ocr-adapter.d.ts +3 -0
  192. package/dist/parsers/ocr/null-ocr-adapter.d.ts.map +1 -0
  193. package/dist/parsers/ocr/null-ocr-adapter.js +14 -0
  194. package/dist/parsers/ocr/ocr-pipeline-parser.d.ts +8 -0
  195. package/dist/parsers/ocr/ocr-pipeline-parser.d.ts.map +1 -0
  196. package/dist/parsers/ocr/ocr-pipeline-parser.js +147 -0
  197. package/dist/parsers/ocr/types.d.ts +16 -0
  198. package/dist/parsers/ocr/types.d.ts.map +1 -0
  199. package/dist/parsers/ocr/types.js +4 -0
  200. package/dist/parsers/parser-test-fixtures.d.ts +28 -0
  201. package/dist/parsers/parser-test-fixtures.d.ts.map +1 -0
  202. package/dist/parsers/parser-test-fixtures.js +139 -0
  203. package/dist/parsers/pdf-parser.d.ts +43 -0
  204. package/dist/parsers/pdf-parser.d.ts.map +1 -0
  205. package/dist/parsers/pdf-parser.js +388 -0
  206. package/dist/parsers/registry.d.ts +8 -0
  207. package/dist/parsers/registry.d.ts.map +1 -0
  208. package/dist/parsers/registry.js +57 -0
  209. package/dist/parsers/text-parser.d.ts +3 -0
  210. package/dist/parsers/text-parser.d.ts.map +1 -0
  211. package/dist/parsers/text-parser.js +214 -0
  212. package/dist/parsers/types.d.ts +53 -0
  213. package/dist/parsers/types.d.ts.map +1 -0
  214. package/dist/parsers/types.js +21 -0
  215. package/dist/parsers/unsupported-parser.d.ts +4 -0
  216. package/dist/parsers/unsupported-parser.d.ts.map +1 -0
  217. package/dist/parsers/unsupported-parser.js +97 -0
  218. package/dist/parsers/xlsx-parser.d.ts +3 -0
  219. package/dist/parsers/xlsx-parser.d.ts.map +1 -0
  220. package/dist/parsers/xlsx-parser.js +425 -0
  221. package/dist/privacy/audit-emitter.d.ts +5 -0
  222. package/dist/privacy/audit-emitter.d.ts.map +1 -0
  223. package/dist/privacy/audit-emitter.js +93 -0
  224. package/dist/privacy/diagnostic-redactor.d.ts +2 -0
  225. package/dist/privacy/diagnostic-redactor.d.ts.map +1 -0
  226. package/dist/privacy/diagnostic-redactor.js +153 -0
  227. package/dist/privacy/index.d.ts +5 -0
  228. package/dist/privacy/index.d.ts.map +1 -0
  229. package/dist/privacy/index.js +6 -0
  230. package/dist/privacy/retention-applier.d.ts +5 -0
  231. package/dist/privacy/retention-applier.d.ts.map +1 -0
  232. package/dist/privacy/retention-applier.js +88 -0
  233. package/dist/privacy/types.d.ts +98 -0
  234. package/dist/privacy/types.d.ts.map +1 -0
  235. package/dist/privacy/types.js +12 -0
  236. package/dist/qualityIntelligence/capsuleCorpus.d.ts +27 -0
  237. package/dist/qualityIntelligence/capsuleCorpus.d.ts.map +1 -0
  238. package/dist/qualityIntelligence/capsuleCorpus.js +58 -0
  239. package/dist/qualityIntelligence/index.d.ts +3 -0
  240. package/dist/qualityIntelligence/index.d.ts.map +1 -0
  241. package/dist/qualityIntelligence/index.js +5 -0
  242. package/dist/qualityIntelligence/qiHandoff.d.ts +36 -0
  243. package/dist/qualityIntelligence/qiHandoff.d.ts.map +1 -0
  244. package/dist/qualityIntelligence/qiHandoff.js +82 -0
  245. package/dist/retrieval/answer-grounding.d.ts +9 -0
  246. package/dist/retrieval/answer-grounding.d.ts.map +1 -0
  247. package/dist/retrieval/answer-grounding.js +31 -0
  248. package/dist/retrieval/context-pack-assembler.d.ts +24 -0
  249. package/dist/retrieval/context-pack-assembler.d.ts.map +1 -0
  250. package/dist/retrieval/context-pack-assembler.js +50 -0
  251. package/dist/retrieval/index.d.ts +6 -0
  252. package/dist/retrieval/index.d.ts.map +1 -0
  253. package/dist/retrieval/index.js +9 -0
  254. package/dist/retrieval/retrieval-runner.d.ts +10 -0
  255. package/dist/retrieval/retrieval-runner.d.ts.map +1 -0
  256. package/dist/retrieval/retrieval-runner.js +163 -0
  257. package/dist/retrieval/scoped-vector-search.d.ts +24 -0
  258. package/dist/retrieval/scoped-vector-search.d.ts.map +1 -0
  259. package/dist/retrieval/scoped-vector-search.js +864 -0
  260. package/dist/retrieval/types.d.ts +28 -0
  261. package/dist/retrieval/types.d.ts.map +1 -0
  262. package/dist/retrieval/types.js +33 -0
  263. package/dist/section-path-hash.d.ts +3 -0
  264. package/dist/section-path-hash.d.ts.map +1 -0
  265. package/dist/section-path-hash.js +9 -0
  266. package/dist/source-lifecycle.d.ts +14 -0
  267. package/dist/source-lifecycle.d.ts.map +1 -0
  268. package/dist/source-lifecycle.js +155 -0
  269. package/dist/source-routing-validation.d.ts +11 -0
  270. package/dist/source-routing-validation.d.ts.map +1 -0
  271. package/dist/source-routing-validation.js +140 -0
  272. package/dist/store-content-cipher.d.ts +11 -0
  273. package/dist/store-content-cipher.d.ts.map +1 -0
  274. package/dist/store-content-cipher.js +67 -0
  275. package/dist/store-content-encryption.d.ts +12 -0
  276. package/dist/store-content-encryption.d.ts.map +1 -0
  277. package/dist/store-content-encryption.js +275 -0
  278. package/dist/store-paths.d.ts +6 -0
  279. package/dist/store-paths.d.ts.map +1 -0
  280. package/dist/store-paths.js +61 -0
  281. package/dist/store.d.ts +30 -0
  282. package/dist/store.d.ts.map +1 -0
  283. package/dist/store.js +219 -0
  284. package/dist/testing.d.ts +47 -0
  285. package/dist/testing.d.ts.map +1 -0
  286. package/dist/testing.js +170 -0
  287. package/dist/version.d.ts +2 -0
  288. package/dist/version.d.ts.map +1 -0
  289. package/dist/version.js +4 -0
  290. package/package.json +43 -0
@@ -0,0 +1,202 @@
1
+ // CSV / TSV parser adapter (Epic #189, Issue #266). Pure, hand-rolled RFC 4180 tokenizer.
2
+ //
3
+ // No `csv-parse` or other dependency — the rules are small enough that a single state
4
+ // machine over the decoded string handles every case we need:
5
+ //
6
+ // * Quoted fields preserve embedded delimiters (`,` or `\t`) and embedded newlines.
7
+ // * `""` inside a quoted field decodes to a single literal `"`.
8
+ // * CRLF, LF, and bare CR row terminators all work.
9
+ // * A trailing newline does NOT emit a synthetic empty row.
10
+ // * A row with only whitespace + empty fields is preserved verbatim (we do not lose data).
11
+ //
12
+ // Emits one ParsedUnit { kind: "csv-row" } per non-header row. The first row is treated as
13
+ // the header and is NOT emitted as a unit — its values are kept only for the implicit table
14
+ // schema. If the document has no header row (a single line), THAT line is emitted as the
15
+ // header AND a single data row, so a one-line CSV stays observable.
16
+ import { decodeUtf8, diagnostic, emptyResult, oversizeDiagnostic, shouldStop, } from "./_internal.js";
17
+ const PARSER_ID = "csv";
18
+ const PARSER_VERSION = "1";
19
+ const CSV_EXTENSIONS = new Set(["csv"]);
20
+ const TSV_EXTENSIONS = new Set(["tsv", "tab"]);
21
/**
 * Chooses the field delimiter for a parser input.
 *
 * The file extension is checked first; the media type is consulted only when the
 * extension is not one of the known CSV / TSV extensions.
 *
 * @param input Parser input; its `extension` and `mediaType` strings are read.
 * @returns `","` for CSV, `"\t"` for TSV, or `null` when neither hint matches.
 */
function selectDelimiter(input) {
    const extension = input.extension.toLowerCase();
    if (CSV_EXTENSIONS.has(extension)) {
        return ",";
    }
    if (TSV_EXTENSIONS.has(extension)) {
        return "\t";
    }
    switch (input.mediaType.toLowerCase()) {
        case "text/csv":
            return ",";
        case "text/tab-separated-values":
            return "\t";
        default:
            return null;
    }
}
34
/**
 * Reads one field starting at `state.cursor`, dispatching on whether it opens with a
 * double quote.
 *
 * @param state Tokenizer state (`text`, `cursor`, `delimiter`); `cursor` is advanced by
 *   the field reader that handles the field.
 * @returns `{ endOfRow }` — true when the field was the last one of its row (or the
 *   cursor was already at the end of the text).
 */
function readField(state) {
    if (state.cursor >= state.text.length) {
        return { endOfRow: true };
    }
    const opensWithQuote = state.text.charCodeAt(state.cursor) === 0x22; /* " */
    return opensWithQuote ? readQuotedField(state) : readBareField(state);
}
43
/**
 * Scans an unquoted field.
 *
 * The cursor stops just past a delimiter (more fields follow), or ON a CR / LF so the
 * caller can consume the row terminator, or at the end of the text.
 *
 * @param state Tokenizer state; `delimiter` is a UTF-16 code unit, `cursor` is mutated.
 * @returns `{ endOfRow: false }` after a delimiter, `{ endOfRow: true }` otherwise.
 */
function readBareField(state) {
    const { text, delimiter } = state;
    for (; state.cursor < text.length; state.cursor += 1) {
        const unit = text.charCodeAt(state.cursor);
        if (unit === delimiter) {
            // Step over the delimiter so the next field starts cleanly.
            state.cursor += 1;
            return { endOfRow: false };
        }
        if (unit === 0x0a /* LF */ || unit === 0x0d /* CR */) {
            // Leave the terminator in place; the row reader consumes it.
            return { endOfRow: true };
        }
    }
    return { endOfRow: true };
}
62
/**
 * Scans a quoted field whose opening quote sits at `state.cursor`.
 *
 * A doubled quote (`""`) inside the field is skipped as an escaped literal quote. On the
 * closing quote the remainder of the field is handed to `consumeAfterQuote`. If the text
 * ends before a closing quote, the rest of the document is treated as this field.
 *
 * @param state Tokenizer state; `cursor` is mutated.
 * @returns `{ endOfRow }` as reported by `consumeAfterQuote`, or `{ endOfRow: true }`
 *   for an unterminated field.
 */
function readQuotedField(state) {
    const QUOTE = 0x22;
    const source = state.text;
    // Step over the opening quote.
    state.cursor += 1;
    while (state.cursor < source.length) {
        if (source.charCodeAt(state.cursor) !== QUOTE) {
            state.cursor += 1;
            continue;
        }
        // charCodeAt past the end yields NaN, so no explicit bounds check is needed here.
        const escapedQuote = source.charCodeAt(state.cursor + 1) === QUOTE;
        if (escapedQuote) {
            state.cursor += 2;
            continue;
        }
        // Closing quote: consume it, then expect delimiter / row terminator / EOF.
        state.cursor += 1;
        return consumeAfterQuote(state);
    }
    return { endOfRow: true };
}
83
/**
 * Positions the cursor after a quoted field's closing quote.
 *
 * Well-formed input has a delimiter, a row terminator, or EOF right here. Malformed
 * trailing characters are tolerated: they are skipped until the next delimiter or
 * newline instead of failing the parse.
 *
 * @param state Tokenizer state; `cursor` is mutated.
 * @returns `{ endOfRow: false }` when a delimiter was consumed, `{ endOfRow: true }` when
 *   the cursor rests on CR / LF or the text is exhausted.
 */
function consumeAfterQuote(state) {
    const { text, delimiter } = state;
    for (; state.cursor < text.length; state.cursor += 1) {
        const unit = text.charCodeAt(state.cursor);
        if (unit === delimiter) {
            state.cursor += 1;
            return { endOfRow: false };
        }
        if (unit === 0x0a || unit === 0x0d) {
            return { endOfRow: true };
        }
        // Anything else is stray content after the closing quote; keep skipping.
    }
    return { endOfRow: true };
}
107
/**
 * Advances the cursor past one row terminator, if one is present at `state.cursor`.
 *
 * CRLF counts as a single terminator; a bare CR or a bare LF is also accepted. Any other
 * character (or end of text) leaves the cursor untouched.
 *
 * @param state Tokenizer state; `cursor` is mutated.
 */
function consumeRowTerminator(state) {
    const { text } = state;
    if (state.cursor >= text.length) {
        return;
    }
    const first = text.charCodeAt(state.cursor);
    if (first === 0x0a) {
        state.cursor += 1;
        return;
    }
    if (first !== 0x0d) {
        return;
    }
    // CR: swallow a directly following LF as part of the same terminator.
    const followedByLf = text.charCodeAt(state.cursor + 1) === 0x0a;
    state.cursor += followedByLf ? 2 : 1;
}
121
/**
 * Reads one row: every field up to the row terminator, then the terminator itself.
 *
 * @param state Tokenizer state; `cursor` is advanced and `rowStart` is set to the row's
 *   first offset.
 * @returns `{ start, end, fieldCount, done }`. `start` / `end` bound the row's content
 *   (terminator excluded). `done` is true only when the cursor was already at the end of
 *   the text, in which case nothing is read and `fieldCount` is 0.
 */
function readRow(state) {
    const start = state.cursor;
    if (start >= state.text.length) {
        return { start, end: start, fieldCount: 0, done: true };
    }
    state.rowStart = start;
    let fieldCount = 0;
    let rowFinished = false;
    do {
        rowFinished = readField(state).endOfRow;
        fieldCount += 1;
    } while (!rowFinished);
    // Capture the content end before the terminator is swallowed.
    const end = state.cursor;
    consumeRowTerminator(state);
    return { start, end, fieldCount, done: false };
}
138
/**
 * Tokenizes decoded CSV / TSV text and emits one `csv-row` unit per data row.
 *
 * The first row is treated as the header and is not emitted, unless it is the only row
 * in the document, in which case it is emitted as data row 0 so a one-line file remains
 * observable. Data rows are numbered from 0 in document order.
 *
 * @param text Decoded document text.
 * @param delimiter Single-character delimiter string (`","` or `"\t"`).
 * @param input Parser input; `documentId` is copied into units and diagnostics.
 * @param options Parser options; `now()` supplies the start timestamp and the whole
 *   object is forwarded to `shouldStop` before each data row.
 * @returns `{ units, diagnostics }`. When `shouldStop` reports a stop with a code and a
 *   message, an "info" diagnostic is recorded and the remaining rows are not read.
 */
function emitRows(text, delimiter, input, options) {
    // Name the table after the delimiter actually in use. Keying on the "tsv" extension
    // alone mislabelled tab-separated input that was selected through the "tab"
    // extension or the text/tab-separated-values media type as "csv".
    const tableName = delimiter === "\t" ? "tsv" : "csv";
    const state = { text, delimiter: delimiter.charCodeAt(0), cursor: 0, rowStart: 0 };
    const startedAt = options.now();
    const units = [];
    const diagnostics = [];
    // Read the header row first. If there are no further rows, emit the header itself as a
    // single data row at index 0 so a one-line CSV remains observable.
    const header = readRow(state);
    if (header.done) {
        return { units, diagnostics };
    }
    if (state.cursor >= text.length) {
        units.push(csvUnit(input, tableName, 0, header.start, header.end));
        return { units, diagnostics };
    }
    let rowIndex = 0;
    while (state.cursor < text.length) {
        const limit = shouldStop(startedAt, options, units.length);
        if (limit.stop && limit.code !== undefined && limit.message !== undefined) {
            diagnostics.push(diagnostic(limit.code, limit.message, input.documentId, "info"));
            break;
        }
        const row = readRow(state);
        if (row.done) {
            break;
        }
        units.push(csvUnit(input, tableName, rowIndex, row.start, row.end));
        rowIndex += 1;
    }
    return { units, diagnostics };
}
168
/**
 * Builds the `csv-row` unit describing one row.
 *
 * @param input Parser input; only `documentId` is read.
 * @param tableName Table label stored on the unit.
 * @param rowIndex Index of the row among emitted rows.
 * @param start Offset of the row's first character in the decoded text.
 * @param end Offset just past the row's last character (terminator excluded).
 * @returns The unit object.
 */
function csvUnit(input, tableName, rowIndex, start, end) {
    const { documentId } = input;
    return {
        kind: "csv-row",
        documentId,
        tableName,
        rowIndex,
        characterStart: start,
        characterEnd: end,
    };
}
178
/**
 * CSV / TSV parser adapter.
 *
 * `capability.matches` accepts any input for which `selectDelimiter` finds a delimiter.
 * `parse` never throws for oversize or unroutable input; it returns a result carrying a
 * diagnostic instead.
 */
export const csvParser = Object.freeze({
    capability: Object.freeze({
        parserId: PARSER_ID,
        parserVersion: PARSER_VERSION,
        matches: (input) => selectDelimiter(input) !== null,
    }),
    parse: (input, options) => {
        const { documentId } = input;
        const byteLength = input.bytes.byteLength;
        // Refuse oversize payloads before decoding anything.
        if (byteLength > options.maxBytes) {
            const tooLarge = oversizeDiagnostic(documentId, byteLength, options.maxBytes);
            return emptyResult(csvParser.capability, documentId, options, [tooLarge]);
        }
        const delimiter = selectDelimiter(input);
        if (delimiter === null) {
            // Defensive: registry never routes here without a delimiter, but we honour the
            // contract by returning a typed diagnostic rather than throwing.
            const unsupported = diagnostic("UNSUPPORTED_FORMAT", "no delimiter selected", documentId, "error");
            return emptyResult(csvParser.capability, documentId, options, [unsupported]);
        }
        const { text } = decodeUtf8(input.bytes);
        const { units, diagnostics } = emitRows(text, delimiter, input, options);
        return emptyResult(csvParser.capability, documentId, options, diagnostics, units);
    },
});
@@ -0,0 +1,3 @@
1
+ import type { AsyncParserAdapter } from "./types.js";
2
+ export declare const docxParser: AsyncParserAdapter;
3
+ //# sourceMappingURL=docx-parser.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"docx-parser.d.ts","sourceRoot":"","sources":["../../src/parsers/docx-parser.ts"],"names":[],"mappings":"AAmBA,OAAO,KAAK,EACV,kBAAkB,EAMnB,MAAM,YAAY,CAAC;AAgoBpB,eAAO,MAAM,UAAU,EAAE,kBAKvB,CAAC"}
@@ -0,0 +1,390 @@
1
+ import { Buffer } from "node:buffer";
2
+ import yauzl from "yauzl";
3
+ import { decodeXmlEntities as decodeXmlEntitiesShared, diagnostic, emptyResult, objectLimitDiagnostic, oversizeDiagnostic, parserIdentity, shouldStop, } from "./_internal.js";
4
+ const PARSER_ID = "docx";
5
+ const PARSER_VERSION = "1";
6
+ const DEPENDENCY_VERSIONS = Object.freeze([
7
+ Object.freeze({ packageName: "yauzl", version: "3.4.0" }),
8
+ ]);
9
+ const DOCX_MEDIA = "application/vnd.openxmlformats-officedocument.wordprocessingml.document";
10
+ const DOCUMENT_XML_ENTRY = "word/document.xml";
11
+ const MAX_DOCUMENT_XML_INFLATED_BYTES = 16 * 1024 * 1024;
12
+ const MAX_DOCUMENT_XML_INFLATE_RATIO = 100;
13
+ const HEADING_STYLE_PATTERN = /<w:pStyle\b[^>]*w:val="Heading([1-6])"/i;
14
+ const PARAGRAPH_PATTERN = /<w:p\b[\s\S]*?<\/w:p>/gi;
15
+ const TEXT_RUN_PATTERN = /<w:t\b[^>]*>([\s\S]*?)<\/w:t>/gi;
16
/**
 * Tells whether an input should be handled as DOCX.
 *
 * @param input Parser input; `extension` and `mediaType` are compared case-insensitively.
 * @returns true when the extension is `docx` or the media type equals `DOCX_MEDIA`.
 */
function isDocx(input) {
    if (input.extension.toLowerCase() === "docx") {
        return true;
    }
    return input.mediaType.toLowerCase() === DOCX_MEDIA;
}
19
/**
 * Builds the result returned when the caller aborted the parse.
 *
 * @param capability Parser capability forwarded to `emptyResult`.
 * @param input Parser input; `documentId` is read.
 * @param options Parser options forwarded to `emptyResult`.
 * @returns A result carrying a single "info" `PARSER_CANCELLED` diagnostic.
 */
function cancelled(capability, input, options) {
    const { documentId } = input;
    const notice = diagnostic("PARSER_CANCELLED", "caller aborted parser", documentId, "info");
    return emptyResult(capability, documentId, options, [notice]);
}
24
/**
 * Creates the synchronous `parse` stand-in for the DOCX adapter.
 *
 * The returned function does no parsing: it reports, via an "info" diagnostic and an
 * `unsupported-media` unit, that DOCX needs the asynchronous entry point.
 *
 * @param capability Parser capability forwarded to `emptyResult`.
 * @returns A `(input, options)` function producing that placeholder result.
 */
function syncFallback(capability) {
    return function parseSyncPlaceholder(input, options) {
        const { documentId } = input;
        const notice = diagnostic("UNSUPPORTED_FORMAT", "docx parser requires async caller; use parseAsync via discovery", documentId, "info");
        const placeholder = unsupportedMediaUnit(documentId, "docx-async-required");
        return emptyResult(capability, documentId, options, [notice], [placeholder]);
    };
}
31
/**
 * Normalises a thrown or callback-supplied value to an `Error`.
 *
 * @param error Any value; returned unchanged when it already is an `Error`.
 * @param fallback Message for the replacement `Error` when `error` is not one.
 * @returns The original `Error`, or a new `Error(fallback)`.
 */
function toError(error, fallback) {
    if (error instanceof Error) {
        return error;
    }
    return new Error(fallback);
}
34
/**
 * Closes a zip handle as best-effort cleanup.
 *
 * @param zip Object exposing `close()`; any exception it throws is deliberately ignored
 *   because a close failure must not mask the parse outcome.
 */
function closeZipQuietly(zip) {
    try {
        zip.close();
    }
    catch (_closeError) {
        // Close failures are non-fatal during parser cleanup.
    }
}
42
/**
 * Opens an in-memory zip archive with yauzl.
 *
 * @param bytes Archive bytes; copied into a `Buffer` before being handed to yauzl.
 * @returns Promise resolving to the yauzl zip handle (opened with `lazyEntries` and
 *   `decodeStrings`), or rejecting with an `Error` when yauzl reports a failure.
 */
function openZip(bytes) {
    const zipOptions = { lazyEntries: true, decodeStrings: true };
    return new Promise((resolve, reject) => {
        const onOpened = (error, zip) => {
            if (error === null) {
                resolve(zip);
                return;
            }
            reject(toError(error, "failed to open docx zip"));
        };
        yauzl.fromBuffer(Buffer.from(bytes), zipOptions, onOpened);
    });
}
53
/**
 * Computes the inflated-size ceiling for `word/document.xml`.
 *
 * The ceiling is ten times the caller's input byte cap, never more than
 * `MAX_DOCUMENT_XML_INFLATED_BYTES`. Input caps below 1 are clamped to 1.
 *
 * @param maxInputBytes Caller's cap on the compressed input size, in bytes.
 * @returns The maximum number of inflated bytes to accept; always a real number.
 */
function maxInflatedEntryBytes(maxInputBytes) {
    const floored = Math.floor(maxInputBytes);
    // A NaN cap (NaN, undefined, or any other non-numeric value) must not leak out:
    // Math.max / Math.min propagate NaN and every later `size > NaN` check is false,
    // which would silently disable the inflation limit. Treat it like any other invalid
    // cap and clamp to the minimum.
    const inputCap = Number.isNaN(floored) ? 1 : Math.max(1, floored);
    return Math.min(MAX_DOCUMENT_XML_INFLATED_BYTES, inputCap * 10);
}
57
+ function assertEntryWithinLimits(entry, limits) {
58
+ if (entry.uncompressedSize > limits.maxInflatedEntryBytes) {
59
+ throw new Error(`docx document.xml inflated size ${String(entry.uncompressedSize)} exceeds limit ${String(limits.maxInflatedEntryBytes)}`);
60
+ }
61
+ if (entry.compressedSize > 0 &&
62
+ entry.uncompressedSize / entry.compressedSize > limits.maxInflateRatio) {
63
+ throw new Error("docx document.xml compression ratio exceeds limit");
64
+ }
65
+ }
66
+ function destroyStream(readStream, error) {
67
+ const destroy = readStream.destroy;
68
+ if (typeof destroy === "function") {
69
+ destroy.call(readStream, error);
70
+ }
71
+ }
72
/**
 * Inflates one zip entry and returns its content decoded as UTF-8.
 *
 * The entry's declared sizes are validated up front (synchronously, before the promise
 * is created). While streaming, the running byte count is enforced as well, so an entry
 * that inflates beyond its declared size is still stopped.
 *
 * @param zip Zip handle exposing `openReadStream(entry, callback)`.
 * @param entry Entry to read.
 * @param limits `{ maxInflatedEntryBytes, maxInflateRatio }`.
 * @returns Promise resolving to the entry text; it rejects when the stream cannot be
 *   opened, emits an error, or exceeds `maxInflatedEntryBytes`.
 * @throws Error synchronously when the declared sizes already violate `limits`.
 */
function readEntryText(zip, entry, limits) {
    assertEntryWithinLimits(entry, limits);
    return new Promise((resolve, reject) => {
        zip.openReadStream(entry, (openError, stream) => {
            if (openError !== null) {
                reject(toError(openError, "failed to open docx entry stream"));
                return;
            }
            const collected = [];
            let receivedBytes = 0;
            let finished = false;
            // Reject at most once, then tear the stream down so inflation stops.
            const fail = (cause) => {
                if (finished) {
                    return;
                }
                finished = true;
                reject(cause);
                destroyStream(stream, cause);
            };
            stream.on("data", (chunk) => {
                if (finished) {
                    return;
                }
                receivedBytes += chunk.byteLength;
                if (receivedBytes > limits.maxInflatedEntryBytes) {
                    fail(new Error("docx document.xml inflated stream exceeds limit"));
                    return;
                }
                collected.push(chunk);
            });
            stream.on("end", () => {
                if (finished) {
                    return;
                }
                finished = true;
                resolve(Buffer.concat(collected).toString("utf8"));
            });
            stream.on("error", (streamError) => {
                fail(streamError);
            });
        });
    });
}
116
/**
 * Walks the zip's entries until `word/document.xml` is found and returns its text.
 *
 * Entries are pulled one at a time with `zip.readEntry()`. The promise settles exactly
 * once; at that moment the three listeners registered here are detached.
 *
 * @param zip Zip handle (event emitter with `readEntry`, opened in lazy-entries mode).
 * @param limits Inflation limits forwarded to `readEntryText`.
 * @returns Promise resolving to the XML text. It rejects when the archive ends without
 *   the entry, when the zip emits an error, or when reading the entry fails.
 */
function readDocumentXmlFromZip(zip, limits) {
    return new Promise((resolve, reject) => {
        let settled = false;
        // Returns true for the first caller only, detaching the listeners as it does so.
        const claimSettlement = () => {
            if (settled) {
                return false;
            }
            settled = true;
            zip.removeListener("entry", onEntry);
            zip.removeListener("end", onEnd);
            zip.removeListener("error", onError);
            return true;
        };
        const resolveOnce = (value) => {
            if (claimSettlement()) {
                resolve(value);
            }
        };
        const rejectOnce = (error) => {
            if (claimSettlement()) {
                reject(error);
            }
        };
        const onEnd = () => {
            rejectOnce(new Error("docx missing word/document.xml"));
        };
        const onError = (error) => {
            rejectOnce(toError(error, "failed to read docx zip"));
        };
        // Kept async so a synchronous throw from readEntryText becomes a rejection here.
        const handleEntry = async (entry) => {
            const isDocumentXml = entry.fileName === DOCUMENT_XML_ENTRY;
            if (!isDocumentXml) {
                // Not the entry we want: ask for the next one.
                zip.readEntry();
                return;
            }
            try {
                resolveOnce(await readEntryText(zip, entry, limits));
            }
            catch (error) {
                rejectOnce(toError(error, "failed to read docx entry"));
            }
        };
        const onEntry = (entry) => {
            void handleEntry(entry);
        };
        zip.on("entry", onEntry);
        zip.once("end", onEnd);
        zip.once("error", onError);
        zip.readEntry();
    });
}
167
/**
 * Opens DOCX bytes as a zip and extracts the text of `word/document.xml`.
 *
 * @param bytes DOCX file bytes.
 * @param maxInputBytes Caller's input byte cap, used to derive the inflated-size limit.
 * @returns Promise resolving to the document XML text. The zip handle is closed on both
 *   success and failure.
 */
async function readDocumentXml(bytes, maxInputBytes) {
    const zip = await openZip(bytes);
    try {
        const limits = {
            maxInflatedEntryBytes: maxInflatedEntryBytes(maxInputBytes),
            maxInflateRatio: MAX_DOCUMENT_XML_INFLATE_RATIO,
        };
        const xml = await readDocumentXmlFromZip(zip, limits);
        return xml;
    }
    finally {
        closeZipQuietly(zip);
    }
}
179
/**
 * Decodes XML entities in a DOCX text run.
 *
 * GRD-027: this forwards to the shared decoder so numeric character references (smart
 * quotes, accented letters) are resolved rather than surfacing as literal `&#8217;`.
 *
 * @param value Raw text-run content.
 * @returns The decoded text, as produced by the shared decoder.
 */
function decodeXmlEntities(value) {
    const decoded = decodeXmlEntitiesShared(value);
    return decoded;
}
184
/**
 * Converts a `shouldStop` verdict into a diagnostic.
 *
 * @param input Parser input; `documentId` is read.
 * @param limit Verdict with `stop` and optional `code` / `message`.
 * @returns An "info" diagnostic when the verdict says stop AND carries both a code and a
 *   message; otherwise `undefined`.
 */
function limitDiagnostic(input, limit) {
    const { stop, code, message } = limit;
    const reportable = stop && code !== undefined && message !== undefined;
    return reportable ? diagnostic(code, message, input.documentId, "info") : undefined;
}
190
/**
 * Extracts the heading level from a paragraph's style reference.
 *
 * @param paragraphXml XML of one `<w:p>` element.
 * @returns The digit captured by `HEADING_STYLE_PATTERN` as a number, or `undefined`
 *   when the paragraph has no matching heading style.
 */
function headingLevelOf(paragraphXml) {
    const levelDigit = HEADING_STYLE_PATTERN.exec(paragraphXml)?.[1];
    return levelDigit === undefined ? undefined : Number(levelDigit);
}
194
// Matches, in document order, the inline pieces of a paragraph that contribute text:
//   1. a self-closing, empty text run (`<w:t/>`), which contributes nothing;
//   2. a text run `<w:t ...>text</w:t>` (capture group 1 holds the raw text);
//   3. an attribute-less `<w:tab/>`, `<w:br/>` or `<w:cr/>` (capture group 2 holds the
//      element name). Tab-stop definitions inside `<w:tabs>` always carry attributes and
//      are therefore not matched.
const PARAGRAPH_INLINE_PATTERN = /<w:t\b[^>]*\/>|<w:t\b[^>]*>([\s\S]*?)<\/w:t>|<w:(tab|br|cr)\s*\/>/gi;
/**
 * Extracts the visible text of one paragraph.
 *
 * Text runs are entity-decoded and concatenated in document order; `<w:tab/>` becomes a
 * tab and `<w:br/>` / `<w:cr/>` become a newline at the position where they occur.
 *
 * The tab / break elements sit next to `<w:t>` elements, not inside them, so replacing
 * them in the raw XML and then keeping only `<w:t>` content discards them. Tokenising
 * runs and control elements in a single pass keeps them. Matching `<w:t/>` explicitly
 * also stops a self-closing run from being read as an opening tag that swallows the
 * markup up to the next `</w:t>`.
 *
 * @param paragraphXml XML of one `<w:p>` element.
 * @returns The paragraph text with leading and trailing whitespace trimmed; an empty
 *   string when the paragraph has no text.
 */
function paragraphText(paragraphXml) {
    let text = "";
    for (const match of paragraphXml.matchAll(PARAGRAPH_INLINE_PATTERN)) {
        const runText = match[1];
        const controlName = match[2];
        if (controlName !== undefined) {
            // The pattern is case-insensitive, so normalise before comparing.
            text += controlName.toLowerCase() === "tab" ? "\t" : "\n";
        }
        else if (runText !== undefined) {
            text += decodeXmlEntities(runText);
        }
    }
    return text.trim();
}
204
/**
 * Scans the document XML paragraph by paragraph and collects the non-empty ones.
 *
 * Scanning stops early, with a diagnostic, when the number of scanned paragraphs reaches
 * `options.maxObjectsPerDocument` or when `shouldStop` reports a reportable stop.
 * Paragraphs with no text count as scanned but are not returned.
 *
 * @param xml Text of `word/document.xml`.
 * @param input Parser input; `documentId` is used for diagnostics.
 * @param options Parser options (`maxObjectsPerDocument`, plus whatever `shouldStop` reads).
 * @param startedAt Start timestamp forwarded to `shouldStop`.
 * @returns `{ paragraphs, diagnostics }`; each paragraph is `{ text }` or
 *   `{ text, headingLevel }`.
 */
function parseParagraphs(xml, input, options, startedAt) {
    const paragraphs = [];
    const diagnostics = [];
    let scanned = 0;
    for (const [paragraphXml] of xml.matchAll(PARAGRAPH_PATTERN)) {
        if (scanned >= options.maxObjectsPerDocument) {
            diagnostics.push(objectLimitDiagnostic(input.documentId, options.maxObjectsPerDocument));
            break;
        }
        const stopNotice = limitDiagnostic(input, shouldStop(startedAt, options, scanned));
        if (stopNotice !== undefined) {
            diagnostics.push(stopNotice);
            break;
        }
        scanned += 1;
        const text = paragraphText(paragraphXml);
        if (text.length === 0) {
            continue;
        }
        const headingLevel = headingLevelOf(paragraphXml);
        // Omit the key entirely for body paragraphs rather than storing `undefined`.
        paragraphs.push(headingLevel === undefined ? { text } : { text, headingLevel });
    }
    return { paragraphs, diagnostics };
}
229
/**
 * Computes the character offset at which each paragraph starts when the paragraph texts
 * are laid out one after another with a single separator character between them.
 *
 * @param paragraphs Items with a string `text` property.
 * @returns `{ starts, end }` where `starts[i]` is the offset of paragraph `i` and `end` is
 *   the total length of that layout (0 for no paragraphs).
 */
function paragraphStarts(paragraphs) {
    let offset = 0;
    const starts = Array.from(paragraphs, (paragraph) => {
        const begin = offset;
        // +1 accounts for the separator that follows every paragraph.
        offset += paragraph.text.length + 1;
        return begin;
    });
    // The last paragraph has no trailing separator; clamp so an empty list yields 0.
    return { starts, end: Math.max(0, offset - 1) };
}
238
/**
 * Tells whether an indexed paragraph entry is a heading.
 *
 * @param entry Object with a `paragraph` property.
 * @returns `true` when `entry.paragraph.headingLevel` is not `undefined`.
 */
function isHeadingEntry(entry) {
    const { headingLevel } = entry.paragraph;
    return headingLevel !== undefined;
}
241
/**
 * Pairs every paragraph with its position and keeps only the heading entries.
 *
 * @param paragraphs Array of parsed paragraphs.
 * @returns Array of `{ paragraph, index }` entries accepted by `isHeadingEntry`, in input order.
 */
function collectHeadings(paragraphs) {
    const headings = [];
    paragraphs.forEach((paragraph, index) => {
        const entry = { paragraph, index };
        if (isHeadingEntry(entry)) {
            headings.push(entry);
        }
    });
    return headings;
}
244
/**
 * Builds an "unsupported-media" unit for a document.
 *
 * @param documentId Identifier of the document the unit belongs to.
 * @param reason Reason value stored on the unit.
 * @returns `{ kind: "unsupported-media", documentId, reason }`.
 */
function unsupportedMediaUnit(documentId, reason) {
    const unit = { kind: "unsupported-media", documentId, reason };
    return unit;
}
247
/**
 * Builds the "section" unit that mirrors a section record.
 *
 * @param section Record with `documentId`, `sectionPath`, `characterStart` and `characterEnd`.
 * @returns A new object with `kind: "section"` followed by those four fields; `sectionPath`
 *   is carried over by reference, not copied.
 */
function sectionUnit(section) {
    const { documentId, sectionPath, characterStart, characterEnd } = section;
    return { kind: "section", documentId, sectionPath, characterStart, characterEnd };
}
256
/**
 * Appends one section record to `sections` and its matching unit to `units`.
 *
 * @param sections Array that receives the new section record (mutated).
 * @param units Array that receives `sectionUnit(record)` (mutated).
 * @param input Parse input; only `input.documentId` is read.
 * @param sectionPath Path stored on the record as-is.
 * @param start Stored as `characterStart`.
 * @param end Stored as `characterEnd`.
 */
function appendSectionRecord(sections, units, input, sectionPath, start, end) {
    const { documentId } = input;
    const record = { documentId, sectionPath, characterStart: start, characterEnd: end };
    sections.push(record);
    units.push(sectionUnit(record));
}
266
/**
 * Appends a section record unless a processing limit has been hit.
 *
 * `shouldStop` is consulted with the current number of units. If `limitDiagnostic` turns
 * its result into a diagnostic, that diagnostic is recorded and nothing is appended.
 *
 * @param sections Array that receives the section record (mutated on success).
 * @param units Array that receives the section unit (mutated on success).
 * @param diagnostics Array that receives the limit diagnostic (mutated on stop).
 * @param input Parse input forwarded to the helpers.
 * @param options Parse options forwarded to `shouldStop`.
 * @param startedAt Start marker forwarded to `shouldStop`.
 * @param sectionPath Path for the new section.
 * @param start Character start of the section.
 * @param end Character end of the section.
 * @returns `true` when the record was appended, `false` when a limit stopped it.
 */
function appendLimitedSectionRecord(sections, units, diagnostics, input, options, startedAt, sectionPath, start, end) {
    const stopped = limitDiagnostic(input, shouldStop(startedAt, options, units.length));
    if (stopped === undefined) {
        appendSectionRecord(sections, units, input, sectionPath, start, end);
        return true;
    }
    diagnostics.push(stopped);
    return false;
}
276
/**
 * Builds the section output for a document without headings: a single section with an
 * empty path spanning offsets `0..end`, subject to the usual limit check.
 *
 * @param input Parse input forwarded to the helper.
 * @param options Parse options forwarded to the helper.
 * @param startedAt Start marker forwarded to the helper.
 * @param end Character end of the whole text.
 * @returns `{ sections, units, diagnostics }`; the first two stay empty when a limit stopped
 *   the append, in which case `diagnostics` holds the reason.
 */
function buildUnsectionedSections(input, options, startedAt, end) {
    const result = { sections: [], units: [], diagnostics: [] };
    // The boolean outcome is not needed here; the arrays already reflect it.
    appendLimitedSectionRecord(result.sections, result.units, result.diagnostics, input, options, startedAt, [], 0, end);
    return result;
}
283
/**
 * Emits a path-less section for any text that precedes the first heading.
 *
 * @param sections Array that may receive the preamble section record.
 * @param units Array that may receive the preamble unit.
 * @param diagnostics Array that may receive a limit diagnostic.
 * @param input Parse input forwarded to the helper.
 * @param headings Heading entries (`{ paragraph, index }`); only the first is inspected.
 * @param offsets Result of `paragraphStarts`; `offsets.starts` is indexed by paragraph index.
 * @param options Parse options forwarded to the helper.
 * @param startedAt Start marker forwarded to the helper.
 * @returns `true` when there is no preamble to emit (no headings, or the first heading starts
 *   at offset 0 / has no recorded offset); otherwise the result of
 *   `appendLimitedSectionRecord` for the range `0..firstHeadingStart`.
 */
function appendLeadingPreambleSection(sections, units, diagnostics, input, headings, offsets, options, startedAt) {
    const leading = headings[0];
    // A missing heading or a missing offset entry both count as "starts at 0".
    const preambleEnd = leading === undefined ? 0 : (offsets.starts[leading.index] ?? 0);
    if (preambleEnd > 0) {
        return appendLimitedSectionRecord(sections, units, diagnostics, input, options, startedAt, [], 0, preambleEnd);
    }
    return true;
}
294
/**
 * Emits the section introduced by heading `current` and keeps the heading-path stack in sync.
 *
 * The stack in `state.stack` holds the texts of the headings that enclose the current
 * position. Entries at the new heading's level or deeper are removed, the new heading's text
 * is pushed, and a copy of the stack becomes the section path. The section spans from the
 * heading's start offset to the start of `next` (or to `state.offsets.end` for the last
 * heading, or when the next offset is missing).
 *
 * @param state Shared build state: `stack`, `offsets`, `sections`, `units`, `diagnostics`,
 *   `input`, `options`, `startedAt`. `stack`, `sections`, `units` and `diagnostics` may be mutated.
 * @param current Heading entry `{ paragraph, index }` that opens the section.
 * @param next The following heading entry, or `undefined` when `current` is the last one.
 * @returns The result of `appendLimitedSectionRecord`: `true` when the section was appended,
 *   `false` when a limit stopped it.
 */
function appendHeadingSection(state, current, next) {
    const level = current.paragraph.headingLevel;
    // Unwind to the parent of the new heading. The emptiness guard is required for
    // termination: popping an empty array leaves its length at 0, so for a level of 0 or
    // below `length >= level` on its own would stay true forever. For every level >= 1 the
    // guard is redundant and behavior is unchanged.
    while (state.stack.length > 0 && state.stack.length >= level) {
        state.stack.pop();
    }
    state.stack.push(current.paragraph.text);
    const start = state.offsets.starts[current.index] ?? 0;
    const end = next === undefined
        ? state.offsets.end
        : (state.offsets.starts[next.index] ?? state.offsets.end);
    // Pass a copy of the stack so later pushes/pops do not alter the recorded section path.
    return appendLimitedSectionRecord(state.sections, state.units, state.diagnostics, state.input, state.options, state.startedAt, [...state.stack], start, end);
}
305
/**
 * Emits one section per heading, in order, stopping at the first heading whose section
 * could not be appended.
 *
 * @param state Shared build state forwarded to `appendHeadingSection`.
 * @param headings Heading entries in document order.
 */
function appendHeadingSections(state, headings) {
    for (let position = 0; position < headings.length; position += 1) {
        // The following entry (or `undefined` for the last heading) bounds the section.
        const appended = appendHeadingSection(state, headings[position], headings[position + 1]);
        if (!appended) {
            return;
        }
    }
}
312
/**
 * Derives sections and units from the parsed paragraphs.
 *
 * Without any heading the whole text becomes one path-less section. Otherwise an optional
 * preamble section is emitted first, followed by one section per heading. If the preamble
 * append is stopped by a limit, no heading sections are emitted.
 *
 * @param paragraphs Parsed paragraphs (`{ text }` or `{ text, headingLevel }`).
 * @param input Parse input forwarded to the helpers.
 * @param options Parse options forwarded to the helpers.
 * @param startedAt Start marker forwarded to the helpers.
 * @returns `{ sections, units, diagnostics }`.
 */
function buildSections(paragraphs, input, options, startedAt) {
    const offsets = paragraphStarts(paragraphs);
    const headings = collectHeadings(paragraphs);
    if (headings.length === 0) {
        return buildUnsectionedSections(input, options, startedAt, offsets.end);
    }
    const state = {
        sections: [],
        units: [],
        diagnostics: [],
        input,
        options,
        startedAt,
        offsets,
        stack: [],
    };
    const preambleHandled = appendLeadingPreambleSection(state.sections, state.units, state.diagnostics, input, headings, offsets, options, startedAt);
    if (preambleHandled) {
        appendHeadingSections(state, headings);
    }
    return { sections: state.sections, units: state.units, diagnostics: state.diagnostics };
}
328
/**
 * Builds the result for a docx from which no paragraph text was extracted.
 *
 * @param capability Parser capability forwarded to `emptyResult`.
 * @param input Parse input; only `input.documentId` is read.
 * @param options Parse options forwarded to `emptyResult`.
 * @returns The value of `emptyResult` called with one "UNSUPPORTED_FORMAT" info diagnostic
 *   and one "docx-no-text" unsupported-media unit.
 */
function docxNoTextResult(capability, input, options) {
    const noTextDiagnostic = diagnostic("UNSUPPORTED_FORMAT", "docx has no extractable text content", input.documentId, "info");
    const placeholderUnit = unsupportedMediaUnit(input.documentId, "docx-no-text");
    return emptyResult(capability, input.documentId, options, [noTextDiagnostic], [placeholderUnit]);
}
333
/**
 * Assembles the parse result for a docx that yielded at least one paragraph.
 *
 * @param capability Parser capability; passed to `parserIdentity`.
 * @param input Parse input; `input.documentId` is copied to the result.
 * @param options Parse options; `options.now()` supplies `extractedAt`.
 * @param paragraphs Parsed paragraphs; their texts joined by "\n" form `normalizedText`.
 * @param diagnostics Diagnostics gathered while parsing paragraphs; they precede the
 *   diagnostics produced while building sections.
 * @param startedAt Start marker forwarded to `buildSections`.
 * @returns Result object with `documentId`, `parser`, an empty `pages` array, `sections`,
 *   `units`, `diagnostics`, `extractedAt` and `normalizedText`.
 */
function docxParseResult(capability, input, options, paragraphs, diagnostics, startedAt) {
    const { sections, units, diagnostics: sectionDiagnostics } = buildSections(paragraphs, input, options, startedAt);
    const texts = paragraphs.map(({ text }) => text);
    return {
        documentId: input.documentId,
        parser: parserIdentity(capability),
        pages: [],
        sections,
        units,
        diagnostics: [...diagnostics, ...sectionDiagnostics],
        extractedAt: options.now(),
        normalizedText: texts.join("\n"),
    };
}
347
/**
 * Asynchronous docx parse entry point.
 *
 * Steps, in order:
 *  1. Inputs larger than `options.maxBytes` yield an empty result with an oversize diagnostic.
 *  2. An already-aborted `options.signal` yields the `cancelled(...)` result.
 *  3. The document XML is read; if `shouldStop` then reports a complete limit (stop, code and
 *     message), an empty result carrying that limit as an "info" diagnostic is returned.
 *  4. Paragraphs are parsed. With none found, the result is either an empty result carrying
 *     the parse diagnostics or, when there are no diagnostics, the "no text" result.
 *  5. Otherwise the full parse result is assembled.
 * Anything thrown in steps 3-5 is converted into an empty result with a "MALFORMED_INPUT"
 * error diagnostic; the original error is not surfaced.
 *
 * @param capability Parser capability forwarded to the result builders.
 * @param input Parse input with `bytes` and `documentId`.
 * @param options Parse options (`maxBytes`, `signal`, `now`, plus limits read by helpers).
 * @returns A promise for the parse result.
 */
async function asyncParse(capability, input, options) {
    const inputSize = input.bytes.byteLength;
    if (inputSize > options.maxBytes) {
        const tooLarge = oversizeDiagnostic(input.documentId, inputSize, options.maxBytes);
        return emptyResult(capability, input.documentId, options, [tooLarge]);
    }
    if (options.signal?.aborted === true) {
        return cancelled(capability, input, options);
    }
    const startedAt = options.now();
    try {
        const xml = await readDocumentXml(input.bytes, options.maxBytes);
        // Re-check the limits after reading the XML, before scanning paragraphs.
        const { stop, code, message } = shouldStop(startedAt, options, 0);
        if (stop && code !== undefined && message !== undefined) {
            const limitHit = diagnostic(code, message, input.documentId, "info");
            return emptyResult(capability, input.documentId, options, [limitHit]);
        }
        const { paragraphs, diagnostics } = parseParagraphs(xml, input, options, startedAt);
        if (paragraphs.length > 0) {
            return docxParseResult(capability, input, options, paragraphs, diagnostics, startedAt);
        }
        // No text: prefer reporting why scanning stopped over the generic "no text" result.
        return diagnostics.length > 0
            ? emptyResult(capability, input.documentId, options, diagnostics)
            : docxNoTextResult(capability, input, options);
    } catch {
        const malformed = diagnostic("MALFORMED_INPUT", "docx parser rejected malformed or unsupported document", input.documentId, "error");
        return emptyResult(capability, input.documentId, options, [malformed]);
    }
}
380
/**
 * Frozen capability descriptor for the docx parser: its id, version, dependency versions
 * and the `isDocx` predicate used as `matches`.
 */
const capability = Object.freeze({
    parserId: PARSER_ID,
    parserVersion: PARSER_VERSION,
    dependencyVersions: DEPENDENCY_VERSIONS,
    matches: isDocx,
});
386
/**
 * Frozen docx parser adapter.
 *
 * - `capability`: the descriptor defined above.
 * - `parse`: whatever `syncFallback(capability)` returns.
 * - `parseAsync`: runs `asyncParse` with this parser's capability.
 */
export const docxParser = Object.freeze({
    capability,
    parse: syncFallback(capability),
    parseAsync: (input, options) => {
        return asyncParse(capability, input, options);
    },
});
@@ -0,0 +1,3 @@
1
+ import type { ParserAdapter } from "./types.js";
2
+ export declare const htmlParser: ParserAdapter;
3
+ //# sourceMappingURL=html-parser.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"html-parser.d.ts","sourceRoot":"","sources":["../../src/parsers/html-parser.ts"],"names":[],"mappings":"AA0BA,OAAO,KAAK,EAEV,aAAa,EAGd,MAAM,YAAY,CAAC;AAyUpB,eAAO,MAAM,UAAU,EAAE,aA4BvB,CAAC"}