kreuzberg 4.0.8 → 4.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (312) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
  5. data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
  6. data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
  7. data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
  8. data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
  9. data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
  10. data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
  11. data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
  12. data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
  13. data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
  14. data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
  15. data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
  16. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
  17. data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
  18. data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
  19. data/ext/kreuzberg_rb/native/src/result.rs +326 -0
  20. data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
  21. data/lib/kreuzberg/config.rb +99 -2
  22. data/lib/kreuzberg/result.rb +107 -2
  23. data/lib/kreuzberg/types.rb +104 -0
  24. data/lib/kreuzberg/version.rb +1 -1
  25. data/lib/kreuzberg.rb +0 -4
  26. data/sig/kreuzberg.rbs +105 -1
  27. data/spec/fixtures/config.toml +1 -1
  28. data/spec/fixtures/config.yaml +1 -1
  29. data/vendor/Cargo.toml +3 -3
  30. data/vendor/kreuzberg/Cargo.toml +5 -4
  31. data/vendor/kreuzberg/README.md +1 -1
  32. data/vendor/kreuzberg/src/api/config.rs +69 -0
  33. data/vendor/kreuzberg/src/api/handlers.rs +99 -2
  34. data/vendor/kreuzberg/src/api/mod.rs +14 -7
  35. data/vendor/kreuzberg/src/api/router.rs +214 -0
  36. data/vendor/kreuzberg/src/api/startup.rs +243 -0
  37. data/vendor/kreuzberg/src/api/types.rs +78 -0
  38. data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
  39. data/vendor/kreuzberg/src/cache/core.rs +428 -0
  40. data/vendor/kreuzberg/src/cache/mod.rs +21 -843
  41. data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
  42. data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
  43. data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
  44. data/vendor/kreuzberg/src/chunking/config.rs +52 -0
  45. data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
  46. data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
  47. data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
  48. data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
  49. data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
  50. data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
  51. data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
  52. data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
  53. data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
  54. data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
  55. data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
  56. data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
  57. data/vendor/kreuzberg/src/core/config/page.rs +57 -0
  58. data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
  59. data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
  60. data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
  61. data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
  62. data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
  63. data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
  64. data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
  65. data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
  66. data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
  67. data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
  68. data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
  69. data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
  70. data/vendor/kreuzberg/src/core/mime.rs +15 -0
  71. data/vendor/kreuzberg/src/core/mod.rs +4 -1
  72. data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
  73. data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
  74. data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
  75. data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
  76. data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
  77. data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
  78. data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
  79. data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
  80. data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
  81. data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
  82. data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
  83. data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
  84. data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
  85. data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
  86. data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
  87. data/vendor/kreuzberg/src/embeddings.rs +136 -13
  88. data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
  89. data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
  90. data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
  91. data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
  92. data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
  93. data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
  94. data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
  95. data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
  96. data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
  97. data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
  98. data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
  99. data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
  100. data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
  101. data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
  102. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
  103. data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
  104. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
  105. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +388 -0
  106. data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
  107. data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
  108. data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
  109. data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
  110. data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
  111. data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
  112. data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
  113. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
  114. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
  115. data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
  116. data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
  117. data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
  118. data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
  119. data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
  120. data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
  121. data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
  122. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
  123. data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
  124. data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
  125. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  126. data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
  127. data/vendor/kreuzberg/src/extractors/email.rs +2 -0
  128. data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
  129. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
  130. data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
  131. data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
  132. data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
  133. data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
  134. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
  135. data/vendor/kreuzberg/src/extractors/html.rs +80 -8
  136. data/vendor/kreuzberg/src/extractors/image.rs +8 -1
  137. data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
  138. data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
  139. data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
  140. data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
  141. data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
  142. data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
  143. data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
  144. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
  145. data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
  146. data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
  147. data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
  148. data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
  149. data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
  150. data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
  151. data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
  152. data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
  153. data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
  154. data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
  155. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
  156. data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
  157. data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
  158. data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
  159. data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
  160. data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
  161. data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
  162. data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
  163. data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
  164. data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
  165. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
  166. data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
  167. data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
  168. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  169. data/vendor/kreuzberg/src/extractors/text.rs +4 -0
  170. data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
  171. data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
  172. data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
  173. data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
  174. data/vendor/kreuzberg/src/lib.rs +2 -2
  175. data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
  176. data/vendor/kreuzberg/src/mcp/format.rs +211 -0
  177. data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
  178. data/vendor/kreuzberg/src/mcp/params.rs +196 -0
  179. data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
  180. data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
  181. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
  182. data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
  183. data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
  184. data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
  185. data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
  186. data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
  187. data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
  188. data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
  189. data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
  190. data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
  191. data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
  192. data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
  193. data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
  194. data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
  195. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
  196. data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
  197. data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
  198. data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
  199. data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
  200. data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
  201. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
  202. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
  203. data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
  204. data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
  205. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
  206. data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
  207. data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
  208. data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
  209. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
  210. data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
  211. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
  212. data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
  213. data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
  214. data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
  215. data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
  216. data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
  217. data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
  218. data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
  219. data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
  220. data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
  221. data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
  222. data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
  223. data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
  224. data/vendor/kreuzberg/src/text/quality.rs +1 -1
  225. data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
  226. data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
  227. data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
  228. data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
  229. data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
  230. data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
  231. data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
  232. data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
  233. data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
  234. data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
  235. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
  236. data/vendor/kreuzberg/src/types/djot.rs +209 -0
  237. data/vendor/kreuzberg/src/types/extraction.rs +301 -0
  238. data/vendor/kreuzberg/src/types/formats.rs +443 -0
  239. data/vendor/kreuzberg/src/types/metadata.rs +560 -0
  240. data/vendor/kreuzberg/src/types/mod.rs +281 -0
  241. data/vendor/kreuzberg/src/types/page.rs +182 -0
  242. data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
  243. data/vendor/kreuzberg/src/types/tables.rs +39 -0
  244. data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
  245. data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
  246. data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
  247. data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
  248. data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
  249. data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
  250. data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
  251. data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
  252. data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
  253. data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
  254. data/vendor/kreuzberg/tests/api_embed.rs +6 -9
  255. data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
  256. data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
  257. data/vendor/kreuzberg/tests/core_integration.rs +1 -0
  258. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
  259. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
  260. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  261. data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
  262. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  263. data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
  264. data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
  265. data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
  266. data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
  267. data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
  268. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
  269. data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
  270. data/vendor/kreuzberg/tests/pptx_regression_tests.rs +504 -0
  271. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
  272. data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
  273. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
  274. data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
  275. data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
  276. data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
  277. data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
  278. data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
  279. data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
  280. data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
  281. data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
  282. data/vendor/kreuzberg-ffi/src/error.rs +46 -14
  283. data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
  284. data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
  285. data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
  286. data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
  287. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
  288. data/vendor/kreuzberg-ffi/src/result.rs +148 -122
  289. data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
  290. data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
  291. metadata +201 -28
  292. data/vendor/kreuzberg/src/api/server.rs +0 -518
  293. data/vendor/kreuzberg/src/core/config.rs +0 -1914
  294. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  295. data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
  296. data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
  297. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  298. data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
  299. data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
  300. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  301. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  302. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  303. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  304. data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
  305. data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
  306. data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
  307. data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
  308. data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
  309. data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
  310. data/vendor/kreuzberg/src/types.rs +0 -1713
  311. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
  312. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
@@ -0,0 +1,294 @@
1
+ //! Chunk construction and building logic.
2
+ //!
3
+ //! This module handles the construction of individual chunks from text segments,
4
+ //! including overlap calculation, offset tracking, and metadata assembly.
5
+
6
+ use crate::error::{KreuzbergError, Result};
7
+ use crate::types::{Chunk, ChunkMetadata, PageBoundary};
8
+ use text_splitter::{Characters, ChunkCapacity, ChunkConfig};
9
+
10
+ use super::boundaries::calculate_page_range;
11
+
12
+ /// Build a ChunkConfig from chunking parameters.
13
+ ///
14
+ /// # Arguments
15
+ ///
16
+ /// * `max_characters` - Maximum characters per chunk
17
+ /// * `overlap` - Character overlap between consecutive chunks
18
+ /// * `trim` - Whether to trim whitespace from boundaries
19
+ ///
20
+ /// # Returns
21
+ ///
22
+ /// A configured ChunkConfig ready for use with text splitters.
23
+ ///
24
+ /// # Errors
25
+ ///
26
+ /// Returns `KreuzbergError::Validation` if configuration is invalid.
27
+ pub fn build_chunk_config(max_characters: usize, overlap: usize, trim: bool) -> Result<ChunkConfig<Characters>> {
28
+ ChunkConfig::new(ChunkCapacity::new(max_characters))
29
+ .with_overlap(overlap)
30
+ .map(|config| config.with_trim(trim))
31
+ .map_err(|e| KreuzbergError::validation(format!("Invalid chunking configuration: {}", e)))
32
+ }
33
+
34
+ /// Build chunks from text segments with optional page boundary tracking.
35
+ ///
36
+ /// This function takes a collection of text segments (produced by a text splitter)
37
+ /// and constructs Chunk objects with proper metadata, including:
38
+ /// - Byte offsets accounting for overlap
39
+ /// - Chunk indices and total count
40
+ /// - Page boundary information (if provided)
41
+ ///
42
+ /// # Arguments
43
+ ///
44
+ /// * `text_chunks` - Iterator of text segments to convert into chunks
45
+ /// * `overlap` - Number of characters to overlap between chunks
46
+ /// * `page_boundaries` - Optional page boundary markers for mapping chunks to pages
47
+ ///
48
+ /// # Returns
49
+ ///
50
+ /// A vector of Chunk objects with complete metadata.
51
+ ///
52
+ /// # Errors
53
+ ///
54
+ /// Returns an error if page boundary calculation fails.
55
+ pub fn build_chunks<'a, I>(
56
+ text_chunks: I,
57
+ overlap: usize,
58
+ page_boundaries: Option<&[PageBoundary]>,
59
+ ) -> Result<Vec<Chunk>>
60
+ where
61
+ I: IntoIterator<Item = &'a str>,
62
+ {
63
+ let chunks_vec: Vec<&str> = text_chunks.into_iter().collect();
64
+ let total_chunks = chunks_vec.len();
65
+ let mut byte_offset = 0;
66
+ let mut chunks = Vec::with_capacity(total_chunks);
67
+
68
+ for (index, chunk_text) in chunks_vec.into_iter().enumerate() {
69
+ let chunk = build_single_chunk(
70
+ chunk_text,
71
+ index,
72
+ total_chunks,
73
+ &mut byte_offset,
74
+ overlap,
75
+ page_boundaries,
76
+ )?;
77
+ chunks.push(chunk);
78
+ }
79
+
80
+ Ok(chunks)
81
+ }
82
+
83
+ /// Build a single chunk with metadata.
84
+ ///
85
+ /// # Arguments
86
+ ///
87
+ /// * `chunk_text` - The text content for this chunk
88
+ /// * `index` - Zero-based index of this chunk
89
+ /// * `total_chunks` - Total number of chunks in the collection
90
+ /// * `byte_offset` - Mutable reference to current byte offset (will be updated)
91
+ /// * `overlap` - Number of characters to overlap between chunks
92
+ /// * `page_boundaries` - Optional page boundary markers
93
+ ///
94
+ /// # Returns
95
+ ///
96
+ /// A complete Chunk object with all metadata filled in.
97
+ ///
98
+ /// # Errors
99
+ ///
100
+ /// Returns an error if page boundary calculation fails.
101
+ fn build_single_chunk(
102
+ chunk_text: &str,
103
+ index: usize,
104
+ total_chunks: usize,
105
+ byte_offset: &mut usize,
106
+ overlap: usize,
107
+ page_boundaries: Option<&[PageBoundary]>,
108
+ ) -> Result<Chunk> {
109
+ let byte_start = *byte_offset;
110
+ let chunk_length = chunk_text.len();
111
+ let byte_end = byte_start + chunk_length;
112
+
113
+ // Calculate overlap for next chunk (not applicable to last chunk)
114
+ let overlap_chars = if index < total_chunks - 1 {
115
+ overlap.min(chunk_length)
116
+ } else {
117
+ 0
118
+ };
119
+
120
+ // Update offset for next chunk, accounting for overlap
121
+ *byte_offset = byte_end - overlap_chars;
122
+
123
+ // Calculate page range if boundaries are provided
124
+ let (first_page, last_page) = if let Some(boundaries) = page_boundaries {
125
+ calculate_page_range(byte_start, byte_end, boundaries)?
126
+ } else {
127
+ (None, None)
128
+ };
129
+
130
+ Ok(Chunk {
131
+ content: chunk_text.to_string(),
132
+ embedding: None,
133
+ metadata: ChunkMetadata {
134
+ byte_start,
135
+ byte_end,
136
+ token_count: None,
137
+ chunk_index: index,
138
+ total_chunks,
139
+ first_page,
140
+ last_page,
141
+ },
142
+ })
143
+ }
144
+
145
#[cfg(test)]
mod tests {
    use super::*;

    /// Collects `(byte_start, byte_end)` for every chunk, in order.
    fn spans(chunks: &[Chunk]) -> Vec<(usize, usize)> {
        chunks
            .iter()
            .map(|chunk| (chunk.metadata.byte_start, chunk.metadata.byte_end))
            .collect()
    }

    #[test]
    fn test_build_chunk_config_valid() {
        assert!(build_chunk_config(100, 10, true).is_ok());
    }

    #[test]
    fn test_build_chunk_config_invalid_overlap() {
        // Overlap larger than the capacity must be rejected as a validation error.
        let outcome = build_chunk_config(10, 20, true);
        assert!(outcome.is_err());
        assert!(matches!(outcome.unwrap_err(), KreuzbergError::Validation { .. }));
    }

    #[test]
    fn test_build_chunks_empty() {
        let no_segments: Vec<&str> = Vec::new();
        let built = build_chunks(no_segments, 5, None).unwrap();
        assert_eq!(built.len(), 0);
    }

    #[test]
    fn test_build_chunks_single() {
        let built = build_chunks(vec!["Single chunk"], 5, None).unwrap();
        assert_eq!(built.len(), 1);

        let only = &built[0];
        assert_eq!(only.content, "Single chunk");
        assert_eq!(only.metadata.chunk_index, 0);
        assert_eq!(only.metadata.total_chunks, 1);
        assert_eq!(only.metadata.byte_start, 0);
        assert_eq!(only.metadata.byte_end, 12);
    }

    #[test]
    fn test_build_chunks_multiple_with_overlap() {
        let segments = vec!["First chunk here", "Second chunk here", "Third chunk here"];
        let built = build_chunks(segments, 5, None).unwrap();

        assert_eq!(built.len(), 3);

        // First chunk
        assert_eq!(built[0].content, "First chunk here");
        assert_eq!(built[0].metadata.byte_start, 0);
        assert_eq!(built[0].metadata.byte_end, 16);

        // Every later chunk should start before its predecessor ends (overlap)
        for pair in built.windows(2) {
            assert!(pair[1].metadata.byte_start < pair[0].metadata.byte_end);
        }
    }

    #[test]
    fn test_build_chunks_with_page_boundaries() {
        let pages = [
            PageBoundary {
                byte_start: 0,
                byte_end: 11,
                page_number: 1,
            },
            PageBoundary {
                byte_start: 11,
                byte_end: 23,
                page_number: 2,
            },
        ];

        let built = build_chunks(vec!["First chunk", "Second chunk"], 0, Some(&pages)).unwrap();

        assert_eq!(built.len(), 2);
        assert_eq!(built[0].metadata.first_page, Some(1));
        assert_eq!(built[1].metadata.first_page, Some(2));
    }

    #[test]
    fn test_build_chunks_offset_tracking() {
        let built = build_chunks(vec!["AAAAA", "BBBBB", "CCCCC"], 2, None).unwrap();

        assert_eq!(built.len(), 3);

        // With an overlap of 2 each chunk starts 3 bytes after the previous one:
        // 0-5, 3-8, 6-11.
        assert_eq!(spans(&built), vec![(0, 5), (3, 8), (6, 11)]);
    }

    #[test]
    fn test_build_single_chunk_metadata() {
        let mut cursor = 0;
        let chunk = build_single_chunk("Test content", 0, 1, &mut cursor, 5, None).unwrap();

        assert_eq!(chunk.content, "Test content");
        assert_eq!(chunk.metadata.byte_start, 0);
        assert_eq!(chunk.metadata.byte_end, 12);
        assert_eq!(chunk.metadata.chunk_index, 0);
        assert_eq!(chunk.metadata.total_chunks, 1);
        assert_eq!(chunk.metadata.first_page, None);
        assert_eq!(chunk.metadata.last_page, None);
    }

    #[test]
    fn test_build_single_chunk_with_overlap() {
        let mut cursor = 0;

        // First chunk: the cursor backs up by the overlap (10 - 3).
        let first = build_single_chunk("0123456789", 0, 2, &mut cursor, 3, None).unwrap();
        assert_eq!(first.metadata.byte_start, 0);
        assert_eq!(first.metadata.byte_end, 10);
        assert_eq!(cursor, 7);

        // Second (last) chunk: no overlap is subtracted.
        let second = build_single_chunk("ABCDEFGHIJ", 1, 2, &mut cursor, 3, None).unwrap();
        assert_eq!(second.metadata.byte_start, 7);
        assert_eq!(second.metadata.byte_end, 17);
        assert_eq!(cursor, 17);
    }

    #[test]
    fn test_build_chunks_no_overlap() {
        let built = build_chunks(vec!["AAAAA", "BBBBB", "CCCCC"], 0, None).unwrap();

        assert_eq!(built.len(), 3);

        // Chunks should be contiguous with no overlap
        assert_eq!(spans(&built), vec![(0, 5), (5, 10), (10, 15)]);
    }
}
@@ -0,0 +1,52 @@
1
+ //! Configuration types for text chunking.
2
+
3
+ use serde::{Deserialize, Serialize};
4
+
5
+ /// Configuration options for text chunking operations.
6
+ ///
7
+ /// # Fields
8
+ ///
9
+ /// * `max_characters` - Maximum number of characters per chunk (default: 2000)
10
+ /// * `overlap` - Number of characters to overlap between consecutive chunks (default: 100)
11
+ /// * `trim` - Whether to trim whitespace from chunk boundaries (default: true)
12
+ /// * `chunker_type` - Type of chunker to use (Text or Markdown) (default: Text)
13
+ pub struct ChunkingConfig {
14
+ pub max_characters: usize,
15
+ pub overlap: usize,
16
+ pub trim: bool,
17
+ pub chunker_type: ChunkerType,
18
+ }
19
+
20
+ impl Default for ChunkingConfig {
21
+ fn default() -> Self {
22
+ Self {
23
+ max_characters: 2000,
24
+ overlap: 100,
25
+ trim: true,
26
+ chunker_type: ChunkerType::Text,
27
+ }
28
+ }
29
+ }
30
+
31
+ /// Type of text chunker to use.
32
+ ///
33
+ /// # Variants
34
+ ///
35
+ /// * `Text` - Generic text splitter, splits on whitespace and punctuation
36
+ /// * `Markdown` - Markdown-aware splitter, preserves formatting and structure
37
+ #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
38
+ pub enum ChunkerType {
39
+ Text,
40
+ Markdown,
41
+ }
42
+
43
+ /// Result of a text chunking operation.
44
+ ///
45
+ /// Contains the generated chunks and metadata about the chunking.
46
+ #[derive(Debug, Clone, Serialize, Deserialize)]
47
+ pub struct ChunkingResult {
48
+ /// List of text chunks
49
+ pub chunks: Vec<crate::types::Chunk>,
50
+ /// Total number of chunks generated
51
+ pub chunk_count: usize,
52
+ }