kreuzberg 4.0.8 → 4.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (308) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
  5. data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
  6. data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
  7. data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
  8. data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
  9. data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
  10. data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
  11. data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
  12. data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
  13. data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
  14. data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
  15. data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
  16. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
  17. data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
  18. data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
  19. data/ext/kreuzberg_rb/native/src/result.rs +326 -0
  20. data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
  21. data/lib/kreuzberg/config.rb +66 -0
  22. data/lib/kreuzberg/result.rb +107 -2
  23. data/lib/kreuzberg/types.rb +104 -0
  24. data/lib/kreuzberg/version.rb +1 -1
  25. data/lib/kreuzberg.rb +0 -4
  26. data/sig/kreuzberg.rbs +105 -1
  27. data/vendor/Cargo.toml +3 -3
  28. data/vendor/kreuzberg/Cargo.toml +4 -3
  29. data/vendor/kreuzberg/README.md +1 -1
  30. data/vendor/kreuzberg/src/api/config.rs +69 -0
  31. data/vendor/kreuzberg/src/api/handlers.rs +99 -2
  32. data/vendor/kreuzberg/src/api/mod.rs +14 -7
  33. data/vendor/kreuzberg/src/api/router.rs +214 -0
  34. data/vendor/kreuzberg/src/api/startup.rs +243 -0
  35. data/vendor/kreuzberg/src/api/types.rs +78 -0
  36. data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
  37. data/vendor/kreuzberg/src/cache/core.rs +428 -0
  38. data/vendor/kreuzberg/src/cache/mod.rs +21 -843
  39. data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
  40. data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
  41. data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
  42. data/vendor/kreuzberg/src/chunking/config.rs +52 -0
  43. data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
  44. data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
  45. data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
  46. data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
  47. data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
  48. data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
  49. data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
  50. data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
  51. data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
  52. data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
  53. data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
  54. data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
  55. data/vendor/kreuzberg/src/core/config/page.rs +57 -0
  56. data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
  57. data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
  58. data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
  59. data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
  60. data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
  61. data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
  62. data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
  63. data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
  64. data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
  65. data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
  66. data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
  67. data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
  68. data/vendor/kreuzberg/src/core/mod.rs +4 -1
  69. data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
  70. data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
  71. data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
  72. data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
  73. data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
  74. data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
  75. data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
  76. data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
  77. data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
  78. data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
  79. data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
  80. data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
  81. data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
  82. data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
  83. data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
  84. data/vendor/kreuzberg/src/embeddings.rs +136 -13
  85. data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
  86. data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
  87. data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
  88. data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
  89. data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
  90. data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
  91. data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
  92. data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
  93. data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
  94. data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
  95. data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
  96. data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
  97. data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
  98. data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
  99. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
  100. data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
  101. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
  102. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
  103. data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
  104. data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
  105. data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
  106. data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
  107. data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
  108. data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
  109. data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
  110. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
  111. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
  112. data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
  113. data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
  114. data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
  115. data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
  116. data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
  117. data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
  118. data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
  119. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
  120. data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
  121. data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
  122. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  123. data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
  124. data/vendor/kreuzberg/src/extractors/email.rs +2 -0
  125. data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
  126. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
  127. data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
  128. data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
  129. data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
  130. data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
  131. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
  132. data/vendor/kreuzberg/src/extractors/html.rs +80 -8
  133. data/vendor/kreuzberg/src/extractors/image.rs +8 -1
  134. data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
  135. data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
  136. data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
  137. data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
  138. data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
  139. data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
  140. data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
  141. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
  142. data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
  143. data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
  144. data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
  145. data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
  146. data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
  147. data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
  148. data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
  149. data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
  150. data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
  151. data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
  152. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
  153. data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
  154. data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
  155. data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
  156. data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
  157. data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
  158. data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
  159. data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
  160. data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
  161. data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
  162. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
  163. data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
  164. data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
  165. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  166. data/vendor/kreuzberg/src/extractors/text.rs +4 -0
  167. data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
  168. data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
  169. data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
  170. data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
  171. data/vendor/kreuzberg/src/lib.rs +2 -2
  172. data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
  173. data/vendor/kreuzberg/src/mcp/format.rs +211 -0
  174. data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
  175. data/vendor/kreuzberg/src/mcp/params.rs +196 -0
  176. data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
  177. data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
  178. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
  179. data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
  180. data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
  181. data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
  182. data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
  183. data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
  184. data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
  185. data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
  186. data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
  187. data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
  188. data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
  189. data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
  190. data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
  191. data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
  192. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
  193. data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
  194. data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
  195. data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
  196. data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
  197. data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
  198. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
  199. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
  200. data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
  201. data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
  202. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
  203. data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
  204. data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
  205. data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
  206. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
  207. data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
  208. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
  209. data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
  210. data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
  211. data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
  212. data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
  213. data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
  214. data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
  215. data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
  216. data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
  217. data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
  218. data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
  219. data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
  220. data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
  221. data/vendor/kreuzberg/src/text/quality.rs +1 -1
  222. data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
  223. data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
  224. data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
  225. data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
  226. data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
  227. data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
  228. data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
  229. data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
  230. data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
  231. data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
  232. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
  233. data/vendor/kreuzberg/src/types/djot.rs +209 -0
  234. data/vendor/kreuzberg/src/types/extraction.rs +301 -0
  235. data/vendor/kreuzberg/src/types/formats.rs +443 -0
  236. data/vendor/kreuzberg/src/types/metadata.rs +560 -0
  237. data/vendor/kreuzberg/src/types/mod.rs +281 -0
  238. data/vendor/kreuzberg/src/types/page.rs +182 -0
  239. data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
  240. data/vendor/kreuzberg/src/types/tables.rs +39 -0
  241. data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
  242. data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
  243. data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
  244. data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
  245. data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
  246. data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
  247. data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
  248. data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
  249. data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
  250. data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
  251. data/vendor/kreuzberg/tests/api_embed.rs +6 -9
  252. data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
  253. data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
  254. data/vendor/kreuzberg/tests/core_integration.rs +1 -0
  255. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
  256. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
  257. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  258. data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
  259. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  260. data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
  261. data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
  262. data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
  263. data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
  264. data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
  265. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
  266. data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
  267. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
  268. data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
  269. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
  270. data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
  271. data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
  272. data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
  273. data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
  274. data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
  275. data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
  276. data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
  277. data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
  278. data/vendor/kreuzberg-ffi/src/error.rs +46 -14
  279. data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
  280. data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
  281. data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
  282. data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
  283. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
  284. data/vendor/kreuzberg-ffi/src/result.rs +148 -122
  285. data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
  286. data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
  287. metadata +200 -28
  288. data/vendor/kreuzberg/src/api/server.rs +0 -518
  289. data/vendor/kreuzberg/src/core/config.rs +0 -1914
  290. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  291. data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
  292. data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
  293. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  294. data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
  295. data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
  296. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  297. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  298. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  299. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  300. data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
  301. data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
  302. data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
  303. data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
  304. data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
  305. data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
  306. data/vendor/kreuzberg/src/types.rs +0 -1713
  307. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
  308. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
@@ -0,0 +1,294 @@
1
+ //! Chunk construction and building logic.
2
+ //!
3
+ //! This module handles the construction of individual chunks from text segments,
4
+ //! including overlap calculation, offset tracking, and metadata assembly.
5
+
6
+ use crate::error::{KreuzbergError, Result};
7
+ use crate::types::{Chunk, ChunkMetadata, PageBoundary};
8
+ use text_splitter::{Characters, ChunkCapacity, ChunkConfig};
9
+
10
+ use super::boundaries::calculate_page_range;
11
+
12
+ /// Build a ChunkConfig from chunking parameters.
13
+ ///
14
+ /// # Arguments
15
+ ///
16
+ /// * `max_characters` - Maximum characters per chunk
17
+ /// * `overlap` - Character overlap between consecutive chunks
18
+ /// * `trim` - Whether to trim whitespace from boundaries
19
+ ///
20
+ /// # Returns
21
+ ///
22
+ /// A configured ChunkConfig ready for use with text splitters.
23
+ ///
24
+ /// # Errors
25
+ ///
26
+ /// Returns `KreuzbergError::Validation` if configuration is invalid.
27
+ pub fn build_chunk_config(max_characters: usize, overlap: usize, trim: bool) -> Result<ChunkConfig<Characters>> {
28
+ ChunkConfig::new(ChunkCapacity::new(max_characters))
29
+ .with_overlap(overlap)
30
+ .map(|config| config.with_trim(trim))
31
+ .map_err(|e| KreuzbergError::validation(format!("Invalid chunking configuration: {}", e)))
32
+ }
33
+
34
+ /// Build chunks from text segments with optional page boundary tracking.
35
+ ///
36
+ /// This function takes a collection of text segments (produced by a text splitter)
37
+ /// and constructs Chunk objects with proper metadata, including:
38
+ /// - Byte offsets accounting for overlap
39
+ /// - Chunk indices and total count
40
+ /// - Page boundary information (if provided)
41
+ ///
42
+ /// # Arguments
43
+ ///
44
+ /// * `text_chunks` - Iterator of text segments to convert into chunks
45
+ /// * `overlap` - Number of characters to overlap between chunks
46
+ /// * `page_boundaries` - Optional page boundary markers for mapping chunks to pages
47
+ ///
48
+ /// # Returns
49
+ ///
50
+ /// A vector of Chunk objects with complete metadata.
51
+ ///
52
+ /// # Errors
53
+ ///
54
+ /// Returns an error if page boundary calculation fails.
55
+ pub fn build_chunks<'a, I>(
56
+ text_chunks: I,
57
+ overlap: usize,
58
+ page_boundaries: Option<&[PageBoundary]>,
59
+ ) -> Result<Vec<Chunk>>
60
+ where
61
+ I: IntoIterator<Item = &'a str>,
62
+ {
63
+ let chunks_vec: Vec<&str> = text_chunks.into_iter().collect();
64
+ let total_chunks = chunks_vec.len();
65
+ let mut byte_offset = 0;
66
+ let mut chunks = Vec::with_capacity(total_chunks);
67
+
68
+ for (index, chunk_text) in chunks_vec.into_iter().enumerate() {
69
+ let chunk = build_single_chunk(
70
+ chunk_text,
71
+ index,
72
+ total_chunks,
73
+ &mut byte_offset,
74
+ overlap,
75
+ page_boundaries,
76
+ )?;
77
+ chunks.push(chunk);
78
+ }
79
+
80
+ Ok(chunks)
81
+ }
82
+
83
+ /// Build a single chunk with metadata.
84
+ ///
85
+ /// # Arguments
86
+ ///
87
+ /// * `chunk_text` - The text content for this chunk
88
+ /// * `index` - Zero-based index of this chunk
89
+ /// * `total_chunks` - Total number of chunks in the collection
90
+ /// * `byte_offset` - Mutable reference to current byte offset (will be updated)
91
+ /// * `overlap` - Number of characters to overlap between chunks
92
+ /// * `page_boundaries` - Optional page boundary markers
93
+ ///
94
+ /// # Returns
95
+ ///
96
+ /// A complete Chunk object with all metadata filled in.
97
+ ///
98
+ /// # Errors
99
+ ///
100
+ /// Returns an error if page boundary calculation fails.
101
+ fn build_single_chunk(
102
+ chunk_text: &str,
103
+ index: usize,
104
+ total_chunks: usize,
105
+ byte_offset: &mut usize,
106
+ overlap: usize,
107
+ page_boundaries: Option<&[PageBoundary]>,
108
+ ) -> Result<Chunk> {
109
+ let byte_start = *byte_offset;
110
+ let chunk_length = chunk_text.len();
111
+ let byte_end = byte_start + chunk_length;
112
+
113
+ // Calculate overlap for next chunk (not applicable to last chunk)
114
+ let overlap_chars = if index < total_chunks - 1 {
115
+ overlap.min(chunk_length)
116
+ } else {
117
+ 0
118
+ };
119
+
120
+ // Update offset for next chunk, accounting for overlap
121
+ *byte_offset = byte_end - overlap_chars;
122
+
123
+ // Calculate page range if boundaries are provided
124
+ let (first_page, last_page) = if let Some(boundaries) = page_boundaries {
125
+ calculate_page_range(byte_start, byte_end, boundaries)?
126
+ } else {
127
+ (None, None)
128
+ };
129
+
130
+ Ok(Chunk {
131
+ content: chunk_text.to_string(),
132
+ embedding: None,
133
+ metadata: ChunkMetadata {
134
+ byte_start,
135
+ byte_end,
136
+ token_count: None,
137
+ chunk_index: index,
138
+ total_chunks,
139
+ first_page,
140
+ last_page,
141
+ },
142
+ })
143
+ }
144
+
145
#[cfg(test)]
mod tests {
    use super::*;

    /// `(byte_start, byte_end)` of a chunk, for compact span assertions.
    fn span(chunk: &Chunk) -> (usize, usize) {
        (chunk.metadata.byte_start, chunk.metadata.byte_end)
    }

    /// Spans of all chunks, in order.
    fn spans(chunks: &[Chunk]) -> Vec<(usize, usize)> {
        chunks.iter().map(span).collect()
    }

    #[test]
    fn test_build_chunk_config_valid() {
        assert!(build_chunk_config(100, 10, true).is_ok());
    }

    #[test]
    fn test_build_chunk_config_invalid_overlap() {
        // An overlap of 20 with a capacity of 10 must be rejected.
        match build_chunk_config(10, 20, true) {
            Err(err) => assert!(matches!(err, KreuzbergError::Validation { .. })),
            Ok(_) => panic!("expected a validation error"),
        }
    }

    #[test]
    fn test_build_chunks_empty() {
        let no_segments: Vec<&str> = Vec::new();
        let built = build_chunks(no_segments, 5, None).unwrap();
        assert_eq!(built.len(), 0);
    }

    #[test]
    fn test_build_chunks_single() {
        let built = build_chunks(vec!["Single chunk"], 5, None).unwrap();
        assert_eq!(built.len(), 1);

        let only = &built[0];
        assert_eq!(only.content, "Single chunk");
        assert_eq!(only.metadata.chunk_index, 0);
        assert_eq!(only.metadata.total_chunks, 1);
        assert_eq!(span(only), (0, 12));
    }

    #[test]
    fn test_build_chunks_multiple_with_overlap() {
        let segments = vec!["First chunk here", "Second chunk here", "Third chunk here"];
        let built = build_chunks(segments, 5, None).unwrap();

        assert_eq!(built.len(), 3);

        // First chunk
        assert_eq!(built[0].content, "First chunk here");
        assert_eq!(span(&built[0]), (0, 16));

        // Each later chunk starts before its predecessor ends (overlap).
        for pair in built.windows(2) {
            assert!(pair[1].metadata.byte_start < pair[0].metadata.byte_end);
        }
    }

    #[test]
    fn test_build_chunks_with_page_boundaries() {
        let boundaries = [
            PageBoundary {
                byte_start: 0,
                byte_end: 11,
                page_number: 1,
            },
            PageBoundary {
                byte_start: 11,
                byte_end: 23,
                page_number: 2,
            },
        ];

        let built = build_chunks(vec!["First chunk", "Second chunk"], 0, Some(&boundaries)).unwrap();

        assert_eq!(built.len(), 2);
        assert_eq!(built[0].metadata.first_page, Some(1));
        assert_eq!(built[1].metadata.first_page, Some(2));
    }

    #[test]
    fn test_build_chunks_offset_tracking() {
        let built = build_chunks(vec!["AAAAA", "BBBBB", "CCCCC"], 2, None).unwrap();

        assert_eq!(built.len(), 3);

        // With an overlap of 2 every chunk starts 3 bytes after the previous
        // one: 0-5, 3-8, 6-11.
        assert_eq!(spans(&built), vec![(0, 5), (3, 8), (6, 11)]);
    }

    #[test]
    fn test_build_single_chunk_metadata() {
        let mut cursor = 0;
        let built = build_single_chunk("Test content", 0, 1, &mut cursor, 5, None).unwrap();

        assert_eq!(built.content, "Test content");
        assert_eq!(span(&built), (0, 12));
        assert_eq!(built.metadata.chunk_index, 0);
        assert_eq!(built.metadata.total_chunks, 1);
        assert_eq!(built.metadata.first_page, None);
        assert_eq!(built.metadata.last_page, None);
    }

    #[test]
    fn test_build_single_chunk_with_overlap() {
        let mut cursor = 0;

        // First chunk: the cursor is pulled back by the overlap (10 - 3).
        let head = build_single_chunk("0123456789", 0, 2, &mut cursor, 3, None).unwrap();
        assert_eq!(span(&head), (0, 10));
        assert_eq!(cursor, 7);

        // Second (last) chunk: no overlap is subtracted.
        let tail = build_single_chunk("ABCDEFGHIJ", 1, 2, &mut cursor, 3, None).unwrap();
        assert_eq!(span(&tail), (7, 17));
        assert_eq!(cursor, 17);
    }

    #[test]
    fn test_build_chunks_no_overlap() {
        let built = build_chunks(vec!["AAAAA", "BBBBB", "CCCCC"], 0, None).unwrap();

        assert_eq!(built.len(), 3);

        // Chunks should be contiguous with no overlap
        assert_eq!(spans(&built), vec![(0, 5), (5, 10), (10, 15)]);
    }
}
@@ -0,0 +1,52 @@
1
+ //! Configuration types for text chunking.
2
+
3
+ use serde::{Deserialize, Serialize};
4
+
5
+ /// Configuration options for text chunking operations.
6
+ ///
7
+ /// # Fields
8
+ ///
9
+ /// * `max_characters` - Maximum number of characters per chunk (default: 2000)
10
+ /// * `overlap` - Number of characters to overlap between consecutive chunks (default: 100)
11
+ /// * `trim` - Whether to trim whitespace from chunk boundaries (default: true)
12
+ /// * `chunker_type` - Type of chunker to use (Text or Markdown) (default: Text)
13
+ pub struct ChunkingConfig {
14
+ pub max_characters: usize,
15
+ pub overlap: usize,
16
+ pub trim: bool,
17
+ pub chunker_type: ChunkerType,
18
+ }
19
+
20
+ impl Default for ChunkingConfig {
21
+ fn default() -> Self {
22
+ Self {
23
+ max_characters: 2000,
24
+ overlap: 100,
25
+ trim: true,
26
+ chunker_type: ChunkerType::Text,
27
+ }
28
+ }
29
+ }
30
+
31
+ /// Type of text chunker to use.
32
+ ///
33
+ /// # Variants
34
+ ///
35
+ /// * `Text` - Generic text splitter, splits on whitespace and punctuation
36
+ /// * `Markdown` - Markdown-aware splitter, preserves formatting and structure
37
+ #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
38
+ pub enum ChunkerType {
39
+ Text,
40
+ Markdown,
41
+ }
42
+
43
+ /// Result of a text chunking operation.
44
+ ///
45
+ /// Contains the generated chunks and metadata about the chunking.
46
+ #[derive(Debug, Clone, Serialize, Deserialize)]
47
+ pub struct ChunkingResult {
48
+ /// List of text chunks
49
+ pub chunks: Vec<crate::types::Chunk>,
50
+ /// Total number of chunks generated
51
+ pub chunk_count: usize,
52
+ }