kreuzberg 4.0.7 → 4.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (308):
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +24 -16
  3. data/README.md +4 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
  5. data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
  6. data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
  7. data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
  8. data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
  9. data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
  10. data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
  11. data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
  12. data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
  13. data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
  14. data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
  15. data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
  16. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
  17. data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
  18. data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
  19. data/ext/kreuzberg_rb/native/src/result.rs +326 -0
  20. data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
  21. data/lib/kreuzberg/config.rb +66 -0
  22. data/lib/kreuzberg/result.rb +107 -2
  23. data/lib/kreuzberg/types.rb +104 -0
  24. data/lib/kreuzberg/version.rb +1 -1
  25. data/lib/kreuzberg.rb +0 -4
  26. data/sig/kreuzberg.rbs +105 -1
  27. data/vendor/Cargo.toml +3 -3
  28. data/vendor/kreuzberg/Cargo.toml +4 -3
  29. data/vendor/kreuzberg/README.md +1 -1
  30. data/vendor/kreuzberg/src/api/config.rs +69 -0
  31. data/vendor/kreuzberg/src/api/handlers.rs +99 -2
  32. data/vendor/kreuzberg/src/api/mod.rs +14 -7
  33. data/vendor/kreuzberg/src/api/router.rs +214 -0
  34. data/vendor/kreuzberg/src/api/startup.rs +243 -0
  35. data/vendor/kreuzberg/src/api/types.rs +78 -0
  36. data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
  37. data/vendor/kreuzberg/src/cache/core.rs +428 -0
  38. data/vendor/kreuzberg/src/cache/mod.rs +21 -843
  39. data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
  40. data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
  41. data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
  42. data/vendor/kreuzberg/src/chunking/config.rs +52 -0
  43. data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
  44. data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
  45. data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
  46. data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
  47. data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
  48. data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
  49. data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
  50. data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
  51. data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
  52. data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
  53. data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
  54. data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
  55. data/vendor/kreuzberg/src/core/config/page.rs +57 -0
  56. data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
  57. data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
  58. data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
  59. data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
  60. data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
  61. data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
  62. data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
  63. data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
  64. data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
  65. data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
  66. data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
  67. data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
  68. data/vendor/kreuzberg/src/core/mod.rs +4 -1
  69. data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
  70. data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
  71. data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
  72. data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
  73. data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
  74. data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
  75. data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
  76. data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
  77. data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
  78. data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
  79. data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
  80. data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
  81. data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
  82. data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
  83. data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
  84. data/vendor/kreuzberg/src/embeddings.rs +136 -13
  85. data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
  86. data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
  87. data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
  88. data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
  89. data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
  90. data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
  91. data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
  92. data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
  93. data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
  94. data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
  95. data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
  96. data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
  97. data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
  98. data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
  99. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
  100. data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
  101. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
  102. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
  103. data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
  104. data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
  105. data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
  106. data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
  107. data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
  108. data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
  109. data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
  110. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
  111. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
  112. data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
  113. data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
  114. data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
  115. data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
  116. data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
  117. data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
  118. data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
  119. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
  120. data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
  121. data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
  122. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  123. data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
  124. data/vendor/kreuzberg/src/extractors/email.rs +2 -0
  125. data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
  126. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
  127. data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
  128. data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
  129. data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
  130. data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
  131. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
  132. data/vendor/kreuzberg/src/extractors/html.rs +80 -8
  133. data/vendor/kreuzberg/src/extractors/image.rs +8 -1
  134. data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
  135. data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
  136. data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
  137. data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
  138. data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
  139. data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
  140. data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
  141. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
  142. data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
  143. data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
  144. data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
  145. data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
  146. data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
  147. data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
  148. data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
  149. data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
  150. data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
  151. data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
  152. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
  153. data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
  154. data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
  155. data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
  156. data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
  157. data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
  158. data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
  159. data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
  160. data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
  161. data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
  162. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
  163. data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
  164. data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
  165. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  166. data/vendor/kreuzberg/src/extractors/text.rs +4 -0
  167. data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
  168. data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
  169. data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
  170. data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
  171. data/vendor/kreuzberg/src/lib.rs +2 -2
  172. data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
  173. data/vendor/kreuzberg/src/mcp/format.rs +211 -0
  174. data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
  175. data/vendor/kreuzberg/src/mcp/params.rs +196 -0
  176. data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
  177. data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
  178. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
  179. data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
  180. data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
  181. data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
  182. data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
  183. data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
  184. data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
  185. data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
  186. data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
  187. data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
  188. data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
  189. data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
  190. data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
  191. data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
  192. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
  193. data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
  194. data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
  195. data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
  196. data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
  197. data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
  198. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
  199. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
  200. data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
  201. data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
  202. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
  203. data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
  204. data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
  205. data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
  206. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
  207. data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
  208. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
  209. data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
  210. data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
  211. data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
  212. data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
  213. data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
  214. data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
  215. data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
  216. data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
  217. data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
  218. data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
  219. data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
  220. data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
  221. data/vendor/kreuzberg/src/text/quality.rs +1 -1
  222. data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
  223. data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
  224. data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
  225. data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
  226. data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
  227. data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
  228. data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
  229. data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
  230. data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
  231. data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
  232. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
  233. data/vendor/kreuzberg/src/types/djot.rs +209 -0
  234. data/vendor/kreuzberg/src/types/extraction.rs +301 -0
  235. data/vendor/kreuzberg/src/types/formats.rs +443 -0
  236. data/vendor/kreuzberg/src/types/metadata.rs +560 -0
  237. data/vendor/kreuzberg/src/types/mod.rs +281 -0
  238. data/vendor/kreuzberg/src/types/page.rs +182 -0
  239. data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
  240. data/vendor/kreuzberg/src/types/tables.rs +39 -0
  241. data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
  242. data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
  243. data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
  244. data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
  245. data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
  246. data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
  247. data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
  248. data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
  249. data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
  250. data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
  251. data/vendor/kreuzberg/tests/api_embed.rs +6 -9
  252. data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
  253. data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
  254. data/vendor/kreuzberg/tests/core_integration.rs +1 -0
  255. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
  256. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
  257. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  258. data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
  259. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  260. data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
  261. data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
  262. data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
  263. data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
  264. data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
  265. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
  266. data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
  267. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
  268. data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
  269. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
  270. data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
  271. data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
  272. data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
  273. data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
  274. data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
  275. data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
  276. data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
  277. data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
  278. data/vendor/kreuzberg-ffi/src/error.rs +46 -14
  279. data/vendor/kreuzberg-ffi/src/helpers.rs +26 -353
  280. data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
  281. data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
  282. data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
  283. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
  284. data/vendor/kreuzberg-ffi/src/result.rs +148 -122
  285. data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
  286. data/vendor/kreuzberg-tesseract/Cargo.toml +3 -3
  287. metadata +200 -28
  288. data/vendor/kreuzberg/src/api/server.rs +0 -518
  289. data/vendor/kreuzberg/src/core/config.rs +0 -1914
  290. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  291. data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
  292. data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
  293. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  294. data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
  295. data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
  296. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  297. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  298. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  299. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  300. data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
  301. data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
  302. data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
  303. data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
  304. data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
  305. data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
  306. data/vendor/kreuzberg/src/types.rs +0 -1713
  307. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
  308. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
@@ -0,0 +1,1017 @@
1
+ //! Core text chunking logic and public API.
2
+ //!
3
+ //! This module implements the main chunking algorithms and provides the primary
4
+ //! public API functions for splitting text into chunks.
5
+
6
+ use crate::error::Result;
7
+ use crate::types::PageBoundary;
8
+ use text_splitter::{MarkdownSplitter, TextSplitter};
9
+
10
+ use super::builder::{build_chunk_config, build_chunks};
11
+ use super::config::{ChunkerType, ChunkingConfig, ChunkingResult};
12
+ use super::validation::validate_utf8_boundaries;
13
+
14
+ /// Split text into chunks with optional page boundary tracking.
15
+ ///
16
+ /// This is the primary API function for chunking text. It supports both plain text
17
+ /// and Markdown with configurable chunk size, overlap, and page boundary mapping.
18
+ ///
19
+ /// # Arguments
20
+ ///
21
+ /// * `text` - The text to split into chunks
22
+ /// * `config` - Chunking configuration (max size, overlap, type)
23
+ /// * `page_boundaries` - Optional page boundary markers for mapping chunks to pages
24
+ ///
25
+ /// # Returns
26
+ ///
27
+ /// A ChunkingResult containing all chunks and their metadata.
28
+ ///
29
+ /// # Examples
30
+ ///
31
+ /// ```rust
32
+ /// use kreuzberg::chunking::{chunk_text, ChunkingConfig, ChunkerType};
33
+ ///
34
+ /// # fn example() -> kreuzberg::Result<()> {
35
+ /// let config = ChunkingConfig {
36
+ /// max_characters: 500,
37
+ /// overlap: 50,
38
+ /// trim: true,
39
+ /// chunker_type: ChunkerType::Text,
40
+ /// };
41
+ /// let result = chunk_text("Long text...", &config, None)?;
42
+ /// assert!(!result.chunks.is_empty());
43
+ /// # Ok(())
44
+ /// # }
45
+ /// ```
46
+ pub fn chunk_text(
47
+ text: &str,
48
+ config: &ChunkingConfig,
49
+ page_boundaries: Option<&[PageBoundary]>,
50
+ ) -> Result<ChunkingResult> {
51
+ if text.is_empty() {
52
+ return Ok(ChunkingResult {
53
+ chunks: vec![],
54
+ chunk_count: 0,
55
+ });
56
+ }
57
+
58
+ if let Some(boundaries) = page_boundaries {
59
+ validate_utf8_boundaries(text, boundaries)?;
60
+ }
61
+
62
+ let chunk_config = build_chunk_config(config.max_characters, config.overlap, config.trim)?;
63
+
64
+ let text_chunks: Vec<&str> = match config.chunker_type {
65
+ ChunkerType::Text => {
66
+ let splitter = TextSplitter::new(chunk_config);
67
+ splitter.chunks(text).collect()
68
+ }
69
+ ChunkerType::Markdown => {
70
+ let splitter = MarkdownSplitter::new(chunk_config);
71
+ splitter.chunks(text).collect()
72
+ }
73
+ };
74
+
75
+ let chunks = build_chunks(text_chunks.into_iter(), config.overlap, page_boundaries)?;
76
+ let chunk_count = chunks.len();
77
+
78
+ Ok(ChunkingResult { chunks, chunk_count })
79
+ }
80
+
81
+ /// Chunk text with explicit type specification.
82
+ ///
83
+ /// This is a convenience function that constructs a ChunkingConfig from individual
84
+ /// parameters and calls `chunk_text`.
85
+ ///
86
+ /// # Arguments
87
+ ///
88
+ /// * `text` - The text to split into chunks
89
+ /// * `max_characters` - Maximum characters per chunk
90
+ /// * `overlap` - Character overlap between consecutive chunks
91
+ /// * `trim` - Whether to trim whitespace from boundaries
92
+ /// * `chunker_type` - Type of chunker to use (Text or Markdown)
93
+ ///
94
+ /// # Returns
95
+ ///
96
+ /// A ChunkingResult containing all chunks and their metadata.
97
+ ///
98
+ /// # Examples
99
+ ///
100
+ /// ```rust
101
+ /// use kreuzberg::chunking::{chunk_text_with_type, ChunkerType};
102
+ ///
103
+ /// # fn example() -> kreuzberg::Result<()> {
104
+ /// let result = chunk_text_with_type("Some text", 500, 50, true, ChunkerType::Text)?;
105
+ /// assert!(!result.chunks.is_empty());
106
+ /// # Ok(())
107
+ /// # }
108
+ /// ```
109
+ pub fn chunk_text_with_type(
110
+ text: &str,
111
+ max_characters: usize,
112
+ overlap: usize,
113
+ trim: bool,
114
+ chunker_type: ChunkerType,
115
+ ) -> Result<ChunkingResult> {
116
+ let config = ChunkingConfig {
117
+ max_characters,
118
+ overlap,
119
+ trim,
120
+ chunker_type,
121
+ };
122
+ chunk_text(text, &config, None)
123
+ }
124
+
125
+ /// Batch process multiple texts with the same configuration.
126
+ ///
127
+ /// This convenience function applies the same chunking configuration to multiple
128
+ /// texts in sequence.
129
+ ///
130
+ /// # Arguments
131
+ ///
132
+ /// * `texts` - Slice of text strings to chunk
133
+ /// * `config` - Chunking configuration to apply to all texts
134
+ ///
135
+ /// # Returns
136
+ ///
137
+ /// A vector of ChunkingResult objects, one per input text.
138
+ ///
139
+ /// # Errors
140
+ ///
141
+ /// Returns an error if chunking any individual text fails.
142
+ ///
143
+ /// # Examples
144
+ ///
145
+ /// ```rust
146
+ /// use kreuzberg::chunking::{chunk_texts_batch, ChunkingConfig};
147
+ ///
148
+ /// # fn example() -> kreuzberg::Result<()> {
149
+ /// let config = ChunkingConfig::default();
150
+ /// let texts = vec!["First text", "Second text"];
151
+ /// let results = chunk_texts_batch(&texts, &config)?;
152
+ /// assert_eq!(results.len(), 2);
153
+ /// # Ok(())
154
+ /// # }
155
+ /// ```
156
+ pub fn chunk_texts_batch(texts: &[&str], config: &ChunkingConfig) -> Result<Vec<ChunkingResult>> {
157
+ texts.iter().map(|text| chunk_text(text, config, None)).collect()
158
+ }
159
+
160
+ #[cfg(test)]
161
+ mod tests {
162
+ use super::*;
163
+ use crate::KreuzbergError;
164
+
165
// Empty input yields no chunks and a zero count.
#[test]
fn test_chunk_empty_text() {
    let outcome = chunk_text("", &ChunkingConfig::default(), None).unwrap();
    assert_eq!(outcome.chunks.len(), 0);
    assert_eq!(outcome.chunk_count, 0);
}
172
+
173
// Input well below the size limit comes back as exactly one, unchanged chunk.
#[test]
fn test_chunk_short_text_single_chunk() {
    let input = "This is a short text.";
    let settings = ChunkingConfig {
        chunker_type: ChunkerType::Text,
        trim: true,
        overlap: 10,
        max_characters: 100,
    };

    let outcome = chunk_text(input, &settings, None).unwrap();

    assert_eq!(outcome.chunks.len(), 1);
    assert_eq!(outcome.chunk_count, 1);
    assert_eq!(outcome.chunks[0].content, input);
}
187
+
188
// Input longer than the limit is split, and no chunk exceeds the limit.
#[test]
fn test_chunk_long_text_multiple_chunks() {
    let settings = ChunkingConfig {
        chunker_type: ChunkerType::Text,
        trim: true,
        overlap: 5,
        max_characters: 20,
    };

    let outcome = chunk_text("0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ", &settings, None).unwrap();

    assert!(outcome.chunk_count >= 2);
    assert_eq!(outcome.chunks.len(), outcome.chunk_count);
    let oversized = outcome.chunks.iter().any(|piece| piece.content.len() > 20);
    assert!(!oversized);
}
202
+
203
// The second chunk must begin with the last five bytes of the first chunk.
#[test]
fn test_chunk_text_with_overlap() {
    let settings = ChunkingConfig {
        chunker_type: ChunkerType::Text,
        trim: true,
        overlap: 5,
        max_characters: 20,
    };

    let outcome = chunk_text("abcdefghijklmnopqrstuvwxyz0123456789", &settings, None).unwrap();
    assert!(outcome.chunk_count >= 2);

    if let [first, second, ..] = outcome.chunks.as_slice() {
        // Input is ASCII, so slicing by byte offset lands on a char boundary.
        let tail_start = first.content.len().saturating_sub(5);
        let tail = &first.content[tail_start..];
        assert!(
            second.content.starts_with(tail),
            "Expected overlap '{}' at start of second chunk '{}'",
            tail,
            second.content
        );
    }
}
225
+
226
// The Markdown chunker keeps the top-level heading intact in some chunk.
#[test]
fn test_chunk_markdown_preserves_structure() {
    let document = "# Title\n\nParagraph one.\n\n## Section\n\nParagraph two.";
    let settings = ChunkingConfig {
        chunker_type: ChunkerType::Markdown,
        trim: true,
        overlap: 10,
        max_characters: 50,
    };

    let outcome = chunk_text(document, &settings, None).unwrap();

    assert!(outcome.chunk_count >= 1);
    let heading_found = outcome.chunks.iter().any(|piece| piece.content.contains("# Title"));
    assert!(heading_found);
}
239
+
240
// A fenced code block survives Markdown chunking: some chunk still has a fence.
#[test]
fn test_chunk_markdown_with_code_blocks() {
    let document = "# Code Example\n\n```python\nprint('hello')\n```\n\nSome text after code.";
    let settings = ChunkingConfig {
        chunker_type: ChunkerType::Markdown,
        trim: true,
        overlap: 10,
        max_characters: 100,
    };

    let outcome = chunk_text(document, &settings, None).unwrap();

    assert!(outcome.chunk_count >= 1);
    let fence_found = outcome.chunks.iter().any(|piece| piece.content.contains("```"));
    assert!(fence_found);
}
253
+
254
// A short Markdown line with an inline link stays in a single chunk.
#[test]
fn test_chunk_markdown_with_links() {
    let document = "Check out [this link](https://example.com) for more info.";
    let settings = ChunkingConfig {
        chunker_type: ChunkerType::Markdown,
        trim: true,
        overlap: 10,
        max_characters: 80,
    };

    let outcome = chunk_text(document, &settings, None).unwrap();

    assert_eq!(outcome.chunk_count, 1);
    let only_chunk = &outcome.chunks[0];
    assert!(only_chunk.content.contains("[this link]"));
}
267
+
268
// With trimming enabled no chunk may start with a space.
#[test]
fn test_chunk_text_with_trim() {
    let padded = " Leading and trailing spaces should be trimmed ";
    let settings = ChunkingConfig {
        chunker_type: ChunkerType::Text,
        trim: true,
        overlap: 5,
        max_characters: 30,
    };

    let outcome = chunk_text(padded, &settings, None).unwrap();

    assert!(outcome.chunk_count >= 1);
    let leading_space = outcome.chunks.iter().any(|piece| piece.content.starts_with(' '));
    assert!(!leading_space);
}
281
+
282
// With trimming disabled the single chunk either keeps its leading space or
// is shorter than the input.
#[test]
fn test_chunk_text_without_trim() {
    let padded = " Text with spaces ";
    let settings = ChunkingConfig {
        chunker_type: ChunkerType::Text,
        trim: false,
        overlap: 5,
        max_characters: 30,
    };

    let outcome = chunk_text(padded, &settings, None).unwrap();

    assert_eq!(outcome.chunk_count, 1);
    let body = &outcome.chunks[0].content;
    let keeps_leading_space = body.starts_with(' ');
    let is_shorter = body.len() < padded.len();
    assert!(keeps_leading_space || is_shorter);
}
295
+
296
// An overlap (20) larger than the chunk size (10) is rejected with a
// validation error.
#[test]
fn test_chunk_with_invalid_overlap() {
    let settings = ChunkingConfig {
        chunker_type: ChunkerType::Text,
        trim: true,
        overlap: 20,
        max_characters: 10,
    };

    let outcome = chunk_text("Some text", &settings, None);

    assert!(outcome.is_err());
    let failure = outcome.unwrap_err();
    let is_validation = matches!(failure, KreuzbergError::Validation { .. });
    assert!(is_validation);
}
309
+
310
// The convenience wrapper with the Text chunker returns short input unchanged.
#[test]
fn test_chunk_text_with_type_text() {
    let input = "Simple text";
    let outcome = chunk_text_with_type(input, 50, 10, true, ChunkerType::Text).unwrap();
    assert_eq!(outcome.chunk_count, 1);
    assert_eq!(outcome.chunks[0].content, "Simple text");
}
316
+
317
// The convenience wrapper with the Markdown chunker keeps the heading.
#[test]
fn test_chunk_text_with_type_markdown() {
    let outcome = chunk_text_with_type("# Header\n\nContent here.", 50, 10, true, ChunkerType::Markdown).unwrap();
    assert_eq!(outcome.chunk_count, 1);
    let only_chunk = &outcome.chunks[0];
    assert!(only_chunk.content.contains("# Header"));
}
324
+
325
// A batch with no texts produces no results.
#[test]
fn test_chunk_texts_batch_empty() {
    let no_texts: Vec<&str> = Vec::new();
    let outcomes = chunk_texts_batch(&no_texts, &ChunkingConfig::default()).unwrap();
    assert_eq!(outcomes.len(), 0);
}
332
+
333
// Every text in a batch gets its own result with at least one chunk.
#[test]
fn test_chunk_texts_batch_multiple() {
    let settings = ChunkingConfig {
        chunker_type: ChunkerType::Text,
        trim: true,
        overlap: 5,
        max_characters: 30,
    };
    let inputs = vec!["First text", "Second text", "Third text"];

    let outcomes = chunk_texts_batch(&inputs, &settings).unwrap();

    assert_eq!(outcomes.len(), 3);
    let any_empty = outcomes.iter().any(|entry| entry.chunk_count < 1);
    assert!(!any_empty);
}
346
+
347
// A batch mixing short, long and empty texts yields one, several and zero
// chunks respectively.
#[test]
fn test_chunk_texts_batch_mixed_lengths() {
    let settings = ChunkingConfig {
        chunker_type: ChunkerType::Text,
        trim: true,
        overlap: 5,
        max_characters: 20,
    };
    let inputs = [
        "Short",
        "This is a longer text that should be split into multiple chunks",
        "",
    ];

    let outcomes = chunk_texts_batch(&inputs, &settings).unwrap();

    assert_eq!(outcomes.len(), 3);
    let (short, long, empty) = (&outcomes[0], &outcomes[1], &outcomes[2]);
    assert_eq!(short.chunk_count, 1);
    assert!(long.chunk_count > 1);
    assert_eq!(empty.chunk_count, 0);
}
366
+
367
/// A config whose overlap (20) is larger than `max_characters` (10) must make
/// the whole batch call return an error.
#[test]
fn test_chunk_texts_batch_error_propagation() {
    let texts = vec!["Text one", "Text two"];
    let bad_config = ChunkingConfig {
        chunker_type: ChunkerType::Text,
        max_characters: 10,
        overlap: 20,
        trim: true,
    };

    let outcome = chunk_texts_batch(&texts, &bad_config);

    assert!(outcome.is_err());
}
379
+
380
/// Pins the default chunking configuration: 2000 characters, overlap of 100,
/// trimming enabled, text chunker.
#[test]
fn test_chunking_config_default() {
    let defaults = ChunkingConfig::default();

    assert_eq!(defaults.max_characters, 2000);
    assert_eq!(defaults.overlap, 100);
    assert!(defaults.trim);
    assert_eq!(defaults.chunker_type, ChunkerType::Text);
}
388
+
389
/// A 1000-character run of `a` chunked at 100 characters must give at least
/// ten chunks, none longer than 100 bytes.
#[test]
fn test_chunk_very_long_text() {
    let long_input = "a".repeat(1000);
    let config = ChunkingConfig {
        chunker_type: ChunkerType::Text,
        max_characters: 100,
        overlap: 20,
        trim: true,
    };

    let chunked = chunk_text(&long_input, &config, None).unwrap();

    assert!(chunked.chunk_count >= 10);
    let all_within_limit = chunked.chunks.iter().all(|piece| piece.content.len() <= 100);
    assert!(all_within_limit);
}
402
+
403
/// Text made of newline-separated lines chunks successfully into at least
/// one chunk.
#[test]
fn test_chunk_text_with_newlines() {
    let multiline = "Line one\nLine two\nLine three\nLine four\nLine five";
    let config = ChunkingConfig {
        chunker_type: ChunkerType::Text,
        max_characters: 30,
        overlap: 5,
        trim: true,
    };

    let chunked = chunk_text(multiline, &config, None).unwrap();

    assert!(chunked.chunk_count >= 1);
}
415
+
416
/// Markdown containing a bullet list chunks into at least one chunk, and some
/// chunk still contains a `- Item` list entry.
#[test]
fn test_chunk_markdown_with_lists() {
    let document = "# List Example\n\n- Item 1\n- Item 2\n- Item 3\n\nMore text.";
    let config = ChunkingConfig {
        chunker_type: ChunkerType::Markdown,
        max_characters: 100,
        overlap: 10,
        trim: true,
    };

    let chunked = chunk_text(document, &config, None).unwrap();

    assert!(chunked.chunk_count >= 1);
    let list_item_survives = chunked.chunks.iter().any(|piece| piece.content.contains("- Item"));
    assert!(list_item_survives);
}
429
+
430
/// Markdown containing a pipe table chunks into at least one chunk, and some
/// chunk still contains a `|` character.
#[test]
fn test_chunk_markdown_with_tables() {
    let document = "# Table\n\n| Col1 | Col2 |\n|------|------|\n| A | B |\n| C | D |";
    let config = ChunkingConfig {
        chunker_type: ChunkerType::Markdown,
        max_characters: 150,
        overlap: 10,
        trim: true,
    };

    let chunked = chunk_text(document, &config, None).unwrap();

    assert!(chunked.chunk_count >= 1);
    let pipe_survives = chunked.chunks.iter().any(|piece| piece.content.contains("|"));
    assert!(pipe_survives);
}
443
+
444
/// ASCII punctuation is carried through chunking: the input fits in a single
/// chunk and that chunk still contains `@#$%`.
#[test]
fn test_chunk_special_characters() {
    let punctuation = "Special chars: @#$%^&*()[]{}|\\<>?/~`";
    let config = ChunkingConfig {
        chunker_type: ChunkerType::Text,
        max_characters: 50,
        overlap: 5,
        trim: true,
    };

    let chunked = chunk_text(punctuation, &config, None).unwrap();

    assert_eq!(chunked.chunk_count, 1);
    let only_chunk = &chunked.chunks[0];
    assert!(only_chunk.content.contains("@#$%"));
}
457
+
458
/// Multi-byte input (CJK, emoji, accented Latin) fits in a single chunk and
/// the CJK and emoji substrings are still present in it.
#[test]
fn test_chunk_unicode_characters() {
    let mixed_script = "Unicode: 你好世界 🌍 café résumé";
    let config = ChunkingConfig {
        chunker_type: ChunkerType::Text,
        max_characters: 50,
        overlap: 5,
        trim: true,
    };

    let chunked = chunk_text(mixed_script, &config, None).unwrap();

    assert_eq!(chunked.chunk_count, 1);
    let only_chunk = &chunked.chunks[0];
    assert!(only_chunk.content.contains("你好"));
    assert!(only_chunk.content.contains("🌍"));
}
472
+
473
/// Japanese text chunks without error and produces at least one chunk.
#[test]
fn test_chunk_cjk_text() {
    let japanese = "日本語のテキストです。これは長い文章で、複数のチャンクに分割されるべきです。";
    let config = ChunkingConfig {
        chunker_type: ChunkerType::Text,
        max_characters: 30,
        overlap: 5,
        trim: true,
    };

    let chunked = chunk_text(japanese, &config, None).unwrap();

    assert!(chunked.chunk_count >= 1);
}
485
+
486
/// Text mixing English, Chinese and French chunks without error and produces
/// at least one chunk.
#[test]
fn test_chunk_mixed_languages() {
    let multilingual = "English text mixed with 中文文本 and some français";
    let config = ChunkingConfig {
        chunker_type: ChunkerType::Text,
        max_characters: 40,
        overlap: 5,
        trim: true,
    };

    let chunked = chunk_text(multilingual, &config, None).unwrap();

    assert!(chunked.chunk_count >= 1);
}
498
+
499
/// With a non-zero overlap, checks per-chunk metadata (byte range length
/// equals content length, index and total are consistent) and that each
/// chunk starts before its predecessor ends, by at most `overlap + 10` bytes.
#[test]
fn test_chunk_offset_calculation_with_overlap() {
    let text = "AAAAA BBBBB CCCCC DDDDD EEEEE FFFFF";
    let config = ChunkingConfig {
        chunker_type: ChunkerType::Text,
        max_characters: 20,
        overlap: 5,
        trim: false,
    };
    let result = chunk_text(text, &config, None).unwrap();
    let chunk_total = result.chunks.len();

    assert!(chunk_total >= 2, "Expected at least 2 chunks");

    // Per-chunk invariants.
    for (i, chunk) in result.chunks.iter().enumerate() {
        let meta = &chunk.metadata;

        assert_eq!(
            meta.byte_end - meta.byte_start,
            chunk.content.len(),
            "Chunk {} offset range doesn't match content length",
            i
        );
        assert_eq!(meta.chunk_index, i);
        assert_eq!(meta.total_chunks, chunk_total);
    }

    // Pairwise invariants between neighbouring chunks.
    for (i, pair) in result.chunks.windows(2).enumerate() {
        let prev_end = pair[0].metadata.byte_end;
        let next_start = pair[1].metadata.byte_start;

        assert!(
            next_start < prev_end,
            "Chunk {} and {} don't overlap: next starts at {} but current ends at {}",
            i,
            i + 1,
            next_start,
            prev_end
        );

        let overlap_size = prev_end - next_start;
        assert!(
            overlap_size <= config.overlap + 10,
            "Overlap between chunks {} and {} is too large: {}",
            i,
            i + 1,
            overlap_size
        );
    }
}
550
+
551
/// With `overlap: 0`, every chunk must start at or after the byte where the
/// previous chunk ended.
///
/// The 35-byte input exceeds `max_characters` (20), so the test first asserts
/// that a split actually happened; otherwise the pairwise check below would
/// pass without comparing anything.
#[test]
fn test_chunk_offset_calculation_without_overlap() {
    let config = ChunkingConfig {
        max_characters: 20,
        overlap: 0,
        trim: false,
        chunker_type: ChunkerType::Text,
    };
    let text = "AAAAA BBBBB CCCCC DDDDD EEEEE FFFFF";
    let result = chunk_text(text, &config, None).unwrap();

    assert!(result.chunks.len() >= 2, "Expected at least 2 chunks");

    // `windows(2)` visits each adjacent pair and is simply empty for fewer
    // than two chunks, avoiding the `len() - 1` underflow of an index loop.
    for (i, pair) in result.chunks.windows(2).enumerate() {
        let current_chunk = &pair[0];
        let next_chunk = &pair[1];

        assert!(
            next_chunk.metadata.byte_start >= current_chunk.metadata.byte_end,
            "Chunk {} and {} overlap when they shouldn't: next starts at {} but current ends at {}",
            i,
            i + 1,
            next_chunk.metadata.byte_start,
            current_chunk.metadata.byte_end
        );
    }
}
576
+
577
/// The first chunk must start at byte 0, and no chunk may start after the
/// previous chunk's end (no gaps between neighbouring chunks).
#[test]
fn test_chunk_offset_covers_full_text() {
    let text = "0123456789 ABCDEFGHIJ KLMNOPQRST UVWXYZ";
    let config = ChunkingConfig {
        chunker_type: ChunkerType::Text,
        max_characters: 15,
        overlap: 3,
        trim: false,
    };
    let result = chunk_text(text, &config, None).unwrap();

    assert!(result.chunks.len() >= 2, "Expected multiple chunks");

    let first_start = result.chunks[0].metadata.byte_start;
    assert_eq!(first_start, 0, "First chunk should start at position 0");

    for (i, pair) in result.chunks.windows(2).enumerate() {
        let prev_end = pair[0].metadata.byte_end;
        let next_start = pair[1].metadata.byte_start;

        assert!(
            next_start <= prev_end,
            "Gap detected between chunk {} (ends at {}) and chunk {} (starts at {})",
            i,
            prev_end,
            i + 1,
            next_start
        );
    }
}
609
+
610
/// For overlap values 0, 5, 10 and 20, every chunk must have a non-empty byte
/// range (`byte_end > byte_start`) and a `byte_start` inside the input.
///
/// Both checks run in a single pass over the chunks, and the out-of-bounds
/// message names `byte_start`, the field that is actually checked.
#[test]
fn test_chunk_offset_with_various_overlap_sizes() {
    for overlap in [0, 5, 10, 20] {
        let config = ChunkingConfig {
            max_characters: 30,
            overlap,
            trim: false,
            chunker_type: ChunkerType::Text,
        };
        let text = "Word ".repeat(30);
        let result = chunk_text(&text, &config, None).unwrap();

        for chunk in &result.chunks {
            let meta = &chunk.metadata;

            assert!(
                meta.byte_end > meta.byte_start,
                "Invalid offset range for overlap {}: start={}, end={}",
                overlap,
                meta.byte_start,
                meta.byte_end
            );

            assert!(
                meta.byte_start < text.len(),
                "byte_start with overlap {} is out of bounds: {}",
                overlap,
                meta.byte_start
            );
        }
    }
}
642
+
643
/// The final chunk must overlap the one before it and must either equal the
/// whole text (ignoring trailing whitespace) or end within five bytes of the
/// end of the input.
#[test]
fn test_chunk_last_chunk_offset() {
    let text = "AAAAA BBBBB CCCCC DDDDD EEEEE";
    let config = ChunkingConfig {
        chunker_type: ChunkerType::Text,
        max_characters: 20,
        overlap: 5,
        trim: false,
    };
    let result = chunk_text(text, &config, None).unwrap();
    let chunk_total = result.chunks.len();

    assert!(chunk_total >= 2, "Need multiple chunks for this test");

    let final_chunk = result.chunks.last().unwrap();
    let penultimate = &result.chunks[chunk_total - 2];

    assert!(
        final_chunk.metadata.byte_start < penultimate.metadata.byte_end,
        "Last chunk should overlap with previous chunk"
    );

    let text_end = text.len();
    let equals_whole_text = final_chunk.content.trim_end() == text.trim_end();
    let last_chunk_covers_end = equals_whole_text || final_chunk.metadata.byte_end >= text_end - 5;
    assert!(last_chunk_covers_end, "Last chunk should cover the end of the text");
}
669
+
670
/// With two page boundaries supplied, the first chunk must report page 1 as
/// its first page and the last chunk must report page 2 as its last page.
#[test]
fn test_chunk_with_page_boundaries() {
    let text = "Page one content here. Page two starts here and continues.";
    let config = ChunkingConfig {
        chunker_type: ChunkerType::Text,
        max_characters: 30,
        overlap: 5,
        trim: true,
    };

    let page_one = PageBoundary {
        page_number: 1,
        byte_start: 0,
        byte_end: 21,
    };
    let page_two = PageBoundary {
        page_number: 2,
        byte_start: 22,
        byte_end: 58,
    };
    let boundaries = vec![page_one, page_two];

    let result = chunk_text(text, &config, Some(&boundaries)).unwrap();
    assert!(result.chunks.len() >= 2);

    let opening_chunk = &result.chunks[0];
    assert_eq!(opening_chunk.metadata.first_page, Some(1));

    let closing_chunk = result.chunks.last().unwrap();
    assert_eq!(closing_chunk.metadata.last_page, Some(2));
}
701
+
702
/// When no page boundaries are passed, every chunk's `first_page` and
/// `last_page` must be `None`.
#[test]
fn test_chunk_without_page_boundaries() {
    let text = "This is some test content that should be split into multiple chunks.";
    let config = ChunkingConfig {
        chunker_type: ChunkerType::Text,
        max_characters: 30,
        overlap: 5,
        trim: true,
    };

    let result = chunk_text(text, &config, None).unwrap();
    assert!(result.chunks.len() >= 2);

    for piece in result.chunks.iter() {
        let meta = &piece.metadata;
        assert_eq!(meta.first_page, None);
        assert_eq!(meta.last_page, None);
    }
}
720
+
721
/// Passing an empty boundary list behaves like passing none: the single
/// resulting chunk has no page information.
#[test]
fn test_chunk_empty_boundaries() {
    let text = "Some text content here.";
    let no_pages: Vec<PageBoundary> = Vec::new();
    let config = ChunkingConfig {
        chunker_type: ChunkerType::Text,
        max_characters: 30,
        overlap: 5,
        trim: true,
    };

    let result = chunk_text(text, &config, Some(&no_pages)).unwrap();
    assert_eq!(result.chunks.len(), 1);

    let only_meta = &result.chunks[0].metadata;
    assert_eq!(only_meta.first_page, None);
    assert_eq!(only_meta.last_page, None);
}
738
+
739
/// With three contiguous page boundaries covering the input, every chunk
/// must carry at least one of `first_page` / `last_page`.
#[test]
fn test_chunk_spanning_multiple_pages() {
    let text = "0123456789 AAAAAAAAAA 1111111111 BBBBBBBBBB 2222222222";
    let config = ChunkingConfig {
        chunker_type: ChunkerType::Text,
        max_characters: 50,
        overlap: 5,
        trim: false,
    };

    // Small local constructor to keep the boundary table readable.
    let page = |page_number, byte_start, byte_end| PageBoundary {
        byte_start,
        byte_end,
        page_number,
    };
    let boundaries = vec![page(1, 0, 20), page(2, 20, 40), page(3, 40, 54)];

    let result = chunk_text(text, &config, Some(&boundaries)).unwrap();
    assert!(result.chunks.len() >= 2);

    for piece in result.chunks.iter() {
        let meta = &piece.metadata;
        assert!(meta.first_page.is_some() || meta.last_page.is_some());
    }
}
774
+
775
/// A boundary whose `byte_start` (10) is after its `byte_end` (5) must be
/// rejected with an error mentioning "Invalid boundary range" and
/// "byte_start".
#[test]
fn test_chunk_text_with_invalid_boundary_range() {
    let text = "Page one content here. Page two content.";
    let config = ChunkingConfig {
        chunker_type: ChunkerType::Text,
        max_characters: 30,
        overlap: 5,
        trim: true,
    };

    let inverted = PageBoundary {
        page_number: 1,
        byte_start: 10,
        byte_end: 5,
    };
    let boundaries = vec![inverted];

    let outcome = chunk_text(text, &config, Some(&boundaries));
    assert!(outcome.is_err());

    let message = outcome.unwrap_err().to_string();
    assert!(message.contains("Invalid boundary range"));
    assert!(message.contains("byte_start"));
}
797
+
798
/// Boundaries supplied out of order (page 2 before page 1) must be rejected
/// with an error mentioning "not sorted" and "boundaries".
#[test]
fn test_chunk_text_with_unsorted_boundaries() {
    let text = "Page one content here. Page two content.";
    let config = ChunkingConfig {
        chunker_type: ChunkerType::Text,
        max_characters: 30,
        overlap: 5,
        trim: true,
    };

    let later_page = PageBoundary {
        page_number: 2,
        byte_start: 22,
        byte_end: 40,
    };
    let earlier_page = PageBoundary {
        page_number: 1,
        byte_start: 0,
        byte_end: 21,
    };
    // Deliberately listed in the wrong order.
    let boundaries = vec![later_page, earlier_page];

    let outcome = chunk_text(text, &config, Some(&boundaries));
    assert!(outcome.is_err());

    let message = outcome.unwrap_err().to_string();
    assert!(message.contains("not sorted"));
    assert!(message.contains("boundaries"));
}
827
+
828
/// Two boundaries whose byte ranges intersect (0..25 and 20..40) must be
/// rejected with an error mentioning "Overlapping" and "boundaries".
#[test]
fn test_chunk_text_with_overlapping_boundaries() {
    let text = "Page one content here. Page two content.";
    let config = ChunkingConfig {
        chunker_type: ChunkerType::Text,
        max_characters: 30,
        overlap: 5,
        trim: true,
    };

    let first_range = PageBoundary {
        page_number: 1,
        byte_start: 0,
        byte_end: 25,
    };
    let intersecting_range = PageBoundary {
        page_number: 2,
        byte_start: 20,
        byte_end: 40,
    };
    let boundaries = vec![first_range, intersecting_range];

    let outcome = chunk_text(text, &config, Some(&boundaries));
    assert!(outcome.is_err());

    let message = outcome.unwrap_err().to_string();
    assert!(message.contains("Overlapping"));
    assert!(message.contains("boundaries"));
}
857
+
858
/// Chunking a three-page text with contiguous boundaries must produce chunks,
/// and the first chunk must carry a `first_page`.
///
/// The 60-byte input is non-empty, so the chunk list is asserted to be
/// non-empty instead of being guarded by an `if`; an empty result now fails
/// the test rather than silently skipping the only page assertion.
#[test]
fn test_chunk_with_pages_basic() {
    let config = ChunkingConfig {
        max_characters: 25,
        overlap: 5,
        trim: true,
        chunker_type: ChunkerType::Text,
    };
    let text = "First page content here.Second page content here.Third page.";

    let boundaries = vec![
        PageBoundary {
            byte_start: 0,
            byte_end: 24,
            page_number: 1,
        },
        PageBoundary {
            byte_start: 24,
            byte_end: 50,
            page_number: 2,
        },
        PageBoundary {
            byte_start: 50,
            byte_end: 60,
            page_number: 3,
        },
    ];

    let result = chunk_text(text, &config, Some(&boundaries)).unwrap();

    assert!(
        !result.chunks.is_empty(),
        "Expected at least one chunk for non-empty text"
    );
    assert!(result.chunks[0].metadata.first_page.is_some());
}
892
+
893
/// Text that fits in one chunk on a single page must report that page as
/// both its first and last page.
#[test]
fn test_chunk_with_pages_single_page_chunk() {
    let text = "All content on single page fits in one chunk.";
    let config = ChunkingConfig {
        chunker_type: ChunkerType::Text,
        max_characters: 100,
        overlap: 10,
        trim: true,
    };

    let whole_page = PageBoundary {
        page_number: 1,
        byte_start: 0,
        byte_end: 45,
    };
    let boundaries = vec![whole_page];

    let result = chunk_text(text, &config, Some(&boundaries)).unwrap();
    assert_eq!(result.chunks.len(), 1);

    let only_meta = &result.chunks[0].metadata;
    assert_eq!(only_meta.first_page, Some(1));
    assert_eq!(only_meta.last_page, Some(1));
}
914
+
915
/// With zero overlap and two pages, chunks are produced and, wherever a chunk
/// reports both page numbers, the first is not greater than the last.
#[test]
fn test_chunk_with_pages_no_overlap() {
    let text = "AAAAA BBBBB CCCCC DDDDD";
    let config = ChunkingConfig {
        chunker_type: ChunkerType::Text,
        max_characters: 20,
        overlap: 0,
        trim: false,
    };

    let page_one = PageBoundary {
        page_number: 1,
        byte_start: 0,
        byte_end: 11,
    };
    let page_two = PageBoundary {
        page_number: 2,
        byte_start: 11,
        byte_end: 23,
    };
    let boundaries = vec![page_one, page_two];

    let result = chunk_text(text, &config, Some(&boundaries)).unwrap();
    assert!(!result.chunks.is_empty());

    for piece in result.chunks.iter() {
        // Only chunks carrying both ends of the page range are checked.
        match (piece.metadata.first_page, piece.metadata.last_page) {
            (Some(first), Some(last)) => assert!(first <= last),
            _ => {}
        }
    }
}
947
+
948
/// With page boundaries supplied, each chunk's byte range length must still
/// equal the byte length of its content.
#[test]
fn test_chunk_metadata_page_range_accuracy() {
    let text = "Page One Content Here.Page Two.";
    let config = ChunkingConfig {
        chunker_type: ChunkerType::Text,
        max_characters: 30,
        overlap: 5,
        trim: true,
    };

    let page_one = PageBoundary {
        page_number: 1,
        byte_start: 0,
        byte_end: 21,
    };
    let page_two = PageBoundary {
        page_number: 2,
        byte_start: 21,
        byte_end: 31,
    };
    let boundaries = vec![page_one, page_two];

    let result = chunk_text(text, &config, Some(&boundaries)).unwrap();

    for piece in result.chunks.iter() {
        let span = piece.metadata.byte_end - piece.metadata.byte_start;
        assert_eq!(span, piece.content.len());
    }
}
977
+
978
/// Two 10-byte pages meet at byte 10. A chunk that starts before byte 10 and
/// ends after it must report pages 1..=2; a chunk only before it must start
/// on page 1; a chunk only after it must start on page 2.
#[test]
fn test_chunk_page_range_boundary_edge_cases() {
    let text = "0123456789ABCDEFGHIJ";
    let config = ChunkingConfig {
        chunker_type: ChunkerType::Text,
        max_characters: 10,
        overlap: 2,
        trim: false,
    };

    let page_one = PageBoundary {
        page_number: 1,
        byte_start: 0,
        byte_end: 10,
    };
    let page_two = PageBoundary {
        page_number: 2,
        byte_start: 10,
        byte_end: 20,
    };
    let boundaries = vec![page_one, page_two];

    let result = chunk_text(text, &config, Some(&boundaries)).unwrap();

    for piece in result.chunks.iter() {
        let meta = &piece.metadata;
        let starts_before_split = meta.byte_start < 10;
        let ends_after_split = meta.byte_end > 10;

        match (starts_before_split, ends_after_split) {
            (true, true) => {
                assert_eq!(meta.first_page, Some(1));
                assert_eq!(meta.last_page, Some(2));
            }
            (true, false) => assert_eq!(meta.first_page, Some(1)),
            (false, true) => assert_eq!(meta.first_page, Some(2)),
            (false, false) => {}
        }
    }
}
1017
+ }