kreuzberg 4.0.0.pre.rc.29 → 4.0.0.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (321) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +0 -6
  3. data/.rubocop.yaml +534 -1
  4. data/Gemfile +2 -1
  5. data/Gemfile.lock +28 -116
  6. data/README.md +269 -629
  7. data/Rakefile +0 -9
  8. data/Steepfile +4 -8
  9. data/examples/async_patterns.rb +58 -1
  10. data/ext/kreuzberg_rb/extconf.rb +5 -35
  11. data/ext/kreuzberg_rb/native/Cargo.toml +16 -55
  12. data/ext/kreuzberg_rb/native/build.rs +14 -12
  13. data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
  14. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
  15. data/ext/kreuzberg_rb/native/include/strings.h +2 -2
  16. data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
  17. data/ext/kreuzberg_rb/native/src/lib.rs +34 -897
  18. data/extconf.rb +6 -38
  19. data/kreuzberg.gemspec +20 -114
  20. data/lib/kreuzberg/api_proxy.rb +18 -2
  21. data/lib/kreuzberg/cache_api.rb +0 -22
  22. data/lib/kreuzberg/cli.rb +10 -2
  23. data/lib/kreuzberg/cli_proxy.rb +10 -0
  24. data/lib/kreuzberg/config.rb +22 -274
  25. data/lib/kreuzberg/errors.rb +7 -73
  26. data/lib/kreuzberg/extraction_api.rb +8 -237
  27. data/lib/kreuzberg/mcp_proxy.rb +11 -2
  28. data/lib/kreuzberg/ocr_backend_protocol.rb +73 -0
  29. data/lib/kreuzberg/post_processor_protocol.rb +71 -0
  30. data/lib/kreuzberg/result.rb +33 -151
  31. data/lib/kreuzberg/setup_lib_path.rb +2 -22
  32. data/lib/kreuzberg/validator_protocol.rb +73 -0
  33. data/lib/kreuzberg/version.rb +1 -1
  34. data/lib/kreuzberg.rb +13 -27
  35. data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
  36. data/sig/kreuzberg.rbs +12 -105
  37. data/spec/binding/cache_spec.rb +22 -22
  38. data/spec/binding/cli_proxy_spec.rb +4 -2
  39. data/spec/binding/cli_spec.rb +11 -12
  40. data/spec/binding/config_spec.rb +0 -74
  41. data/spec/binding/config_validation_spec.rb +6 -100
  42. data/spec/binding/error_handling_spec.rb +97 -283
  43. data/spec/binding/plugins/ocr_backend_spec.rb +8 -8
  44. data/spec/binding/plugins/postprocessor_spec.rb +11 -11
  45. data/spec/binding/plugins/validator_spec.rb +13 -12
  46. data/spec/examples.txt +104 -0
  47. data/spec/fixtures/config.toml +1 -0
  48. data/spec/fixtures/config.yaml +1 -0
  49. data/spec/fixtures/invalid_config.toml +1 -0
  50. data/spec/smoke/package_spec.rb +3 -2
  51. data/spec/spec_helper.rb +3 -1
  52. data/vendor/kreuzberg/Cargo.toml +67 -192
  53. data/vendor/kreuzberg/README.md +9 -97
  54. data/vendor/kreuzberg/build.rs +194 -516
  55. data/vendor/kreuzberg/src/api/handlers.rs +9 -130
  56. data/vendor/kreuzberg/src/api/mod.rs +3 -18
  57. data/vendor/kreuzberg/src/api/server.rs +71 -236
  58. data/vendor/kreuzberg/src/api/types.rs +7 -43
  59. data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
  60. data/vendor/kreuzberg/src/cache/mod.rs +3 -27
  61. data/vendor/kreuzberg/src/chunking/mod.rs +79 -1705
  62. data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
  63. data/vendor/kreuzberg/src/core/config.rs +23 -905
  64. data/vendor/kreuzberg/src/core/extractor.rs +106 -403
  65. data/vendor/kreuzberg/src/core/io.rs +2 -4
  66. data/vendor/kreuzberg/src/core/mime.rs +12 -2
  67. data/vendor/kreuzberg/src/core/mod.rs +3 -22
  68. data/vendor/kreuzberg/src/core/pipeline.rs +78 -395
  69. data/vendor/kreuzberg/src/embeddings.rs +21 -169
  70. data/vendor/kreuzberg/src/error.rs +2 -2
  71. data/vendor/kreuzberg/src/extraction/archive.rs +31 -36
  72. data/vendor/kreuzberg/src/extraction/docx.rs +1 -365
  73. data/vendor/kreuzberg/src/extraction/email.rs +11 -12
  74. data/vendor/kreuzberg/src/extraction/excel.rs +129 -138
  75. data/vendor/kreuzberg/src/extraction/html.rs +170 -1447
  76. data/vendor/kreuzberg/src/extraction/image.rs +14 -138
  77. data/vendor/kreuzberg/src/extraction/libreoffice.rs +3 -13
  78. data/vendor/kreuzberg/src/extraction/mod.rs +5 -21
  79. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
  80. data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
  81. data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
  82. data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
  83. data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
  84. data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
  85. data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
  86. data/vendor/kreuzberg/src/extraction/pptx.rs +94 -196
  87. data/vendor/kreuzberg/src/extraction/structured.rs +4 -5
  88. data/vendor/kreuzberg/src/extraction/table.rs +1 -2
  89. data/vendor/kreuzberg/src/extraction/text.rs +10 -18
  90. data/vendor/kreuzberg/src/extractors/archive.rs +0 -22
  91. data/vendor/kreuzberg/src/extractors/docx.rs +148 -69
  92. data/vendor/kreuzberg/src/extractors/email.rs +9 -37
  93. data/vendor/kreuzberg/src/extractors/excel.rs +40 -81
  94. data/vendor/kreuzberg/src/extractors/html.rs +173 -182
  95. data/vendor/kreuzberg/src/extractors/image.rs +8 -32
  96. data/vendor/kreuzberg/src/extractors/mod.rs +10 -171
  97. data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
  98. data/vendor/kreuzberg/src/extractors/pdf.rs +64 -329
  99. data/vendor/kreuzberg/src/extractors/pptx.rs +34 -79
  100. data/vendor/kreuzberg/src/extractors/structured.rs +0 -16
  101. data/vendor/kreuzberg/src/extractors/text.rs +7 -30
  102. data/vendor/kreuzberg/src/extractors/xml.rs +8 -27
  103. data/vendor/kreuzberg/src/keywords/processor.rs +1 -9
  104. data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
  105. data/vendor/kreuzberg/src/language_detection/mod.rs +51 -94
  106. data/vendor/kreuzberg/src/lib.rs +5 -17
  107. data/vendor/kreuzberg/src/mcp/mod.rs +1 -4
  108. data/vendor/kreuzberg/src/mcp/server.rs +21 -145
  109. data/vendor/kreuzberg/src/ocr/mod.rs +0 -2
  110. data/vendor/kreuzberg/src/ocr/processor.rs +8 -19
  111. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +0 -2
  112. data/vendor/kreuzberg/src/pdf/error.rs +1 -93
  113. data/vendor/kreuzberg/src/pdf/metadata.rs +100 -263
  114. data/vendor/kreuzberg/src/pdf/mod.rs +2 -33
  115. data/vendor/kreuzberg/src/pdf/rendering.rs +12 -12
  116. data/vendor/kreuzberg/src/pdf/table.rs +64 -61
  117. data/vendor/kreuzberg/src/pdf/text.rs +24 -416
  118. data/vendor/kreuzberg/src/plugins/extractor.rs +8 -40
  119. data/vendor/kreuzberg/src/plugins/mod.rs +0 -3
  120. data/vendor/kreuzberg/src/plugins/ocr.rs +14 -22
  121. data/vendor/kreuzberg/src/plugins/processor.rs +1 -10
  122. data/vendor/kreuzberg/src/plugins/registry.rs +0 -15
  123. data/vendor/kreuzberg/src/plugins/validator.rs +8 -20
  124. data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
  125. data/vendor/kreuzberg/src/text/mod.rs +0 -8
  126. data/vendor/kreuzberg/src/text/quality.rs +15 -28
  127. data/vendor/kreuzberg/src/text/string_utils.rs +10 -22
  128. data/vendor/kreuzberg/src/text/token_reduction/core.rs +50 -86
  129. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +16 -37
  130. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +1 -2
  131. data/vendor/kreuzberg/src/types.rs +67 -907
  132. data/vendor/kreuzberg/src/utils/mod.rs +0 -14
  133. data/vendor/kreuzberg/src/utils/quality.rs +3 -12
  134. data/vendor/kreuzberg/tests/api_tests.rs +0 -506
  135. data/vendor/kreuzberg/tests/archive_integration.rs +0 -2
  136. data/vendor/kreuzberg/tests/batch_orchestration.rs +12 -57
  137. data/vendor/kreuzberg/tests/batch_processing.rs +8 -32
  138. data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
  139. data/vendor/kreuzberg/tests/concurrency_stress.rs +8 -40
  140. data/vendor/kreuzberg/tests/config_features.rs +1 -33
  141. data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -16
  142. data/vendor/kreuzberg/tests/core_integration.rs +9 -35
  143. data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
  144. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
  145. data/vendor/kreuzberg/tests/email_integration.rs +1 -3
  146. data/vendor/kreuzberg/tests/error_handling.rs +34 -43
  147. data/vendor/kreuzberg/tests/format_integration.rs +1 -7
  148. data/vendor/kreuzberg/tests/helpers/mod.rs +0 -60
  149. data/vendor/kreuzberg/tests/image_integration.rs +0 -2
  150. data/vendor/kreuzberg/tests/mime_detection.rs +16 -17
  151. data/vendor/kreuzberg/tests/ocr_configuration.rs +0 -4
  152. data/vendor/kreuzberg/tests/ocr_errors.rs +0 -22
  153. data/vendor/kreuzberg/tests/ocr_quality.rs +0 -2
  154. data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
  155. data/vendor/kreuzberg/tests/pdf_integration.rs +0 -2
  156. data/vendor/kreuzberg/tests/pipeline_integration.rs +2 -36
  157. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +0 -5
  158. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -17
  159. data/vendor/kreuzberg/tests/plugin_system.rs +0 -6
  160. data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -2
  161. data/vendor/kreuzberg/tests/security_validation.rs +1 -13
  162. data/vendor/kreuzberg/tests/test_fastembed.rs +23 -45
  163. metadata +25 -171
  164. data/.rubocop.yml +0 -543
  165. data/ext/kreuzberg_rb/native/.cargo/config.toml +0 -23
  166. data/ext/kreuzberg_rb/native/Cargo.lock +0 -7619
  167. data/lib/kreuzberg/error_context.rb +0 -136
  168. data/lib/kreuzberg/types.rb +0 -170
  169. data/lib/libpdfium.so +0 -0
  170. data/spec/binding/async_operations_spec.rb +0 -473
  171. data/spec/binding/batch_operations_spec.rb +0 -595
  172. data/spec/binding/batch_spec.rb +0 -359
  173. data/spec/binding/config_result_spec.rb +0 -377
  174. data/spec/binding/embeddings_spec.rb +0 -816
  175. data/spec/binding/error_recovery_spec.rb +0 -488
  176. data/spec/binding/font_config_spec.rb +0 -220
  177. data/spec/binding/images_spec.rb +0 -738
  178. data/spec/binding/keywords_extraction_spec.rb +0 -600
  179. data/spec/binding/metadata_types_spec.rb +0 -1228
  180. data/spec/binding/pages_extraction_spec.rb +0 -471
  181. data/spec/binding/tables_spec.rb +0 -641
  182. data/spec/unit/config/chunking_config_spec.rb +0 -213
  183. data/spec/unit/config/embedding_config_spec.rb +0 -343
  184. data/spec/unit/config/extraction_config_spec.rb +0 -438
  185. data/spec/unit/config/font_config_spec.rb +0 -285
  186. data/spec/unit/config/hierarchy_config_spec.rb +0 -314
  187. data/spec/unit/config/image_extraction_config_spec.rb +0 -209
  188. data/spec/unit/config/image_preprocessing_config_spec.rb +0 -249
  189. data/spec/unit/config/keyword_config_spec.rb +0 -229
  190. data/spec/unit/config/language_detection_config_spec.rb +0 -258
  191. data/spec/unit/config/ocr_config_spec.rb +0 -171
  192. data/spec/unit/config/page_config_spec.rb +0 -221
  193. data/spec/unit/config/pdf_config_spec.rb +0 -267
  194. data/spec/unit/config/postprocessor_config_spec.rb +0 -290
  195. data/spec/unit/config/tesseract_config_spec.rb +0 -181
  196. data/spec/unit/config/token_reduction_config_spec.rb +0 -251
  197. data/test/metadata_types_test.rb +0 -959
  198. data/vendor/Cargo.toml +0 -61
  199. data/vendor/kreuzberg/examples/bench_fixes.rs +0 -71
  200. data/vendor/kreuzberg/examples/test_pdfium_fork.rs +0 -62
  201. data/vendor/kreuzberg/src/chunking/processor.rs +0 -219
  202. data/vendor/kreuzberg/src/core/batch_optimizations.rs +0 -385
  203. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  204. data/vendor/kreuzberg/src/core/formats.rs +0 -235
  205. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  206. data/vendor/kreuzberg/src/extraction/capacity.rs +0 -263
  207. data/vendor/kreuzberg/src/extraction/markdown.rs +0 -216
  208. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -284
  209. data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -470
  210. data/vendor/kreuzberg/src/extractors/docbook.rs +0 -504
  211. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  212. data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -492
  213. data/vendor/kreuzberg/src/extractors/jats.rs +0 -1054
  214. data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -368
  215. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  216. data/vendor/kreuzberg/src/extractors/markdown.rs +0 -701
  217. data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
  218. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  219. data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -529
  220. data/vendor/kreuzberg/src/extractors/rst.rs +0 -577
  221. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  222. data/vendor/kreuzberg/src/extractors/security.rs +0 -484
  223. data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
  224. data/vendor/kreuzberg/src/extractors/typst.rs +0 -651
  225. data/vendor/kreuzberg/src/language_detection/processor.rs +0 -218
  226. data/vendor/kreuzberg/src/ocr/language_registry.rs +0 -520
  227. data/vendor/kreuzberg/src/panic_context.rs +0 -154
  228. data/vendor/kreuzberg/src/pdf/bindings.rs +0 -306
  229. data/vendor/kreuzberg/src/pdf/bundled.rs +0 -408
  230. data/vendor/kreuzberg/src/pdf/fonts.rs +0 -358
  231. data/vendor/kreuzberg/src/pdf/hierarchy.rs +0 -903
  232. data/vendor/kreuzberg/src/text/quality_processor.rs +0 -231
  233. data/vendor/kreuzberg/src/text/utf8_validation.rs +0 -193
  234. data/vendor/kreuzberg/src/utils/pool.rs +0 -503
  235. data/vendor/kreuzberg/src/utils/pool_sizing.rs +0 -364
  236. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -761
  237. data/vendor/kreuzberg/tests/api_embed.rs +0 -360
  238. data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
  239. data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +0 -471
  240. data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +0 -289
  241. data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +0 -154
  242. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
  243. data/vendor/kreuzberg/tests/config_integration_test.rs +0 -753
  244. data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +0 -294
  245. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -500
  246. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
  247. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
  248. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
  249. data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
  250. data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
  251. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
  252. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
  253. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
  254. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
  255. data/vendor/kreuzberg/tests/ocr_language_registry.rs +0 -191
  256. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -674
  257. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
  258. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
  259. data/vendor/kreuzberg/tests/page_markers.rs +0 -297
  260. data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +0 -301
  261. data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +0 -589
  262. data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +0 -301
  263. data/vendor/kreuzberg/tests/pdf_text_merging.rs +0 -475
  264. data/vendor/kreuzberg/tests/pdfium_linking.rs +0 -340
  265. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -694
  266. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -775
  267. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1260
  268. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -648
  269. data/vendor/kreuzberg-ffi/Cargo.toml +0 -67
  270. data/vendor/kreuzberg-ffi/README.md +0 -851
  271. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +0 -227
  272. data/vendor/kreuzberg-ffi/build.rs +0 -168
  273. data/vendor/kreuzberg-ffi/cbindgen.toml +0 -37
  274. data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +0 -12
  275. data/vendor/kreuzberg-ffi/kreuzberg.h +0 -3012
  276. data/vendor/kreuzberg-ffi/src/batch_streaming.rs +0 -588
  277. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
  278. data/vendor/kreuzberg-ffi/src/error.rs +0 -901
  279. data/vendor/kreuzberg-ffi/src/extraction.rs +0 -555
  280. data/vendor/kreuzberg-ffi/src/helpers.rs +0 -879
  281. data/vendor/kreuzberg-ffi/src/lib.rs +0 -977
  282. data/vendor/kreuzberg-ffi/src/memory.rs +0 -493
  283. data/vendor/kreuzberg-ffi/src/mime.rs +0 -329
  284. data/vendor/kreuzberg-ffi/src/panic_shield.rs +0 -265
  285. data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +0 -442
  286. data/vendor/kreuzberg-ffi/src/plugins/mod.rs +0 -14
  287. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +0 -628
  288. data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +0 -438
  289. data/vendor/kreuzberg-ffi/src/plugins/validator.rs +0 -329
  290. data/vendor/kreuzberg-ffi/src/result.rs +0 -510
  291. data/vendor/kreuzberg-ffi/src/result_pool.rs +0 -639
  292. data/vendor/kreuzberg-ffi/src/result_view.rs +0 -773
  293. data/vendor/kreuzberg-ffi/src/string_intern.rs +0 -568
  294. data/vendor/kreuzberg-ffi/src/types.rs +0 -363
  295. data/vendor/kreuzberg-ffi/src/util.rs +0 -210
  296. data/vendor/kreuzberg-ffi/src/validation.rs +0 -848
  297. data/vendor/kreuzberg-ffi/tests.disabled/README.md +0 -48
  298. data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +0 -299
  299. data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +0 -346
  300. data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +0 -232
  301. data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +0 -470
  302. data/vendor/kreuzberg-tesseract/.commitlintrc.json +0 -13
  303. data/vendor/kreuzberg-tesseract/.crate-ignore +0 -2
  304. data/vendor/kreuzberg-tesseract/Cargo.lock +0 -2933
  305. data/vendor/kreuzberg-tesseract/Cargo.toml +0 -57
  306. data/vendor/kreuzberg-tesseract/LICENSE +0 -22
  307. data/vendor/kreuzberg-tesseract/README.md +0 -399
  308. data/vendor/kreuzberg-tesseract/build.rs +0 -1127
  309. data/vendor/kreuzberg-tesseract/patches/README.md +0 -71
  310. data/vendor/kreuzberg-tesseract/patches/tesseract.diff +0 -199
  311. data/vendor/kreuzberg-tesseract/src/api.rs +0 -1371
  312. data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +0 -77
  313. data/vendor/kreuzberg-tesseract/src/enums.rs +0 -297
  314. data/vendor/kreuzberg-tesseract/src/error.rs +0 -81
  315. data/vendor/kreuzberg-tesseract/src/lib.rs +0 -145
  316. data/vendor/kreuzberg-tesseract/src/monitor.rs +0 -57
  317. data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +0 -197
  318. data/vendor/kreuzberg-tesseract/src/page_iterator.rs +0 -253
  319. data/vendor/kreuzberg-tesseract/src/result_iterator.rs +0 -286
  320. data/vendor/kreuzberg-tesseract/src/result_renderer.rs +0 -183
  321. data/vendor/kreuzberg-tesseract/tests/integration_test.rs +0 -211
@@ -30,7 +30,7 @@
30
30
  //! };
31
31
  //!
32
32
  //! let long_text = "This is a very long document...".repeat(100);
33
- //! let result = chunk_text(&long_text, &config, None)?;
33
+ //! let result = chunk_text(&long_text, &config)?;
34
34
  //!
35
35
  //! println!("Split into {} chunks", result.chunk_count);
36
36
  //! for (i, chunk) in result.chunks.iter().enumerate() {
@@ -47,25 +47,10 @@
47
47
  //! - Processing large documents in batches
48
48
  //! - Maintaining context across chunk boundaries
49
49
  use crate::error::{KreuzbergError, Result};
50
- use crate::types::{Chunk, ChunkMetadata, PageBoundary};
51
- use bitvec::prelude::*;
52
- use once_cell::sync::Lazy;
50
+ use crate::types::{Chunk, ChunkMetadata};
53
51
  use serde::{Deserialize, Serialize};
54
- use std::sync::Arc;
55
52
  use text_splitter::{Characters, ChunkCapacity, ChunkConfig, MarkdownSplitter, TextSplitter};
56
53
 
57
- pub mod processor;
58
- pub use processor::ChunkingProcessor;
59
-
60
- /// Threshold below which we use O(1) direct validation instead of precomputing a BitVec.
61
- ///
62
- /// When there are 10 or fewer boundaries, the overhead of creating a BitVec (which is O(n)
63
- /// where n is the text length) exceeds the cost of calling `is_char_boundary()` directly
64
- /// for each boundary position. This threshold balances performance across different scenarios:
65
- /// - Small documents with few boundaries: fast path dominates
66
- /// - Large documents with many boundaries: batch path leverages the precomputed BitVec
67
- const ADAPTIVE_VALIDATION_THRESHOLD: usize = 10;
68
-
69
54
  #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
70
55
  pub enum ChunkerType {
71
56
  Text,
@@ -103,341 +88,7 @@ fn build_chunk_config(max_characters: usize, overlap: usize, trim: bool) -> Resu
103
88
  .map_err(|e| KreuzbergError::validation(format!("Invalid chunking configuration: {}", e)))
104
89
  }
105
90
 
106
- /// Pre-computes valid UTF-8 character boundaries for a text string.
107
- ///
108
- /// This function performs a single O(n) pass through the text to identify all valid
109
- /// UTF-8 character boundaries, storing them in a BitVec for O(1) lookups.
110
- ///
111
- /// # Arguments
112
- ///
113
- /// * `text` - The text to analyze
114
- ///
115
- /// # Returns
116
- ///
117
- /// A BitVec where each bit represents whether a byte offset is a valid UTF-8 character boundary.
118
- /// The BitVec has length `text.len() + 1` (includes the end position).
119
- ///
120
- /// # Examples
121
- ///
122
- /// ```ignore
123
- /// let text = "Hello 👋";
124
- /// let boundaries = precompute_utf8_boundaries(text);
125
- /// assert!(boundaries[0]); // Start is always valid
126
- /// assert!(boundaries[6]); // 'H' + "ello " = 6 bytes
127
- /// assert!(!boundaries[7]); // Middle of emoji (first byte of 4-byte sequence)
128
- /// assert!(boundaries[10]); // After emoji (valid boundary)
129
- /// ```
130
- fn precompute_utf8_boundaries(text: &str) -> BitVec {
131
- let text_len = text.len();
132
- let mut boundaries = bitvec![0; text_len + 1];
133
-
134
- boundaries.set(0, true);
135
-
136
- for (i, _) in text.char_indices() {
137
- if i <= text_len {
138
- boundaries.set(i, true);
139
- }
140
- }
141
-
142
- if text_len > 0 {
143
- boundaries.set(text_len, true);
144
- }
145
-
146
- boundaries
147
- }
148
-
149
- /// Validates that byte offsets in page boundaries fall on valid UTF-8 character boundaries.
150
- ///
151
- /// This function ensures that all page boundary positions are at valid UTF-8 character
152
- /// boundaries within the text. This is CRITICAL to prevent text corruption when boundaries
153
- /// are created from language bindings or external sources, particularly with multibyte
154
- /// UTF-8 characters (emoji, CJK characters, combining marks, etc.).
155
- ///
156
- /// **Performance Strategy**: Uses adaptive validation to optimize for different boundary counts:
157
- /// - **Small sets (≤10 boundaries)**: O(k) approach using Rust's native `is_char_boundary()` for each position
158
- /// - **Large sets (>10 boundaries)**: O(n) precomputation with O(1) lookups via BitVec
159
- ///
160
- /// For typical PDF documents with 1-10 page boundaries, the fast path provides 30-50% faster
161
- /// validation than always precomputing. For documents with 100+ boundaries, batch precomputation
162
- /// is 2-4% faster overall due to amortized costs. This gives ~2-4% improvement across all scenarios.
163
- ///
164
- /// # Arguments
165
- ///
166
- /// * `text` - The text being chunked
167
- /// * `boundaries` - Page boundary markers to validate
168
- ///
169
- /// # Returns
170
- ///
171
- /// Returns `Ok(())` if all boundaries are at valid UTF-8 character boundaries.
172
- /// Returns `KreuzbergError::Validation` if any boundary is at an invalid position.
173
- ///
174
- /// # UTF-8 Boundary Safety
175
- ///
176
- /// Rust strings use UTF-8 encoding where characters can be 1-4 bytes. For example:
177
- /// - ASCII letters: 1 byte each
178
- /// - Emoji (🌍): 4 bytes but 1 character
179
- /// - CJK characters (中): 3 bytes but 1 character
180
- ///
181
- /// This function checks that all byte_start and byte_end values are at character boundaries
182
- /// using an adaptive strategy: direct calls for small boundary sets, or precomputed BitVec
183
- /// for large sets.
184
- fn validate_utf8_boundaries(text: &str, boundaries: &[PageBoundary]) -> Result<()> {
185
- if boundaries.is_empty() {
186
- return Ok(());
187
- }
188
-
189
- let text_len = text.len();
190
-
191
- if boundaries.len() <= ADAPTIVE_VALIDATION_THRESHOLD {
192
- validate_utf8_boundaries_fast_path(text, boundaries, text_len)
193
- } else {
194
- validate_utf8_boundaries_batch_path(text, boundaries, text_len)
195
- }
196
- }
197
-
198
- /// Fast path: direct UTF-8 boundary validation for small boundary counts (≤10).
199
- ///
200
- /// Uses Rust's native `str::is_char_boundary()` for O(1) checks on each boundary position.
201
- /// This avoids the O(n) overhead of BitVec precomputation, making it ideal for typical
202
- /// PDF documents with few page boundaries.
203
- ///
204
- /// # Arguments
205
- ///
206
- /// * `text` - The text being validated
207
- /// * `boundaries` - Page boundary markers to validate
208
- /// * `text_len` - Pre-computed text length (avoids recomputation)
209
- ///
210
- /// # Returns
211
- ///
212
- /// Returns `Ok(())` if all boundaries are at valid UTF-8 character boundaries.
213
- /// Returns `KreuzbergError::Validation` if any boundary is invalid.
214
- fn validate_utf8_boundaries_fast_path(text: &str, boundaries: &[PageBoundary], text_len: usize) -> Result<()> {
215
- for (idx, boundary) in boundaries.iter().enumerate() {
216
- if boundary.byte_start > text_len {
217
- return Err(KreuzbergError::validation(format!(
218
- "Page boundary {} has byte_start={} which exceeds text length {}",
219
- idx, boundary.byte_start, text_len
220
- )));
221
- }
222
-
223
- if boundary.byte_end > text_len {
224
- return Err(KreuzbergError::validation(format!(
225
- "Page boundary {} has byte_end={} which exceeds text length {}",
226
- idx, boundary.byte_end, text_len
227
- )));
228
- }
229
-
230
- if boundary.byte_start > 0 && boundary.byte_start < text_len && !text.is_char_boundary(boundary.byte_start) {
231
- return Err(KreuzbergError::validation(format!(
232
- "Page boundary {} has byte_start={} which is not a valid UTF-8 character boundary (text length={}). This may indicate corrupted multibyte characters (emoji, CJK, etc.)",
233
- idx, boundary.byte_start, text_len
234
- )));
235
- }
236
-
237
- if boundary.byte_end > 0 && boundary.byte_end < text_len && !text.is_char_boundary(boundary.byte_end) {
238
- return Err(KreuzbergError::validation(format!(
239
- "Page boundary {} has byte_end={} which is not a valid UTF-8 character boundary (text length={}). This may indicate corrupted multibyte characters (emoji, CJK, etc.)",
240
- idx, boundary.byte_end, text_len
241
- )));
242
- }
243
- }
244
-
245
- Ok(())
246
- }
247
-
248
- /// Batch path: precomputed BitVec validation for large boundary counts (>10).
249
- ///
250
- /// Precomputes all valid UTF-8 boundaries in a single O(n) pass, then performs O(1)
251
- /// lookups for each boundary position. This is more efficient than O(k*1) direct checks
252
- /// when k is large or when the repeated `is_char_boundary()` calls have measurable overhead.
253
- ///
254
- /// # Arguments
255
- ///
256
- /// * `text` - The text being validated
257
- /// * `boundaries` - Page boundary markers to validate
258
- /// * `text_len` - Pre-computed text length (avoids recomputation)
259
- ///
260
- /// # Returns
261
- ///
262
- /// Returns `Ok(())` if all boundaries are at valid UTF-8 character boundaries.
263
- /// Returns `KreuzbergError::Validation` if any boundary is invalid.
264
- fn validate_utf8_boundaries_batch_path(text: &str, boundaries: &[PageBoundary], text_len: usize) -> Result<()> {
265
- let valid_boundaries = precompute_utf8_boundaries(text);
266
-
267
- for (idx, boundary) in boundaries.iter().enumerate() {
268
- if boundary.byte_start > text_len {
269
- return Err(KreuzbergError::validation(format!(
270
- "Page boundary {} has byte_start={} which exceeds text length {}",
271
- idx, boundary.byte_start, text_len
272
- )));
273
- }
274
-
275
- if boundary.byte_end > text_len {
276
- return Err(KreuzbergError::validation(format!(
277
- "Page boundary {} has byte_end={} which exceeds text length {}",
278
- idx, boundary.byte_end, text_len
279
- )));
280
- }
281
-
282
- if boundary.byte_start > 0 && boundary.byte_start <= text_len && !valid_boundaries[boundary.byte_start] {
283
- return Err(KreuzbergError::validation(format!(
284
- "Page boundary {} has byte_start={} which is not a valid UTF-8 character boundary (text length={}). This may indicate corrupted multibyte characters (emoji, CJK, etc.)",
285
- idx, boundary.byte_start, text_len
286
- )));
287
- }
288
-
289
- if boundary.byte_end > 0 && boundary.byte_end <= text_len && !valid_boundaries[boundary.byte_end] {
290
- return Err(KreuzbergError::validation(format!(
291
- "Page boundary {} has byte_end={} which is not a valid UTF-8 character boundary (text length={}). This may indicate corrupted multibyte characters (emoji, CJK, etc.)",
292
- idx, boundary.byte_end, text_len
293
- )));
294
- }
295
- }
296
-
297
- Ok(())
298
- }
299
-
300
- /// Calculate which pages a character range spans.
301
- ///
302
- /// # Arguments
303
- ///
304
- /// * `char_start` - Starting character offset of the chunk
305
- /// * `char_end` - Ending character offset of the chunk
306
- /// * `boundaries` - Page boundary markers from the document
307
- ///
308
- /// # Returns
309
- ///
310
- /// A tuple of (first_page, last_page) where page numbers are 1-indexed.
311
- /// Returns (None, None) if boundaries are empty or chunk doesn't overlap any page.
312
- /// Validates page boundaries for consistency and correctness.
313
- ///
314
- /// # Validation Rules
315
- ///
316
- /// 1. Boundaries must be sorted by char_start (monotonically increasing)
317
- /// 2. Boundaries must not overlap (char_end[i] <= char_start[i+1])
318
- /// 3. Each boundary must have char_start < char_end
319
- ///
320
- /// # Errors
321
- ///
322
- /// Returns `KreuzbergError::Validation` if any boundary is invalid.
323
- fn validate_page_boundaries(boundaries: &[PageBoundary]) -> Result<()> {
324
- if boundaries.is_empty() {
325
- return Ok(());
326
- }
327
-
328
- for (idx, boundary) in boundaries.iter().enumerate() {
329
- if boundary.byte_start >= boundary.byte_end {
330
- return Err(KreuzbergError::validation(format!(
331
- "Invalid boundary range at index {}: byte_start ({}) must be < byte_end ({})",
332
- idx, boundary.byte_start, boundary.byte_end
333
- )));
334
- }
335
- }
336
-
337
- for i in 0..boundaries.len() - 1 {
338
- let current = &boundaries[i];
339
- let next = &boundaries[i + 1];
340
-
341
- if current.byte_start > next.byte_start {
342
- return Err(KreuzbergError::validation(format!(
343
- "Page boundaries not sorted: boundary at index {} (byte_start={}) comes after boundary at index {} (byte_start={})",
344
- i,
345
- current.byte_start,
346
- i + 1,
347
- next.byte_start
348
- )));
349
- }
350
-
351
- if current.byte_end > next.byte_start {
352
- return Err(KreuzbergError::validation(format!(
353
- "Overlapping page boundaries: boundary {} ends at {} but boundary {} starts at {}",
354
- i,
355
- current.byte_end,
356
- i + 1,
357
- next.byte_start
358
- )));
359
- }
360
- }
361
-
362
- Ok(())
363
- }
364
-
365
- /// Calculate which pages a byte range spans.
366
- ///
367
- /// # Arguments
368
- ///
369
- /// * `byte_start` - Starting byte offset of the chunk
370
- /// * `byte_end` - Ending byte offset of the chunk
371
- /// * `boundaries` - Page boundary markers from the document
372
- ///
373
- /// # Returns
374
- ///
375
- /// A tuple of (first_page, last_page) where page numbers are 1-indexed.
376
- /// Returns (None, None) if boundaries are empty or chunk doesn't overlap any page.
377
- ///
378
- /// # Errors
379
- ///
380
- /// Returns `KreuzbergError::Validation` if boundaries are invalid.
381
- fn calculate_page_range(
382
- byte_start: usize,
383
- byte_end: usize,
384
- boundaries: &[PageBoundary],
385
- ) -> Result<(Option<usize>, Option<usize>)> {
386
- if boundaries.is_empty() {
387
- return Ok((None, None));
388
- }
389
-
390
- validate_page_boundaries(boundaries)?;
391
-
392
- let mut first_page = None;
393
- let mut last_page = None;
394
-
395
- for boundary in boundaries {
396
- if byte_start < boundary.byte_end && byte_end > boundary.byte_start {
397
- if first_page.is_none() {
398
- first_page = Some(boundary.page_number);
399
- }
400
- last_page = Some(boundary.page_number);
401
- }
402
- }
403
-
404
- Ok((first_page, last_page))
405
- }
406
-
407
- /// Split text into chunks with optional page boundary tracking.
408
- ///
409
- /// # Arguments
410
- ///
411
- /// * `text` - The text to split into chunks
412
- /// * `config` - Chunking configuration (max size, overlap, type)
413
- /// * `page_boundaries` - Optional page boundary markers for mapping chunks to pages
414
- ///
415
- /// # Returns
416
- ///
417
- /// A ChunkingResult containing all chunks and their metadata.
418
- ///
419
- /// # Examples
420
- ///
421
- /// ```rust
422
- /// use kreuzberg::chunking::{chunk_text, ChunkingConfig, ChunkerType};
423
- ///
424
- /// # fn example() -> kreuzberg::Result<()> {
425
- /// let config = ChunkingConfig {
426
- /// max_characters: 500,
427
- /// overlap: 50,
428
- /// trim: true,
429
- /// chunker_type: ChunkerType::Text,
430
- /// };
431
- /// let result = chunk_text("Long text...", &config, None)?;
432
- /// assert!(!result.chunks.is_empty());
433
- /// # Ok(())
434
- /// # }
435
- /// ```
436
- pub fn chunk_text(
437
- text: &str,
438
- config: &ChunkingConfig,
439
- page_boundaries: Option<&[PageBoundary]>,
440
- ) -> Result<ChunkingResult> {
91
+ pub fn chunk_text(text: &str, config: &ChunkingConfig) -> Result<ChunkingResult> {
441
92
  if text.is_empty() {
442
93
  return Ok(ChunkingResult {
443
94
  chunks: vec![],
@@ -445,10 +96,6 @@ pub fn chunk_text(
445
96
  });
446
97
  }
447
98
 
448
- if let Some(boundaries) = page_boundaries {
449
- validate_utf8_boundaries(text, boundaries)?;
450
- }
451
-
452
99
  let chunk_config = build_chunk_config(config.max_characters, config.overlap, config.trim)?;
453
100
 
454
101
  let text_chunks: Vec<&str> = match config.chunker_type {
@@ -463,42 +110,36 @@ pub fn chunk_text(
463
110
  };
464
111
 
465
112
  let total_chunks = text_chunks.len();
466
- let mut byte_offset = 0;
467
-
468
- let mut chunks: Vec<Chunk> = Vec::new();
469
-
470
- for (index, chunk_text) in text_chunks.into_iter().enumerate() {
471
- let byte_start = byte_offset;
472
- let chunk_length = chunk_text.len();
473
- let byte_end = byte_start + chunk_length;
474
-
475
- let overlap_chars = if index < total_chunks - 1 {
476
- config.overlap.min(chunk_length)
477
- } else {
478
- 0
479
- };
480
- byte_offset = byte_end - overlap_chars;
481
-
482
- let (first_page, last_page) = if let Some(boundaries) = page_boundaries {
483
- calculate_page_range(byte_start, byte_end, boundaries)?
484
- } else {
485
- (None, None)
486
- };
487
-
488
- chunks.push(Chunk {
489
- content: chunk_text.to_string(),
490
- embedding: None,
491
- metadata: ChunkMetadata {
492
- byte_start,
493
- byte_end,
494
- token_count: None,
495
- chunk_index: index,
496
- total_chunks,
497
- first_page,
498
- last_page,
499
- },
500
- });
501
- }
113
+ let mut char_offset = 0;
114
+
115
+ let chunks: Vec<Chunk> = text_chunks
116
+ .into_iter()
117
+ .enumerate()
118
+ .map(|(index, chunk_text)| {
119
+ let char_start = char_offset;
120
+ let chunk_length = chunk_text.chars().count();
121
+ let char_end = char_start + chunk_length;
122
+
123
+ let overlap_chars = if index < total_chunks - 1 {
124
+ config.overlap.min(chunk_length)
125
+ } else {
126
+ 0
127
+ };
128
+ char_offset = char_end - overlap_chars;
129
+
130
+ Chunk {
131
+ content: chunk_text.to_string(),
132
+ embedding: None,
133
+ metadata: ChunkMetadata {
134
+ char_start,
135
+ char_end,
136
+ token_count: None,
137
+ chunk_index: index,
138
+ total_chunks,
139
+ },
140
+ }
141
+ })
142
+ .collect();
502
143
 
503
144
  let chunk_count = chunks.len();
504
145
 
@@ -518,49 +159,11 @@ pub fn chunk_text_with_type(
518
159
  trim,
519
160
  chunker_type,
520
161
  };
521
- chunk_text(text, &config, None)
162
+ chunk_text(text, &config)
522
163
  }
523
164
 
524
165
  pub fn chunk_texts_batch(texts: &[&str], config: &ChunkingConfig) -> Result<Vec<ChunkingResult>> {
525
- texts.iter().map(|text| chunk_text(text, config, None)).collect()
526
- }
527
-
528
- /// Lazy-initialized flag that ensures chunking processor is registered exactly once.
529
- ///
530
- /// This static is accessed on first use to automatically register the
531
- /// chunking processor with the plugin registry.
532
- static PROCESSOR_INITIALIZED: Lazy<Result<()>> = Lazy::new(register_chunking_processor);
533
-
534
- /// Ensure the chunking processor is registered.
535
- ///
536
- /// This function is called automatically when needed.
537
- /// It's safe to call multiple times - registration only happens once.
538
- pub fn ensure_initialized() -> Result<()> {
539
- PROCESSOR_INITIALIZED
540
- .as_ref()
541
- .map(|_| ())
542
- .map_err(|e| crate::KreuzbergError::Plugin {
543
- message: format!("Failed to register chunking processor: {}", e),
544
- plugin_name: "text-chunking".to_string(),
545
- })
546
- }
547
-
548
- /// Register the chunking processor with the global registry.
549
- ///
550
- /// This function should be called once at application startup to register
551
- /// the chunking post-processor.
552
- ///
553
- /// **Note:** This is called automatically on first use.
554
- /// Explicit calling is optional.
555
- pub fn register_chunking_processor() -> Result<()> {
556
- let registry = crate::plugins::registry::get_post_processor_registry();
557
- let mut registry = registry
558
- .write()
559
- .map_err(|e| crate::KreuzbergError::Other(format!("Post-processor registry lock poisoned: {}", e)))?;
560
-
561
- registry.register(Arc::new(ChunkingProcessor), 50)?;
562
-
563
- Ok(())
166
+ texts.iter().map(|text| chunk_text(text, config)).collect()
564
167
  }
565
168
 
566
169
  #[cfg(test)]
@@ -570,7 +173,7 @@ mod tests {
570
173
  #[test]
571
174
  fn test_chunk_empty_text() {
572
175
  let config = ChunkingConfig::default();
573
- let result = chunk_text("", &config, None).unwrap();
176
+ let result = chunk_text("", &config).unwrap();
574
177
  assert_eq!(result.chunks.len(), 0);
575
178
  assert_eq!(result.chunk_count, 0);
576
179
  }
@@ -584,7 +187,7 @@ mod tests {
584
187
  chunker_type: ChunkerType::Text,
585
188
  };
586
189
  let text = "This is a short text.";
587
- let result = chunk_text(text, &config, None).unwrap();
190
+ let result = chunk_text(text, &config).unwrap();
588
191
  assert_eq!(result.chunks.len(), 1);
589
192
  assert_eq!(result.chunk_count, 1);
590
193
  assert_eq!(result.chunks[0].content, text);
@@ -599,7 +202,7 @@ mod tests {
599
202
  chunker_type: ChunkerType::Text,
600
203
  };
601
204
  let text = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ";
602
- let result = chunk_text(text, &config, None).unwrap();
205
+ let result = chunk_text(text, &config).unwrap();
603
206
  assert!(result.chunk_count >= 2);
604
207
  assert_eq!(result.chunks.len(), result.chunk_count);
605
208
  assert!(result.chunks.iter().all(|chunk| chunk.content.len() <= 20));
@@ -614,7 +217,7 @@ mod tests {
614
217
  chunker_type: ChunkerType::Text,
615
218
  };
616
219
  let text = "abcdefghijklmnopqrstuvwxyz0123456789";
617
- let result = chunk_text(text, &config, None).unwrap();
220
+ let result = chunk_text(text, &config).unwrap();
618
221
  assert!(result.chunk_count >= 2);
619
222
 
620
223
  if result.chunks.len() >= 2 {
@@ -637,7 +240,7 @@ mod tests {
637
240
  chunker_type: ChunkerType::Markdown,
638
241
  };
639
242
  let markdown = "# Title\n\nParagraph one.\n\n## Section\n\nParagraph two.";
640
- let result = chunk_text(markdown, &config, None).unwrap();
243
+ let result = chunk_text(markdown, &config).unwrap();
641
244
  assert!(result.chunk_count >= 1);
642
245
  assert!(result.chunks.iter().any(|chunk| chunk.content.contains("# Title")));
643
246
  }
@@ -651,7 +254,7 @@ mod tests {
651
254
  chunker_type: ChunkerType::Markdown,
652
255
  };
653
256
  let markdown = "# Code Example\n\n```python\nprint('hello')\n```\n\nSome text after code.";
654
- let result = chunk_text(markdown, &config, None).unwrap();
257
+ let result = chunk_text(markdown, &config).unwrap();
655
258
  assert!(result.chunk_count >= 1);
656
259
  assert!(result.chunks.iter().any(|chunk| chunk.content.contains("```")));
657
260
  }
@@ -665,7 +268,7 @@ mod tests {
665
268
  chunker_type: ChunkerType::Markdown,
666
269
  };
667
270
  let markdown = "Check out [this link](https://example.com) for more info.";
668
- let result = chunk_text(markdown, &config, None).unwrap();
271
+ let result = chunk_text(markdown, &config).unwrap();
669
272
  assert_eq!(result.chunk_count, 1);
670
273
  assert!(result.chunks[0].content.contains("[this link]"));
671
274
  }
@@ -679,7 +282,7 @@ mod tests {
679
282
  chunker_type: ChunkerType::Text,
680
283
  };
681
284
  let text = " Leading and trailing spaces should be trimmed ";
682
- let result = chunk_text(text, &config, None).unwrap();
285
+ let result = chunk_text(text, &config).unwrap();
683
286
  assert!(result.chunk_count >= 1);
684
287
  assert!(result.chunks.iter().all(|chunk| !chunk.content.starts_with(' ')));
685
288
  }
@@ -693,7 +296,7 @@ mod tests {
693
296
  chunker_type: ChunkerType::Text,
694
297
  };
695
298
  let text = " Text with spaces ";
696
- let result = chunk_text(text, &config, None).unwrap();
299
+ let result = chunk_text(text, &config).unwrap();
697
300
  assert_eq!(result.chunk_count, 1);
698
301
  assert!(result.chunks[0].content.starts_with(' ') || result.chunks[0].content.len() < text.len());
699
302
  }
@@ -706,7 +309,7 @@ mod tests {
706
309
  trim: true,
707
310
  chunker_type: ChunkerType::Text,
708
311
  };
709
- let result = chunk_text("Some text", &config, None);
312
+ let result = chunk_text("Some text", &config);
710
313
  assert!(result.is_err());
711
314
  let err = result.unwrap_err();
712
315
  assert!(matches!(err, KreuzbergError::Validation { .. }));
@@ -800,7 +403,7 @@ mod tests {
800
403
  chunker_type: ChunkerType::Text,
801
404
  };
802
405
  let text = "a".repeat(1000);
803
- let result = chunk_text(&text, &config, None).unwrap();
406
+ let result = chunk_text(&text, &config).unwrap();
804
407
  assert!(result.chunk_count >= 10);
805
408
  assert!(result.chunks.iter().all(|chunk| chunk.content.len() <= 100));
806
409
  }
@@ -814,7 +417,7 @@ mod tests {
814
417
  chunker_type: ChunkerType::Text,
815
418
  };
816
419
  let text = "Line one\nLine two\nLine three\nLine four\nLine five";
817
- let result = chunk_text(text, &config, None).unwrap();
420
+ let result = chunk_text(text, &config).unwrap();
818
421
  assert!(result.chunk_count >= 1);
819
422
  }
820
423
 
@@ -827,7 +430,7 @@ mod tests {
827
430
  chunker_type: ChunkerType::Markdown,
828
431
  };
829
432
  let markdown = "# List Example\n\n- Item 1\n- Item 2\n- Item 3\n\nMore text.";
830
- let result = chunk_text(markdown, &config, None).unwrap();
433
+ let result = chunk_text(markdown, &config).unwrap();
831
434
  assert!(result.chunk_count >= 1);
832
435
  assert!(result.chunks.iter().any(|chunk| chunk.content.contains("- Item")));
833
436
  }
@@ -841,7 +444,7 @@ mod tests {
841
444
  chunker_type: ChunkerType::Markdown,
842
445
  };
843
446
  let markdown = "# Table\n\n| Col1 | Col2 |\n|------|------|\n| A | B |\n| C | D |";
844
- let result = chunk_text(markdown, &config, None).unwrap();
447
+ let result = chunk_text(markdown, &config).unwrap();
845
448
  assert!(result.chunk_count >= 1);
846
449
  assert!(result.chunks.iter().any(|chunk| chunk.content.contains("|")));
847
450
  }
@@ -855,7 +458,7 @@ mod tests {
855
458
  chunker_type: ChunkerType::Text,
856
459
  };
857
460
  let text = "Special chars: @#$%^&*()[]{}|\\<>?/~`";
858
- let result = chunk_text(text, &config, None).unwrap();
461
+ let result = chunk_text(text, &config).unwrap();
859
462
  assert_eq!(result.chunk_count, 1);
860
463
  assert!(result.chunks[0].content.contains("@#$%"));
861
464
  }
@@ -869,7 +472,7 @@ mod tests {
869
472
  chunker_type: ChunkerType::Text,
870
473
  };
871
474
  let text = "Unicode: 你好世界 🌍 café résumé";
872
- let result = chunk_text(text, &config, None).unwrap();
475
+ let result = chunk_text(text, &config).unwrap();
873
476
  assert_eq!(result.chunk_count, 1);
874
477
  assert!(result.chunks[0].content.contains("你好"));
875
478
  assert!(result.chunks[0].content.contains("🌍"));
@@ -884,7 +487,7 @@ mod tests {
884
487
  chunker_type: ChunkerType::Text,
885
488
  };
886
489
  let text = "日本語のテキストです。これは長い文章で、複数のチャンクに分割されるべきです。";
887
- let result = chunk_text(text, &config, None).unwrap();
490
+ let result = chunk_text(text, &config).unwrap();
888
491
  assert!(result.chunk_count >= 1);
889
492
  }
890
493
 
@@ -897,7 +500,7 @@ mod tests {
897
500
  chunker_type: ChunkerType::Text,
898
501
  };
899
502
  let text = "English text mixed with 中文文本 and some français";
900
- let result = chunk_text(text, &config, None).unwrap();
503
+ let result = chunk_text(text, &config).unwrap();
901
504
  assert!(result.chunk_count >= 1);
902
505
  }
903
506
 
@@ -910,7 +513,7 @@ mod tests {
910
513
  chunker_type: ChunkerType::Text,
911
514
  };
912
515
  let text = "AAAAA BBBBB CCCCC DDDDD EEEEE FFFFF";
913
- let result = chunk_text(text, &config, None).unwrap();
516
+ let result = chunk_text(text, &config).unwrap();
914
517
 
915
518
  assert!(result.chunks.len() >= 2, "Expected at least 2 chunks");
916
519
 
@@ -919,8 +522,8 @@ mod tests {
919
522
  let metadata = &chunk.metadata;
920
523
 
921
524
  assert_eq!(
922
- metadata.byte_end - metadata.byte_start,
923
- chunk.content.len(),
525
+ metadata.char_end - metadata.char_start,
526
+ chunk.content.chars().count(),
924
527
  "Chunk {} offset range doesn't match content length",
925
528
  i
926
529
  );
@@ -934,15 +537,15 @@ mod tests {
934
537
  let next_chunk = &result.chunks[i + 1];
935
538
 
936
539
  assert!(
937
- next_chunk.metadata.byte_start < current_chunk.metadata.byte_end,
540
+ next_chunk.metadata.char_start < current_chunk.metadata.char_end,
938
541
  "Chunk {} and {} don't overlap: next starts at {} but current ends at {}",
939
542
  i,
940
543
  i + 1,
941
- next_chunk.metadata.byte_start,
942
- current_chunk.metadata.byte_end
544
+ next_chunk.metadata.char_start,
545
+ current_chunk.metadata.char_end
943
546
  );
944
547
 
945
- let overlap_size = current_chunk.metadata.byte_end - next_chunk.metadata.byte_start;
548
+ let overlap_size = current_chunk.metadata.char_end - next_chunk.metadata.char_start;
946
549
  assert!(
947
550
  overlap_size <= config.overlap + 10,
948
551
  "Overlap between chunks {} and {} is too large: {}",
@@ -962,19 +565,19 @@ mod tests {
962
565
  chunker_type: ChunkerType::Text,
963
566
  };
964
567
  let text = "AAAAA BBBBB CCCCC DDDDD EEEEE FFFFF";
965
- let result = chunk_text(text, &config, None).unwrap();
568
+ let result = chunk_text(text, &config).unwrap();
966
569
 
967
570
  for i in 0..result.chunks.len() - 1 {
968
571
  let current_chunk = &result.chunks[i];
969
572
  let next_chunk = &result.chunks[i + 1];
970
573
 
971
574
  assert!(
972
- next_chunk.metadata.byte_start >= current_chunk.metadata.byte_end,
575
+ next_chunk.metadata.char_start >= current_chunk.metadata.char_end,
973
576
  "Chunk {} and {} overlap when they shouldn't: next starts at {} but current ends at {}",
974
577
  i,
975
578
  i + 1,
976
- next_chunk.metadata.byte_start,
977
- current_chunk.metadata.byte_end
579
+ next_chunk.metadata.char_start,
580
+ current_chunk.metadata.char_end
978
581
  );
979
582
  }
980
583
  }
@@ -988,12 +591,12 @@ mod tests {
988
591
  chunker_type: ChunkerType::Text,
989
592
  };
990
593
  let text = "0123456789 ABCDEFGHIJ KLMNOPQRST UVWXYZ";
991
- let result = chunk_text(text, &config, None).unwrap();
594
+ let result = chunk_text(text, &config).unwrap();
992
595
 
993
596
  assert!(result.chunks.len() >= 2, "Expected multiple chunks");
994
597
 
995
598
  assert_eq!(
996
- result.chunks[0].metadata.byte_start, 0,
599
+ result.chunks[0].metadata.char_start, 0,
997
600
  "First chunk should start at position 0"
998
601
  );
999
602
 
@@ -1002,12 +605,12 @@ mod tests {
1002
605
  let next_chunk = &result.chunks[i + 1];
1003
606
 
1004
607
  assert!(
1005
- next_chunk.metadata.byte_start <= current_chunk.metadata.byte_end,
608
+ next_chunk.metadata.char_start <= current_chunk.metadata.char_end,
1006
609
  "Gap detected between chunk {} (ends at {}) and chunk {} (starts at {})",
1007
610
  i,
1008
- current_chunk.metadata.byte_end,
611
+ current_chunk.metadata.char_end,
1009
612
  i + 1,
1010
- next_chunk.metadata.byte_start
613
+ next_chunk.metadata.char_start
1011
614
  );
1012
615
  }
1013
616
  }
@@ -1022,24 +625,24 @@ mod tests {
1022
625
  chunker_type: ChunkerType::Text,
1023
626
  };
1024
627
  let text = "Word ".repeat(30);
1025
- let result = chunk_text(&text, &config, None).unwrap();
628
+ let result = chunk_text(&text, &config).unwrap();
1026
629
 
1027
630
  for chunk in &result.chunks {
1028
631
  assert!(
1029
- chunk.metadata.byte_end > chunk.metadata.byte_start,
632
+ chunk.metadata.char_end > chunk.metadata.char_start,
1030
633
  "Invalid offset range for overlap {}: start={}, end={}",
1031
634
  overlap,
1032
- chunk.metadata.byte_start,
1033
- chunk.metadata.byte_end
635
+ chunk.metadata.char_start,
636
+ chunk.metadata.char_end
1034
637
  );
1035
638
  }
1036
639
 
1037
640
  for chunk in &result.chunks {
1038
641
  assert!(
1039
- chunk.metadata.byte_start < text.len(),
642
+ chunk.metadata.char_start < text.chars().count(),
1040
643
  "char_start with overlap {} is out of bounds: {}",
1041
644
  overlap,
1042
- chunk.metadata.byte_start
645
+ chunk.metadata.char_start
1043
646
  );
1044
647
  }
1045
648
  }
@@ -1054,7 +657,7 @@ mod tests {
1054
657
  chunker_type: ChunkerType::Text,
1055
658
  };
1056
659
  let text = "AAAAA BBBBB CCCCC DDDDD EEEEE";
1057
- let result = chunk_text(text, &config, None).unwrap();
660
+ let result = chunk_text(text, &config).unwrap();
1058
661
 
1059
662
  assert!(result.chunks.len() >= 2, "Need multiple chunks for this test");
1060
663
 
@@ -1062,1242 +665,13 @@ mod tests {
1062
665
  let second_to_last = &result.chunks[result.chunks.len() - 2];
1063
666
 
1064
667
  assert!(
1065
- last_chunk.metadata.byte_start < second_to_last.metadata.byte_end,
668
+ last_chunk.metadata.char_start < second_to_last.metadata.char_end,
1066
669
  "Last chunk should overlap with previous chunk"
1067
670
  );
1068
671
 
1069
- let expected_end = text.len();
672
+ let expected_end = text.chars().count();
1070
673
  let last_chunk_covers_end =
1071
- last_chunk.content.trim_end() == text.trim_end() || last_chunk.metadata.byte_end >= expected_end - 5;
674
+ last_chunk.content.trim_end() == text.trim_end() || last_chunk.metadata.char_end >= expected_end - 5;
1072
675
  assert!(last_chunk_covers_end, "Last chunk should cover the end of the text");
1073
676
  }
1074
-
1075
- #[test]
1076
- fn test_chunk_with_page_boundaries() {
1077
- use crate::types::PageBoundary;
1078
-
1079
- let config = ChunkingConfig {
1080
- max_characters: 30,
1081
- overlap: 5,
1082
- trim: true,
1083
- chunker_type: ChunkerType::Text,
1084
- };
1085
- let text = "Page one content here. Page two starts here and continues.";
1086
-
1087
- let boundaries = vec![
1088
- PageBoundary {
1089
- byte_start: 0,
1090
- byte_end: 21,
1091
- page_number: 1,
1092
- },
1093
- PageBoundary {
1094
- byte_start: 22,
1095
- byte_end: 58,
1096
- page_number: 2,
1097
- },
1098
- ];
1099
-
1100
- let result = chunk_text(text, &config, Some(&boundaries)).unwrap();
1101
- assert!(result.chunks.len() >= 2);
1102
-
1103
- assert_eq!(result.chunks[0].metadata.first_page, Some(1));
1104
-
1105
- let last_chunk = result.chunks.last().unwrap();
1106
- assert_eq!(last_chunk.metadata.last_page, Some(2));
1107
- }
1108
-
1109
- #[test]
1110
- fn test_chunk_without_page_boundaries() {
1111
- let config = ChunkingConfig {
1112
- max_characters: 30,
1113
- overlap: 5,
1114
- trim: true,
1115
- chunker_type: ChunkerType::Text,
1116
- };
1117
- let text = "This is some test content that should be split into multiple chunks.";
1118
-
1119
- let result = chunk_text(text, &config, None).unwrap();
1120
- assert!(result.chunks.len() >= 2);
1121
-
1122
- for chunk in &result.chunks {
1123
- assert_eq!(chunk.metadata.first_page, None);
1124
- assert_eq!(chunk.metadata.last_page, None);
1125
- }
1126
- }
1127
-
1128
- #[test]
1129
- fn test_chunk_empty_boundaries() {
1130
- let config = ChunkingConfig {
1131
- max_characters: 30,
1132
- overlap: 5,
1133
- trim: true,
1134
- chunker_type: ChunkerType::Text,
1135
- };
1136
- let text = "Some text content here.";
1137
- let boundaries: Vec<PageBoundary> = vec![];
1138
-
1139
- let result = chunk_text(text, &config, Some(&boundaries)).unwrap();
1140
- assert_eq!(result.chunks.len(), 1);
1141
-
1142
- assert_eq!(result.chunks[0].metadata.first_page, None);
1143
- assert_eq!(result.chunks[0].metadata.last_page, None);
1144
- }
1145
-
1146
- #[test]
1147
- fn test_chunk_spanning_multiple_pages() {
1148
- use crate::types::PageBoundary;
1149
-
1150
- let config = ChunkingConfig {
1151
- max_characters: 50,
1152
- overlap: 5,
1153
- trim: false,
1154
- chunker_type: ChunkerType::Text,
1155
- };
1156
- let text = "0123456789 AAAAAAAAAA 1111111111 BBBBBBBBBB 2222222222";
1157
-
1158
- let boundaries = vec![
1159
- PageBoundary {
1160
- byte_start: 0,
1161
- byte_end: 20,
1162
- page_number: 1,
1163
- },
1164
- PageBoundary {
1165
- byte_start: 20,
1166
- byte_end: 40,
1167
- page_number: 2,
1168
- },
1169
- PageBoundary {
1170
- byte_start: 40,
1171
- byte_end: 54,
1172
- page_number: 3,
1173
- },
1174
- ];
1175
-
1176
- let result = chunk_text(text, &config, Some(&boundaries)).unwrap();
1177
- assert!(result.chunks.len() >= 2);
1178
-
1179
- for chunk in &result.chunks {
1180
- assert!(chunk.metadata.first_page.is_some() || chunk.metadata.last_page.is_some());
1181
- }
1182
- }
1183
-
1184
- #[test]
1185
- fn test_chunk_text_with_invalid_boundary_range() {
1186
- use crate::types::PageBoundary;
1187
-
1188
- let config = ChunkingConfig {
1189
- max_characters: 30,
1190
- overlap: 5,
1191
- trim: true,
1192
- chunker_type: ChunkerType::Text,
1193
- };
1194
- let text = "Page one content here. Page two content.";
1195
-
1196
- let boundaries = vec![PageBoundary {
1197
- byte_start: 10,
1198
- byte_end: 5,
1199
- page_number: 1,
1200
- }];
1201
-
1202
- let result = chunk_text(text, &config, Some(&boundaries));
1203
- assert!(result.is_err());
1204
- let err = result.unwrap_err();
1205
- assert!(err.to_string().contains("Invalid boundary range"));
1206
- assert!(err.to_string().contains("byte_start"));
1207
- }
1208
-
1209
- #[test]
1210
- fn test_chunk_text_with_unsorted_boundaries() {
1211
- use crate::types::PageBoundary;
1212
-
1213
- let config = ChunkingConfig {
1214
- max_characters: 30,
1215
- overlap: 5,
1216
- trim: true,
1217
- chunker_type: ChunkerType::Text,
1218
- };
1219
- let text = "Page one content here. Page two content.";
1220
-
1221
- let boundaries = vec![
1222
- PageBoundary {
1223
- byte_start: 22,
1224
- byte_end: 40,
1225
- page_number: 2,
1226
- },
1227
- PageBoundary {
1228
- byte_start: 0,
1229
- byte_end: 21,
1230
- page_number: 1,
1231
- },
1232
- ];
1233
-
1234
- let result = chunk_text(text, &config, Some(&boundaries));
1235
- assert!(result.is_err());
1236
- let err = result.unwrap_err();
1237
- assert!(err.to_string().contains("not sorted"));
1238
- assert!(err.to_string().contains("boundaries"));
1239
- }
1240
-
1241
- #[test]
1242
- fn test_chunk_text_with_overlapping_boundaries() {
1243
- use crate::types::PageBoundary;
1244
-
1245
- let config = ChunkingConfig {
1246
- max_characters: 30,
1247
- overlap: 5,
1248
- trim: true,
1249
- chunker_type: ChunkerType::Text,
1250
- };
1251
- let text = "Page one content here. Page two content.";
1252
-
1253
- let boundaries = vec![
1254
- PageBoundary {
1255
- byte_start: 0,
1256
- byte_end: 25,
1257
- page_number: 1,
1258
- },
1259
- PageBoundary {
1260
- byte_start: 20,
1261
- byte_end: 40,
1262
- page_number: 2,
1263
- },
1264
- ];
1265
-
1266
- let result = chunk_text(text, &config, Some(&boundaries));
1267
- assert!(result.is_err());
1268
- let err = result.unwrap_err();
1269
- assert!(err.to_string().contains("Overlapping"));
1270
- assert!(err.to_string().contains("boundaries"));
1271
- }
1272
-
1273
- #[test]
1274
- fn test_calculate_page_range_with_invalid_boundaries() {
1275
- use crate::types::PageBoundary;
1276
-
1277
- let boundaries = vec![PageBoundary {
1278
- byte_start: 15,
1279
- byte_end: 10,
1280
- page_number: 1,
1281
- }];
1282
-
1283
- let result = calculate_page_range(0, 20, &boundaries);
1284
- assert!(result.is_err());
1285
- let err = result.unwrap_err();
1286
- assert!(err.to_string().contains("Invalid boundary range"));
1287
- }
1288
-
1289
- #[test]
1290
- fn test_validate_page_boundaries_valid() {
1291
- use crate::types::PageBoundary;
1292
-
1293
- let boundaries = vec![
1294
- PageBoundary {
1295
- byte_start: 0,
1296
- byte_end: 20,
1297
- page_number: 1,
1298
- },
1299
- PageBoundary {
1300
- byte_start: 20,
1301
- byte_end: 40,
1302
- page_number: 2,
1303
- },
1304
- PageBoundary {
1305
- byte_start: 40,
1306
- byte_end: 60,
1307
- page_number: 3,
1308
- },
1309
- ];
1310
-
1311
- let result = chunk_text(
1312
- "x".repeat(60).as_str(),
1313
- &ChunkingConfig {
1314
- max_characters: 30,
1315
- overlap: 5,
1316
- trim: false,
1317
- chunker_type: ChunkerType::Text,
1318
- },
1319
- Some(&boundaries),
1320
- );
1321
- assert!(result.is_ok());
1322
- }
1323
-
1324
- #[test]
1325
- fn test_validate_page_boundaries_empty() {
1326
- let boundaries: Vec<PageBoundary> = vec![];
1327
- let result = chunk_text(
1328
- "Some test text",
1329
- &ChunkingConfig {
1330
- max_characters: 30,
1331
- overlap: 5,
1332
- trim: true,
1333
- chunker_type: ChunkerType::Text,
1334
- },
1335
- Some(&boundaries),
1336
- );
1337
- assert!(result.is_ok());
1338
- }
1339
-
1340
- #[test]
1341
- fn test_page_boundaries_with_gaps() {
1342
- use crate::types::PageBoundary;
1343
-
1344
- let boundaries = vec![
1345
- PageBoundary {
1346
- byte_start: 0,
1347
- byte_end: 10,
1348
- page_number: 1,
1349
- },
1350
- PageBoundary {
1351
- byte_start: 15,
1352
- byte_end: 25,
1353
- page_number: 2,
1354
- },
1355
- ];
1356
-
1357
- let text = "0123456789XXXXX0123456789";
1358
- let result = chunk_text(
1359
- text,
1360
- &ChunkingConfig {
1361
- max_characters: 30,
1362
- overlap: 5,
1363
- trim: false,
1364
- chunker_type: ChunkerType::Text,
1365
- },
1366
- Some(&boundaries),
1367
- );
1368
- assert!(result.is_ok());
1369
- }
1370
-
1371
- #[test]
1372
- fn test_chunk_with_same_start_and_end() {
1373
- use crate::types::PageBoundary;
1374
-
1375
- let boundaries = vec![PageBoundary {
1376
- byte_start: 10,
1377
- byte_end: 10,
1378
- page_number: 1,
1379
- }];
1380
-
1381
- let result = chunk_text(
1382
- "test content here",
1383
- &ChunkingConfig {
1384
- max_characters: 30,
1385
- overlap: 5,
1386
- trim: true,
1387
- chunker_type: ChunkerType::Text,
1388
- },
1389
- Some(&boundaries),
1390
- );
1391
- assert!(result.is_err());
1392
- let err = result.unwrap_err();
1393
- assert!(err.to_string().contains("Invalid boundary range"));
1394
- }
1395
-
1396
- #[test]
1397
- fn test_multiple_overlapping_errors() {
1398
- use crate::types::PageBoundary;
1399
-
1400
- let text = "This is a longer test content string that spans more bytes";
1401
- let boundaries = vec![
1402
- PageBoundary {
1403
- byte_start: 20,
1404
- byte_end: 40,
1405
- page_number: 2,
1406
- },
1407
- PageBoundary {
1408
- byte_start: 10,
1409
- byte_end: 35,
1410
- page_number: 1,
1411
- },
1412
- ];
1413
-
1414
- let result = chunk_text(
1415
- text,
1416
- &ChunkingConfig {
1417
- max_characters: 30,
1418
- overlap: 5,
1419
- trim: true,
1420
- chunker_type: ChunkerType::Text,
1421
- },
1422
- Some(&boundaries),
1423
- );
1424
- assert!(result.is_err());
1425
- assert!(result.unwrap_err().to_string().contains("not sorted"));
1426
- }
1427
-
1428
- #[test]
1429
- fn test_chunk_with_pages_basic() {
1430
- use crate::types::PageBoundary;
1431
-
1432
- let config = ChunkingConfig {
1433
- max_characters: 25,
1434
- overlap: 5,
1435
- trim: true,
1436
- chunker_type: ChunkerType::Text,
1437
- };
1438
- let text = "First page content here.Second page content here.Third page.";
1439
-
1440
- let boundaries = vec![
1441
- PageBoundary {
1442
- byte_start: 0,
1443
- byte_end: 24,
1444
- page_number: 1,
1445
- },
1446
- PageBoundary {
1447
- byte_start: 24,
1448
- byte_end: 50,
1449
- page_number: 2,
1450
- },
1451
- PageBoundary {
1452
- byte_start: 50,
1453
- byte_end: 60,
1454
- page_number: 3,
1455
- },
1456
- ];
1457
-
1458
- let result = chunk_text(text, &config, Some(&boundaries)).unwrap();
1459
-
1460
- if !result.chunks.is_empty() {
1461
- assert!(result.chunks[0].metadata.first_page.is_some());
1462
- }
1463
- }
1464
-
1465
- #[test]
1466
- fn test_chunk_with_pages_single_page_chunk() {
1467
- use crate::types::PageBoundary;
1468
-
1469
- let config = ChunkingConfig {
1470
- max_characters: 100,
1471
- overlap: 10,
1472
- trim: true,
1473
- chunker_type: ChunkerType::Text,
1474
- };
1475
- let text = "All content on single page fits in one chunk.";
1476
-
1477
- let boundaries = vec![PageBoundary {
1478
- byte_start: 0,
1479
- byte_end: 45,
1480
- page_number: 1,
1481
- }];
1482
-
1483
- let result = chunk_text(text, &config, Some(&boundaries)).unwrap();
1484
- assert_eq!(result.chunks.len(), 1);
1485
- assert_eq!(result.chunks[0].metadata.first_page, Some(1));
1486
- assert_eq!(result.chunks[0].metadata.last_page, Some(1));
1487
- }
1488
-
1489
- #[test]
1490
- fn test_chunk_with_pages_no_overlap() {
1491
- use crate::types::PageBoundary;
1492
-
1493
- let config = ChunkingConfig {
1494
- max_characters: 20,
1495
- overlap: 0,
1496
- trim: false,
1497
- chunker_type: ChunkerType::Text,
1498
- };
1499
- let text = "AAAAA BBBBB CCCCC DDDDD";
1500
-
1501
- let boundaries = vec![
1502
- PageBoundary {
1503
- byte_start: 0,
1504
- byte_end: 11,
1505
- page_number: 1,
1506
- },
1507
- PageBoundary {
1508
- byte_start: 11,
1509
- byte_end: 23,
1510
- page_number: 2,
1511
- },
1512
- ];
1513
-
1514
- let result = chunk_text(text, &config, Some(&boundaries)).unwrap();
1515
- assert!(!result.chunks.is_empty());
1516
-
1517
- for chunk in &result.chunks {
1518
- if let (Some(first), Some(last)) = (chunk.metadata.first_page, chunk.metadata.last_page) {
1519
- assert!(first <= last);
1520
- }
1521
- }
1522
- }
1523
-
1524
- #[test]
1525
- fn test_calculate_page_range_within_page() {
1526
- let boundaries = vec![
1527
- PageBoundary {
1528
- byte_start: 0,
1529
- byte_end: 100,
1530
- page_number: 1,
1531
- },
1532
- PageBoundary {
1533
- byte_start: 100,
1534
- byte_end: 200,
1535
- page_number: 2,
1536
- },
1537
- ];
1538
-
1539
- let (first, last) = calculate_page_range(10, 50, &boundaries).unwrap();
1540
- assert_eq!(first, Some(1));
1541
- assert_eq!(last, Some(1));
1542
- }
1543
-
1544
- #[test]
1545
- fn test_calculate_page_range_spanning_pages() {
1546
- let boundaries = vec![
1547
- PageBoundary {
1548
- byte_start: 0,
1549
- byte_end: 100,
1550
- page_number: 1,
1551
- },
1552
- PageBoundary {
1553
- byte_start: 100,
1554
- byte_end: 200,
1555
- page_number: 2,
1556
- },
1557
- ];
1558
-
1559
- let (first, last) = calculate_page_range(50, 150, &boundaries).unwrap();
1560
- assert_eq!(first, Some(1));
1561
- assert_eq!(last, Some(2));
1562
- }
1563
-
1564
- #[test]
1565
- fn test_calculate_page_range_empty_boundaries() {
1566
- let boundaries: Vec<PageBoundary> = vec![];
1567
-
1568
- let (first, last) = calculate_page_range(0, 50, &boundaries).unwrap();
1569
- assert_eq!(first, None);
1570
- assert_eq!(last, None);
1571
- }
1572
-
1573
- #[test]
1574
- fn test_calculate_page_range_no_overlap() {
1575
- let boundaries = vec![
1576
- PageBoundary {
1577
- byte_start: 0,
1578
- byte_end: 100,
1579
- page_number: 1,
1580
- },
1581
- PageBoundary {
1582
- byte_start: 100,
1583
- byte_end: 200,
1584
- page_number: 2,
1585
- },
1586
- ];
1587
-
1588
- let (first, last) = calculate_page_range(200, 250, &boundaries).unwrap();
1589
- assert_eq!(first, None);
1590
- assert_eq!(last, None);
1591
- }
1592
-
1593
- #[test]
1594
- fn test_calculate_page_range_three_pages() {
1595
- let boundaries = vec![
1596
- PageBoundary {
1597
- byte_start: 0,
1598
- byte_end: 100,
1599
- page_number: 1,
1600
- },
1601
- PageBoundary {
1602
- byte_start: 100,
1603
- byte_end: 200,
1604
- page_number: 2,
1605
- },
1606
- PageBoundary {
1607
- byte_start: 200,
1608
- byte_end: 300,
1609
- page_number: 3,
1610
- },
1611
- ];
1612
-
1613
- let (first, last) = calculate_page_range(50, 250, &boundaries).unwrap();
1614
- assert_eq!(first, Some(1));
1615
- assert_eq!(last, Some(3));
1616
- }
1617
-
1618
- #[test]
1619
- fn test_chunk_metadata_page_range_accuracy() {
1620
- use crate::types::PageBoundary;
1621
-
1622
- let config = ChunkingConfig {
1623
- max_characters: 30,
1624
- overlap: 5,
1625
- trim: true,
1626
- chunker_type: ChunkerType::Text,
1627
- };
1628
- let text = "Page One Content Here.Page Two.";
1629
-
1630
- let boundaries = vec![
1631
- PageBoundary {
1632
- byte_start: 0,
1633
- byte_end: 21,
1634
- page_number: 1,
1635
- },
1636
- PageBoundary {
1637
- byte_start: 21,
1638
- byte_end: 31,
1639
- page_number: 2,
1640
- },
1641
- ];
1642
-
1643
- let result = chunk_text(text, &config, Some(&boundaries)).unwrap();
1644
-
1645
- for chunk in &result.chunks {
1646
- assert_eq!(chunk.metadata.byte_end - chunk.metadata.byte_start, chunk.content.len());
1647
- }
1648
- }
1649
-
1650
- #[test]
1651
- fn test_chunk_page_range_boundary_edge_cases() {
1652
- use crate::types::PageBoundary;
1653
-
1654
- let config = ChunkingConfig {
1655
- max_characters: 10,
1656
- overlap: 2,
1657
- trim: false,
1658
- chunker_type: ChunkerType::Text,
1659
- };
1660
- let text = "0123456789ABCDEFGHIJ";
1661
-
1662
- let boundaries = vec![
1663
- PageBoundary {
1664
- byte_start: 0,
1665
- byte_end: 10,
1666
- page_number: 1,
1667
- },
1668
- PageBoundary {
1669
- byte_start: 10,
1670
- byte_end: 20,
1671
- page_number: 2,
1672
- },
1673
- ];
1674
-
1675
- let result = chunk_text(text, &config, Some(&boundaries)).unwrap();
1676
-
1677
- for chunk in &result.chunks {
1678
- let on_page1 = chunk.metadata.byte_start < 10;
1679
- let on_page2 = chunk.metadata.byte_end > 10;
1680
-
1681
- if on_page1 && on_page2 {
1682
- assert_eq!(chunk.metadata.first_page, Some(1));
1683
- assert_eq!(chunk.metadata.last_page, Some(2));
1684
- } else if on_page1 {
1685
- assert_eq!(chunk.metadata.first_page, Some(1));
1686
- } else if on_page2 {
1687
- assert_eq!(chunk.metadata.first_page, Some(2));
1688
- }
1689
- }
1690
- }
1691
-
1692
- #[test]
1693
- fn test_validate_utf8_boundaries_valid_ascii() {
1694
- use crate::types::PageBoundary;
1695
-
1696
- let text = "This is ASCII text.";
1697
- let boundaries = vec![
1698
- PageBoundary {
1699
- byte_start: 0,
1700
- byte_end: 10,
1701
- page_number: 1,
1702
- },
1703
- PageBoundary {
1704
- byte_start: 10,
1705
- byte_end: 19,
1706
- page_number: 2,
1707
- },
1708
- ];
1709
-
1710
- let result = chunk_text(text, &ChunkingConfig::default(), Some(&boundaries));
1711
- assert!(result.is_ok());
1712
- }
1713
-
1714
- #[test]
1715
- fn test_validate_utf8_boundaries_valid_emoji() {
1716
- use crate::types::PageBoundary;
1717
-
1718
- let text = "Hello 👋 World 🌍 End";
1719
- let config = ChunkingConfig::default();
1720
-
1721
- let boundaries = vec![
1722
- PageBoundary {
1723
- byte_start: 0,
1724
- byte_end: 11,
1725
- page_number: 1,
1726
- },
1727
- PageBoundary {
1728
- byte_start: 11,
1729
- byte_end: 25,
1730
- page_number: 2,
1731
- },
1732
- ];
1733
-
1734
- let result = chunk_text(text, &config, Some(&boundaries));
1735
- assert!(result.is_ok());
1736
- }
1737
-
1738
- #[test]
1739
- fn test_validate_utf8_boundaries_valid_cjk() {
1740
- use crate::types::PageBoundary;
1741
-
1742
- let text = "你好世界 こんにちは 안녕하세요";
1743
- let config = ChunkingConfig::default();
1744
-
1745
- let boundaries = vec![
1746
- PageBoundary {
1747
- byte_start: 0,
1748
- byte_end: 13,
1749
- page_number: 1,
1750
- },
1751
- PageBoundary {
1752
- byte_start: 13,
1753
- byte_end: 44,
1754
- page_number: 2,
1755
- },
1756
- ];
1757
-
1758
- let result = chunk_text(text, &config, Some(&boundaries));
1759
- assert!(result.is_ok());
1760
- }
1761
-
1762
- #[test]
1763
- fn test_validate_utf8_boundaries_invalid_mid_emoji() {
1764
- use crate::types::PageBoundary;
1765
-
1766
- let text = "Hello 👋 World";
1767
- let boundaries = vec![PageBoundary {
1768
- byte_start: 0,
1769
- byte_end: 7,
1770
- page_number: 1,
1771
- }];
1772
-
1773
- let config = ChunkingConfig::default();
1774
- let result = chunk_text(text, &config, Some(&boundaries));
1775
- assert!(result.is_err());
1776
- let err = result.unwrap_err();
1777
- assert!(err.to_string().contains("UTF-8 character boundary"));
1778
- assert!(err.to_string().contains("byte_end=7"));
1779
- }
1780
-
1781
- #[test]
1782
- fn test_validate_utf8_boundaries_invalid_mid_multibyte_cjk() {
1783
- use crate::types::PageBoundary;
1784
-
1785
- let text = "中文文本";
1786
- let boundaries = vec![PageBoundary {
1787
- byte_start: 0,
1788
- byte_end: 1,
1789
- page_number: 1,
1790
- }];
1791
-
1792
- let config = ChunkingConfig::default();
1793
- let result = chunk_text(text, &config, Some(&boundaries));
1794
- assert!(result.is_err());
1795
- let err = result.unwrap_err();
1796
- assert!(err.to_string().contains("UTF-8 character boundary"));
1797
- }
1798
-
1799
- #[test]
1800
- fn test_validate_utf8_boundaries_byte_start_exceeds_length() {
1801
- use crate::types::PageBoundary;
1802
-
1803
- let text = "Short";
1804
- let boundaries = vec![
1805
- PageBoundary {
1806
- byte_start: 0,
1807
- byte_end: 3,
1808
- page_number: 1,
1809
- },
1810
- PageBoundary {
1811
- byte_start: 10,
1812
- byte_end: 15,
1813
- page_number: 2,
1814
- },
1815
- ];
1816
-
1817
- let config = ChunkingConfig::default();
1818
- let result = chunk_text(text, &config, Some(&boundaries));
1819
- assert!(result.is_err());
1820
- let err = result.unwrap_err();
1821
- assert!(err.to_string().contains("exceeds text length"));
1822
- }
1823
-
1824
- #[test]
1825
- fn test_validate_utf8_boundaries_byte_end_exceeds_length() {
1826
- use crate::types::PageBoundary;
1827
-
1828
- let text = "Short";
1829
- let boundaries = vec![PageBoundary {
1830
- byte_start: 0,
1831
- byte_end: 100,
1832
- page_number: 1,
1833
- }];
1834
-
1835
- let config = ChunkingConfig::default();
1836
- let result = chunk_text(text, &config, Some(&boundaries));
1837
- assert!(result.is_err());
1838
- let err = result.unwrap_err();
1839
- assert!(err.to_string().contains("exceeds text length"));
1840
- }
1841
-
1842
- #[test]
1843
- fn test_validate_utf8_boundaries_empty_boundaries() {
1844
- use crate::types::PageBoundary;
1845
-
1846
- let text = "Some text";
1847
- let boundaries: Vec<PageBoundary> = vec![];
1848
-
1849
- let config = ChunkingConfig::default();
1850
- let result = chunk_text(text, &config, Some(&boundaries));
1851
- assert!(result.is_ok());
1852
- }
1853
-
1854
- #[test]
1855
- fn test_validate_utf8_boundaries_at_text_boundaries() {
1856
- use crate::types::PageBoundary;
1857
-
1858
- let text = "Exact boundary test";
1859
- let text_len = text.len();
1860
- let boundaries = vec![PageBoundary {
1861
- byte_start: 0,
1862
- byte_end: text_len,
1863
- page_number: 1,
1864
- }];
1865
-
1866
- let config = ChunkingConfig::default();
1867
- let result = chunk_text(text, &config, Some(&boundaries));
1868
- assert!(result.is_ok());
1869
- }
1870
-
1871
- #[test]
1872
- fn test_validate_utf8_boundaries_mixed_languages() {
1873
- use crate::types::PageBoundary;
1874
-
1875
- let text = "English text mixed with 中文 and français";
1876
- let config = ChunkingConfig::default();
1877
-
1878
- let boundaries = vec![
1879
- PageBoundary {
1880
- byte_start: 0,
1881
- byte_end: 24,
1882
- page_number: 1,
1883
- },
1884
- PageBoundary {
1885
- byte_start: 24,
1886
- byte_end: text.len(),
1887
- page_number: 2,
1888
- },
1889
- ];
1890
-
1891
- let result = chunk_text(text, &config, Some(&boundaries));
1892
- assert!(result.is_ok());
1893
- }
1894
-
1895
- #[test]
1896
- fn test_chunk_text_rejects_invalid_utf8_boundaries() {
1897
- use crate::types::PageBoundary;
1898
-
1899
- let text = "🌍🌎🌏 Three emoji planets";
1900
- let config = ChunkingConfig::default();
1901
-
1902
- let boundaries = vec![PageBoundary {
1903
- byte_start: 0,
1904
- byte_end: 1000,
1905
- page_number: 1,
1906
- }];
1907
-
1908
- let result = chunk_text(text, &config, Some(&boundaries));
1909
- assert!(result.is_err());
1910
- }
1911
-
1912
- #[test]
1913
- fn test_validate_utf8_boundaries_combining_diacriticals() {
1914
- use crate::types::PageBoundary;
1915
-
1916
- let text = "café";
1917
- let config = ChunkingConfig::default();
1918
-
1919
- let boundaries = vec![
1920
- PageBoundary {
1921
- byte_start: 0,
1922
- byte_end: 2,
1923
- page_number: 1,
1924
- },
1925
- PageBoundary {
1926
- byte_start: 2,
1927
- byte_end: text.len(),
1928
- page_number: 2,
1929
- },
1930
- ];
1931
-
1932
- let result = chunk_text(text, &config, Some(&boundaries));
1933
- assert!(result.is_ok());
1934
- }
1935
-
1936
- #[test]
1937
- fn test_validate_utf8_boundaries_error_messages_are_clear() {
1938
- use crate::types::PageBoundary;
1939
-
1940
- let text = "Test 👋 text";
1941
- let config = ChunkingConfig::default();
1942
-
1943
- let boundaries = vec![PageBoundary {
1944
- byte_start: 0,
1945
- byte_end: 6,
1946
- page_number: 1,
1947
- }];
1948
-
1949
- let result = chunk_text(text, &config, Some(&boundaries));
1950
- assert!(result.is_err());
1951
- let err = result.unwrap_err();
1952
- let err_msg = err.to_string();
1953
- assert!(err_msg.contains("UTF-8"));
1954
- assert!(err_msg.contains("boundary"));
1955
- assert!(err_msg.contains("6"));
1956
- }
1957
-
1958
- #[test]
1959
- fn test_validate_utf8_boundaries_multiple_valid_boundaries() {
1960
- use crate::types::PageBoundary;
1961
-
1962
- let text = "First👋Second🌍Third";
1963
- let config = ChunkingConfig::default();
1964
-
1965
- let boundaries = vec![
1966
- PageBoundary {
1967
- byte_start: 0,
1968
- byte_end: 5,
1969
- page_number: 1,
1970
- },
1971
- PageBoundary {
1972
- byte_start: 5,
1973
- byte_end: 9,
1974
- page_number: 2,
1975
- },
1976
- PageBoundary {
1977
- byte_start: 9,
1978
- byte_end: 15,
1979
- page_number: 3,
1980
- },
1981
- PageBoundary {
1982
- byte_start: 15,
1983
- byte_end: 19,
1984
- page_number: 4,
1985
- },
1986
- PageBoundary {
1987
- byte_start: 19,
1988
- byte_end: text.len(),
1989
- page_number: 5,
1990
- },
1991
- ];
1992
-
1993
- let result = chunk_text(text, &config, Some(&boundaries));
1994
- assert!(result.is_ok());
1995
- }
1996
-
1997
- #[test]
1998
- fn test_validate_utf8_boundaries_zero_start_and_end() {
1999
- use crate::types::PageBoundary;
2000
-
2001
- let text = "Text";
2002
- let config = ChunkingConfig::default();
2003
-
2004
- let boundaries = vec![PageBoundary {
2005
- byte_start: 0,
2006
- byte_end: 0,
2007
- page_number: 1,
2008
- }];
2009
-
2010
- let result = chunk_text(text, &config, Some(&boundaries));
2011
- assert!(result.is_err());
2012
- }
2013
-
2014
- #[test]
2015
- fn test_utf8_boundaries_caching_with_many_boundaries() {
2016
- use crate::types::PageBoundary;
2017
-
2018
- let config = ChunkingConfig {
2019
- max_characters: 500,
2020
- overlap: 50,
2021
- trim: true,
2022
- chunker_type: ChunkerType::Text,
2023
- };
2024
-
2025
- let text = "🌍 Hello World ".repeat(200);
2026
- let text_len = text.len();
2027
-
2028
- let mut boundaries = vec![];
2029
- let boundary_count = 10;
2030
- let step = text_len / boundary_count;
2031
-
2032
- for i in 0..boundary_count {
2033
- let start = i * step;
2034
- let end = if i == boundary_count - 1 {
2035
- text_len
2036
- } else {
2037
- (i + 1) * step
2038
- };
2039
-
2040
- if start < end
2041
- && start <= text_len
2042
- && end <= text_len
2043
- && let Some(boundary_start) = text[..start].char_indices().last().map(|(idx, _)| idx)
2044
- && let Some(boundary_end) = text[..end].char_indices().last().map(|(idx, _)| idx)
2045
- {
2046
- boundaries.push(PageBoundary {
2047
- byte_start: boundary_start,
2048
- byte_end: boundary_end,
2049
- page_number: i + 1,
2050
- });
2051
- }
2052
- }
2053
-
2054
- if !boundaries.is_empty() {
2055
- let result = chunk_text(&text, &config, Some(&boundaries));
2056
- assert!(
2057
- result.is_ok(),
2058
- "Failed to chunk text with {} boundaries",
2059
- boundaries.len()
2060
- );
2061
-
2062
- let chunks = result.unwrap();
2063
- assert!(chunks.chunk_count > 0);
2064
- }
2065
- }
2066
-
2067
- #[test]
2068
- fn test_utf8_boundaries_caching_large_document_with_emojis() {
2069
- use crate::types::PageBoundary;
2070
-
2071
- let config = ChunkingConfig {
2072
- max_characters: 1000,
2073
- overlap: 100,
2074
- trim: true,
2075
- chunker_type: ChunkerType::Text,
2076
- };
2077
-
2078
- let large_text = "This is a large document with lots of emoji: 🌍 🚀 💻 🎉 🔥 ✨ 🎨 🌟 ".repeat(100);
2079
-
2080
- let all_indices: Vec<usize> = large_text.char_indices().map(|(idx, _)| idx).collect();
2081
-
2082
- let third_idx = all_indices.len() / 3;
2083
- let two_thirds_idx = (2 * all_indices.len()) / 3;
2084
-
2085
- let boundary_start_1 = if third_idx < all_indices.len() {
2086
- all_indices[third_idx]
2087
- } else {
2088
- large_text.len()
2089
- };
2090
-
2091
- let boundary_start_2 = if two_thirds_idx < all_indices.len() {
2092
- all_indices[two_thirds_idx]
2093
- } else {
2094
- large_text.len()
2095
- };
2096
-
2097
- let boundaries = vec![
2098
- PageBoundary {
2099
- byte_start: 0,
2100
- byte_end: boundary_start_1,
2101
- page_number: 1,
2102
- },
2103
- PageBoundary {
2104
- byte_start: boundary_start_1,
2105
- byte_end: boundary_start_2,
2106
- page_number: 2,
2107
- },
2108
- PageBoundary {
2109
- byte_start: boundary_start_2,
2110
- byte_end: large_text.len(),
2111
- page_number: 3,
2112
- },
2113
- ];
2114
-
2115
- let result = chunk_text(&large_text, &config, Some(&boundaries));
2116
- assert!(result.is_ok());
2117
-
2118
- let chunks = result.unwrap();
2119
- assert!(!chunks.chunks.is_empty());
2120
-
2121
- for chunk in &chunks.chunks {
2122
- assert!(!chunk.content.is_empty());
2123
- if let (Some(first), Some(last)) = (chunk.metadata.first_page, chunk.metadata.last_page) {
2124
- assert!(first <= last);
2125
- }
2126
- }
2127
- }
2128
-
2129
- #[test]
2130
- fn test_adaptive_validation_small_boundary_set() {
2131
- use crate::types::PageBoundary;
2132
-
2133
- let config = ChunkingConfig {
2134
- max_characters: 100,
2135
- overlap: 10,
2136
- trim: true,
2137
- chunker_type: ChunkerType::Text,
2138
- };
2139
- let text = "Hello 👋 World 🌍 End";
2140
-
2141
- let boundaries = vec![
2142
- PageBoundary {
2143
- byte_start: 0,
2144
- byte_end: 6,
2145
- page_number: 1,
2146
- },
2147
- PageBoundary {
2148
- byte_start: 6,
2149
- byte_end: 15,
2150
- page_number: 2,
2151
- },
2152
- PageBoundary {
2153
- byte_start: 15,
2154
- byte_end: text.len(),
2155
- page_number: 3,
2156
- },
2157
- ];
2158
-
2159
- let result = chunk_text(text, &config, Some(&boundaries));
2160
- assert!(result.is_ok());
2161
- }
2162
-
2163
- #[test]
2164
- fn test_adaptive_validation_threshold_boundary() {
2165
- use crate::types::PageBoundary;
2166
-
2167
- let config = ChunkingConfig {
2168
- max_characters: 200,
2169
- overlap: 20,
2170
- trim: true,
2171
- chunker_type: ChunkerType::Text,
2172
- };
2173
- let text = "Test text ".repeat(50);
2174
- let text_len = text.len();
2175
-
2176
- let mut boundaries = vec![];
2177
- let step = text_len / ADAPTIVE_VALIDATION_THRESHOLD;
2178
-
2179
- for i in 0..ADAPTIVE_VALIDATION_THRESHOLD {
2180
- let start = i * step;
2181
- let end = if i == ADAPTIVE_VALIDATION_THRESHOLD - 1 {
2182
- text_len
2183
- } else {
2184
- (i + 1) * step
2185
- };
2186
-
2187
- if start < end
2188
- && start <= text_len
2189
- && end <= text_len
2190
- && let Some(boundary_start) = text[..start.min(text_len - 1)]
2191
- .char_indices()
2192
- .last()
2193
- .map(|(idx, _)| idx)
2194
- && let Some(boundary_end) = text[..end.min(text_len)].char_indices().last().map(|(idx, _)| idx)
2195
- && boundary_start < boundary_end
2196
- {
2197
- boundaries.push(PageBoundary {
2198
- byte_start: boundary_start,
2199
- byte_end: boundary_end,
2200
- page_number: i + 1,
2201
- });
2202
- }
2203
- }
2204
-
2205
- if !boundaries.is_empty() {
2206
- let result = chunk_text(&text, &config, Some(&boundaries));
2207
- assert!(result.is_ok());
2208
- }
2209
- }
2210
-
2211
- #[test]
2212
- fn test_adaptive_validation_large_boundary_set() {
2213
- use crate::types::PageBoundary;
2214
-
2215
- let config = ChunkingConfig {
2216
- max_characters: 500,
2217
- overlap: 50,
2218
- trim: true,
2219
- chunker_type: ChunkerType::Text,
2220
- };
2221
- let text = "Lorem ipsum dolor sit amet ".repeat(100);
2222
- let text_len = text.len();
2223
-
2224
- let mut boundaries = vec![];
2225
- let boundary_count = 50;
2226
- let step = text_len / boundary_count;
2227
-
2228
- for i in 0..boundary_count {
2229
- let start = i * step;
2230
- let end = if i == boundary_count - 1 {
2231
- text_len
2232
- } else {
2233
- (i + 1) * step
2234
- };
2235
-
2236
- if start < end
2237
- && start <= text_len
2238
- && end <= text_len
2239
- && let Some(boundary_start) = text[..start.min(text_len - 1)]
2240
- .char_indices()
2241
- .last()
2242
- .map(|(idx, _)| idx)
2243
- && let Some(boundary_end) = text[..end.min(text_len)].char_indices().last().map(|(idx, _)| idx)
2244
- && boundary_start < boundary_end
2245
- {
2246
- boundaries.push(PageBoundary {
2247
- byte_start: boundary_start,
2248
- byte_end: boundary_end,
2249
- page_number: i + 1,
2250
- });
2251
- }
2252
- }
2253
-
2254
- if !boundaries.is_empty() {
2255
- let result = chunk_text(&text, &config, Some(&boundaries));
2256
- assert!(result.is_ok());
2257
- }
2258
- }
2259
-
2260
- #[test]
2261
- fn test_adaptive_validation_consistency() {
2262
- use crate::types::PageBoundary;
2263
-
2264
- let config = ChunkingConfig {
2265
- max_characters: 300,
2266
- overlap: 30,
2267
- trim: true,
2268
- chunker_type: ChunkerType::Text,
2269
- };
2270
- let text = "Mixed language: 你好 مرحبا Здравствуй ".repeat(50);
2271
-
2272
- let boundaries = vec![
2273
- PageBoundary {
2274
- byte_start: 0,
2275
- byte_end: 50,
2276
- page_number: 1,
2277
- },
2278
- PageBoundary {
2279
- byte_start: 50,
2280
- byte_end: 100,
2281
- page_number: 2,
2282
- },
2283
- PageBoundary {
2284
- byte_start: 100,
2285
- byte_end: 150,
2286
- page_number: 3,
2287
- },
2288
- PageBoundary {
2289
- byte_start: 150,
2290
- byte_end: 200,
2291
- page_number: 4,
2292
- },
2293
- PageBoundary {
2294
- byte_start: 200,
2295
- byte_end: text.len(),
2296
- page_number: 5,
2297
- },
2298
- ];
2299
-
2300
- let result = chunk_text(&text, &config, Some(&boundaries));
2301
- let _ = result;
2302
- }
2303
677
  }