kreuzberg 4.0.0.pre.rc.29 → 4.0.0.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (321) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +0 -6
  3. data/.rubocop.yaml +534 -1
  4. data/Gemfile +2 -1
  5. data/Gemfile.lock +28 -116
  6. data/README.md +269 -629
  7. data/Rakefile +0 -9
  8. data/Steepfile +4 -8
  9. data/examples/async_patterns.rb +58 -1
  10. data/ext/kreuzberg_rb/extconf.rb +5 -35
  11. data/ext/kreuzberg_rb/native/Cargo.toml +16 -55
  12. data/ext/kreuzberg_rb/native/build.rs +14 -12
  13. data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
  14. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
  15. data/ext/kreuzberg_rb/native/include/strings.h +2 -2
  16. data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
  17. data/ext/kreuzberg_rb/native/src/lib.rs +34 -897
  18. data/extconf.rb +6 -38
  19. data/kreuzberg.gemspec +20 -114
  20. data/lib/kreuzberg/api_proxy.rb +18 -2
  21. data/lib/kreuzberg/cache_api.rb +0 -22
  22. data/lib/kreuzberg/cli.rb +10 -2
  23. data/lib/kreuzberg/cli_proxy.rb +10 -0
  24. data/lib/kreuzberg/config.rb +22 -274
  25. data/lib/kreuzberg/errors.rb +7 -73
  26. data/lib/kreuzberg/extraction_api.rb +8 -237
  27. data/lib/kreuzberg/mcp_proxy.rb +11 -2
  28. data/lib/kreuzberg/ocr_backend_protocol.rb +73 -0
  29. data/lib/kreuzberg/post_processor_protocol.rb +71 -0
  30. data/lib/kreuzberg/result.rb +33 -151
  31. data/lib/kreuzberg/setup_lib_path.rb +2 -22
  32. data/lib/kreuzberg/validator_protocol.rb +73 -0
  33. data/lib/kreuzberg/version.rb +1 -1
  34. data/lib/kreuzberg.rb +13 -27
  35. data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
  36. data/sig/kreuzberg.rbs +12 -105
  37. data/spec/binding/cache_spec.rb +22 -22
  38. data/spec/binding/cli_proxy_spec.rb +4 -2
  39. data/spec/binding/cli_spec.rb +11 -12
  40. data/spec/binding/config_spec.rb +0 -74
  41. data/spec/binding/config_validation_spec.rb +6 -100
  42. data/spec/binding/error_handling_spec.rb +97 -283
  43. data/spec/binding/plugins/ocr_backend_spec.rb +8 -8
  44. data/spec/binding/plugins/postprocessor_spec.rb +11 -11
  45. data/spec/binding/plugins/validator_spec.rb +13 -12
  46. data/spec/examples.txt +104 -0
  47. data/spec/fixtures/config.toml +1 -0
  48. data/spec/fixtures/config.yaml +1 -0
  49. data/spec/fixtures/invalid_config.toml +1 -0
  50. data/spec/smoke/package_spec.rb +3 -2
  51. data/spec/spec_helper.rb +3 -1
  52. data/vendor/kreuzberg/Cargo.toml +67 -192
  53. data/vendor/kreuzberg/README.md +9 -97
  54. data/vendor/kreuzberg/build.rs +194 -516
  55. data/vendor/kreuzberg/src/api/handlers.rs +9 -130
  56. data/vendor/kreuzberg/src/api/mod.rs +3 -18
  57. data/vendor/kreuzberg/src/api/server.rs +71 -236
  58. data/vendor/kreuzberg/src/api/types.rs +7 -43
  59. data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
  60. data/vendor/kreuzberg/src/cache/mod.rs +3 -27
  61. data/vendor/kreuzberg/src/chunking/mod.rs +79 -1705
  62. data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
  63. data/vendor/kreuzberg/src/core/config.rs +23 -905
  64. data/vendor/kreuzberg/src/core/extractor.rs +106 -403
  65. data/vendor/kreuzberg/src/core/io.rs +2 -4
  66. data/vendor/kreuzberg/src/core/mime.rs +12 -2
  67. data/vendor/kreuzberg/src/core/mod.rs +3 -22
  68. data/vendor/kreuzberg/src/core/pipeline.rs +78 -395
  69. data/vendor/kreuzberg/src/embeddings.rs +21 -169
  70. data/vendor/kreuzberg/src/error.rs +2 -2
  71. data/vendor/kreuzberg/src/extraction/archive.rs +31 -36
  72. data/vendor/kreuzberg/src/extraction/docx.rs +1 -365
  73. data/vendor/kreuzberg/src/extraction/email.rs +11 -12
  74. data/vendor/kreuzberg/src/extraction/excel.rs +129 -138
  75. data/vendor/kreuzberg/src/extraction/html.rs +170 -1447
  76. data/vendor/kreuzberg/src/extraction/image.rs +14 -138
  77. data/vendor/kreuzberg/src/extraction/libreoffice.rs +3 -13
  78. data/vendor/kreuzberg/src/extraction/mod.rs +5 -21
  79. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
  80. data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
  81. data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
  82. data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
  83. data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
  84. data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
  85. data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
  86. data/vendor/kreuzberg/src/extraction/pptx.rs +94 -196
  87. data/vendor/kreuzberg/src/extraction/structured.rs +4 -5
  88. data/vendor/kreuzberg/src/extraction/table.rs +1 -2
  89. data/vendor/kreuzberg/src/extraction/text.rs +10 -18
  90. data/vendor/kreuzberg/src/extractors/archive.rs +0 -22
  91. data/vendor/kreuzberg/src/extractors/docx.rs +148 -69
  92. data/vendor/kreuzberg/src/extractors/email.rs +9 -37
  93. data/vendor/kreuzberg/src/extractors/excel.rs +40 -81
  94. data/vendor/kreuzberg/src/extractors/html.rs +173 -182
  95. data/vendor/kreuzberg/src/extractors/image.rs +8 -32
  96. data/vendor/kreuzberg/src/extractors/mod.rs +10 -171
  97. data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
  98. data/vendor/kreuzberg/src/extractors/pdf.rs +64 -329
  99. data/vendor/kreuzberg/src/extractors/pptx.rs +34 -79
  100. data/vendor/kreuzberg/src/extractors/structured.rs +0 -16
  101. data/vendor/kreuzberg/src/extractors/text.rs +7 -30
  102. data/vendor/kreuzberg/src/extractors/xml.rs +8 -27
  103. data/vendor/kreuzberg/src/keywords/processor.rs +1 -9
  104. data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
  105. data/vendor/kreuzberg/src/language_detection/mod.rs +51 -94
  106. data/vendor/kreuzberg/src/lib.rs +5 -17
  107. data/vendor/kreuzberg/src/mcp/mod.rs +1 -4
  108. data/vendor/kreuzberg/src/mcp/server.rs +21 -145
  109. data/vendor/kreuzberg/src/ocr/mod.rs +0 -2
  110. data/vendor/kreuzberg/src/ocr/processor.rs +8 -19
  111. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +0 -2
  112. data/vendor/kreuzberg/src/pdf/error.rs +1 -93
  113. data/vendor/kreuzberg/src/pdf/metadata.rs +100 -263
  114. data/vendor/kreuzberg/src/pdf/mod.rs +2 -33
  115. data/vendor/kreuzberg/src/pdf/rendering.rs +12 -12
  116. data/vendor/kreuzberg/src/pdf/table.rs +64 -61
  117. data/vendor/kreuzberg/src/pdf/text.rs +24 -416
  118. data/vendor/kreuzberg/src/plugins/extractor.rs +8 -40
  119. data/vendor/kreuzberg/src/plugins/mod.rs +0 -3
  120. data/vendor/kreuzberg/src/plugins/ocr.rs +14 -22
  121. data/vendor/kreuzberg/src/plugins/processor.rs +1 -10
  122. data/vendor/kreuzberg/src/plugins/registry.rs +0 -15
  123. data/vendor/kreuzberg/src/plugins/validator.rs +8 -20
  124. data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
  125. data/vendor/kreuzberg/src/text/mod.rs +0 -8
  126. data/vendor/kreuzberg/src/text/quality.rs +15 -28
  127. data/vendor/kreuzberg/src/text/string_utils.rs +10 -22
  128. data/vendor/kreuzberg/src/text/token_reduction/core.rs +50 -86
  129. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +16 -37
  130. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +1 -2
  131. data/vendor/kreuzberg/src/types.rs +67 -907
  132. data/vendor/kreuzberg/src/utils/mod.rs +0 -14
  133. data/vendor/kreuzberg/src/utils/quality.rs +3 -12
  134. data/vendor/kreuzberg/tests/api_tests.rs +0 -506
  135. data/vendor/kreuzberg/tests/archive_integration.rs +0 -2
  136. data/vendor/kreuzberg/tests/batch_orchestration.rs +12 -57
  137. data/vendor/kreuzberg/tests/batch_processing.rs +8 -32
  138. data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
  139. data/vendor/kreuzberg/tests/concurrency_stress.rs +8 -40
  140. data/vendor/kreuzberg/tests/config_features.rs +1 -33
  141. data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -16
  142. data/vendor/kreuzberg/tests/core_integration.rs +9 -35
  143. data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
  144. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
  145. data/vendor/kreuzberg/tests/email_integration.rs +1 -3
  146. data/vendor/kreuzberg/tests/error_handling.rs +34 -43
  147. data/vendor/kreuzberg/tests/format_integration.rs +1 -7
  148. data/vendor/kreuzberg/tests/helpers/mod.rs +0 -60
  149. data/vendor/kreuzberg/tests/image_integration.rs +0 -2
  150. data/vendor/kreuzberg/tests/mime_detection.rs +16 -17
  151. data/vendor/kreuzberg/tests/ocr_configuration.rs +0 -4
  152. data/vendor/kreuzberg/tests/ocr_errors.rs +0 -22
  153. data/vendor/kreuzberg/tests/ocr_quality.rs +0 -2
  154. data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
  155. data/vendor/kreuzberg/tests/pdf_integration.rs +0 -2
  156. data/vendor/kreuzberg/tests/pipeline_integration.rs +2 -36
  157. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +0 -5
  158. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -17
  159. data/vendor/kreuzberg/tests/plugin_system.rs +0 -6
  160. data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -2
  161. data/vendor/kreuzberg/tests/security_validation.rs +1 -13
  162. data/vendor/kreuzberg/tests/test_fastembed.rs +23 -45
  163. metadata +25 -171
  164. data/.rubocop.yml +0 -543
  165. data/ext/kreuzberg_rb/native/.cargo/config.toml +0 -23
  166. data/ext/kreuzberg_rb/native/Cargo.lock +0 -7619
  167. data/lib/kreuzberg/error_context.rb +0 -136
  168. data/lib/kreuzberg/types.rb +0 -170
  169. data/lib/libpdfium.so +0 -0
  170. data/spec/binding/async_operations_spec.rb +0 -473
  171. data/spec/binding/batch_operations_spec.rb +0 -595
  172. data/spec/binding/batch_spec.rb +0 -359
  173. data/spec/binding/config_result_spec.rb +0 -377
  174. data/spec/binding/embeddings_spec.rb +0 -816
  175. data/spec/binding/error_recovery_spec.rb +0 -488
  176. data/spec/binding/font_config_spec.rb +0 -220
  177. data/spec/binding/images_spec.rb +0 -738
  178. data/spec/binding/keywords_extraction_spec.rb +0 -600
  179. data/spec/binding/metadata_types_spec.rb +0 -1228
  180. data/spec/binding/pages_extraction_spec.rb +0 -471
  181. data/spec/binding/tables_spec.rb +0 -641
  182. data/spec/unit/config/chunking_config_spec.rb +0 -213
  183. data/spec/unit/config/embedding_config_spec.rb +0 -343
  184. data/spec/unit/config/extraction_config_spec.rb +0 -438
  185. data/spec/unit/config/font_config_spec.rb +0 -285
  186. data/spec/unit/config/hierarchy_config_spec.rb +0 -314
  187. data/spec/unit/config/image_extraction_config_spec.rb +0 -209
  188. data/spec/unit/config/image_preprocessing_config_spec.rb +0 -249
  189. data/spec/unit/config/keyword_config_spec.rb +0 -229
  190. data/spec/unit/config/language_detection_config_spec.rb +0 -258
  191. data/spec/unit/config/ocr_config_spec.rb +0 -171
  192. data/spec/unit/config/page_config_spec.rb +0 -221
  193. data/spec/unit/config/pdf_config_spec.rb +0 -267
  194. data/spec/unit/config/postprocessor_config_spec.rb +0 -290
  195. data/spec/unit/config/tesseract_config_spec.rb +0 -181
  196. data/spec/unit/config/token_reduction_config_spec.rb +0 -251
  197. data/test/metadata_types_test.rb +0 -959
  198. data/vendor/Cargo.toml +0 -61
  199. data/vendor/kreuzberg/examples/bench_fixes.rs +0 -71
  200. data/vendor/kreuzberg/examples/test_pdfium_fork.rs +0 -62
  201. data/vendor/kreuzberg/src/chunking/processor.rs +0 -219
  202. data/vendor/kreuzberg/src/core/batch_optimizations.rs +0 -385
  203. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  204. data/vendor/kreuzberg/src/core/formats.rs +0 -235
  205. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  206. data/vendor/kreuzberg/src/extraction/capacity.rs +0 -263
  207. data/vendor/kreuzberg/src/extraction/markdown.rs +0 -216
  208. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -284
  209. data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -470
  210. data/vendor/kreuzberg/src/extractors/docbook.rs +0 -504
  211. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  212. data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -492
  213. data/vendor/kreuzberg/src/extractors/jats.rs +0 -1054
  214. data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -368
  215. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  216. data/vendor/kreuzberg/src/extractors/markdown.rs +0 -701
  217. data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
  218. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  219. data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -529
  220. data/vendor/kreuzberg/src/extractors/rst.rs +0 -577
  221. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  222. data/vendor/kreuzberg/src/extractors/security.rs +0 -484
  223. data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
  224. data/vendor/kreuzberg/src/extractors/typst.rs +0 -651
  225. data/vendor/kreuzberg/src/language_detection/processor.rs +0 -218
  226. data/vendor/kreuzberg/src/ocr/language_registry.rs +0 -520
  227. data/vendor/kreuzberg/src/panic_context.rs +0 -154
  228. data/vendor/kreuzberg/src/pdf/bindings.rs +0 -306
  229. data/vendor/kreuzberg/src/pdf/bundled.rs +0 -408
  230. data/vendor/kreuzberg/src/pdf/fonts.rs +0 -358
  231. data/vendor/kreuzberg/src/pdf/hierarchy.rs +0 -903
  232. data/vendor/kreuzberg/src/text/quality_processor.rs +0 -231
  233. data/vendor/kreuzberg/src/text/utf8_validation.rs +0 -193
  234. data/vendor/kreuzberg/src/utils/pool.rs +0 -503
  235. data/vendor/kreuzberg/src/utils/pool_sizing.rs +0 -364
  236. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -761
  237. data/vendor/kreuzberg/tests/api_embed.rs +0 -360
  238. data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
  239. data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +0 -471
  240. data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +0 -289
  241. data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +0 -154
  242. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
  243. data/vendor/kreuzberg/tests/config_integration_test.rs +0 -753
  244. data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +0 -294
  245. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -500
  246. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
  247. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
  248. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
  249. data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
  250. data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
  251. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
  252. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
  253. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
  254. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
  255. data/vendor/kreuzberg/tests/ocr_language_registry.rs +0 -191
  256. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -674
  257. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
  258. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
  259. data/vendor/kreuzberg/tests/page_markers.rs +0 -297
  260. data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +0 -301
  261. data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +0 -589
  262. data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +0 -301
  263. data/vendor/kreuzberg/tests/pdf_text_merging.rs +0 -475
  264. data/vendor/kreuzberg/tests/pdfium_linking.rs +0 -340
  265. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -694
  266. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -775
  267. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1260
  268. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -648
  269. data/vendor/kreuzberg-ffi/Cargo.toml +0 -67
  270. data/vendor/kreuzberg-ffi/README.md +0 -851
  271. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +0 -227
  272. data/vendor/kreuzberg-ffi/build.rs +0 -168
  273. data/vendor/kreuzberg-ffi/cbindgen.toml +0 -37
  274. data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +0 -12
  275. data/vendor/kreuzberg-ffi/kreuzberg.h +0 -3012
  276. data/vendor/kreuzberg-ffi/src/batch_streaming.rs +0 -588
  277. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
  278. data/vendor/kreuzberg-ffi/src/error.rs +0 -901
  279. data/vendor/kreuzberg-ffi/src/extraction.rs +0 -555
  280. data/vendor/kreuzberg-ffi/src/helpers.rs +0 -879
  281. data/vendor/kreuzberg-ffi/src/lib.rs +0 -977
  282. data/vendor/kreuzberg-ffi/src/memory.rs +0 -493
  283. data/vendor/kreuzberg-ffi/src/mime.rs +0 -329
  284. data/vendor/kreuzberg-ffi/src/panic_shield.rs +0 -265
  285. data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +0 -442
  286. data/vendor/kreuzberg-ffi/src/plugins/mod.rs +0 -14
  287. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +0 -628
  288. data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +0 -438
  289. data/vendor/kreuzberg-ffi/src/plugins/validator.rs +0 -329
  290. data/vendor/kreuzberg-ffi/src/result.rs +0 -510
  291. data/vendor/kreuzberg-ffi/src/result_pool.rs +0 -639
  292. data/vendor/kreuzberg-ffi/src/result_view.rs +0 -773
  293. data/vendor/kreuzberg-ffi/src/string_intern.rs +0 -568
  294. data/vendor/kreuzberg-ffi/src/types.rs +0 -363
  295. data/vendor/kreuzberg-ffi/src/util.rs +0 -210
  296. data/vendor/kreuzberg-ffi/src/validation.rs +0 -848
  297. data/vendor/kreuzberg-ffi/tests.disabled/README.md +0 -48
  298. data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +0 -299
  299. data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +0 -346
  300. data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +0 -232
  301. data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +0 -470
  302. data/vendor/kreuzberg-tesseract/.commitlintrc.json +0 -13
  303. data/vendor/kreuzberg-tesseract/.crate-ignore +0 -2
  304. data/vendor/kreuzberg-tesseract/Cargo.lock +0 -2933
  305. data/vendor/kreuzberg-tesseract/Cargo.toml +0 -57
  306. data/vendor/kreuzberg-tesseract/LICENSE +0 -22
  307. data/vendor/kreuzberg-tesseract/README.md +0 -399
  308. data/vendor/kreuzberg-tesseract/build.rs +0 -1127
  309. data/vendor/kreuzberg-tesseract/patches/README.md +0 -71
  310. data/vendor/kreuzberg-tesseract/patches/tesseract.diff +0 -199
  311. data/vendor/kreuzberg-tesseract/src/api.rs +0 -1371
  312. data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +0 -77
  313. data/vendor/kreuzberg-tesseract/src/enums.rs +0 -297
  314. data/vendor/kreuzberg-tesseract/src/error.rs +0 -81
  315. data/vendor/kreuzberg-tesseract/src/lib.rs +0 -145
  316. data/vendor/kreuzberg-tesseract/src/monitor.rs +0 -57
  317. data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +0 -197
  318. data/vendor/kreuzberg-tesseract/src/page_iterator.rs +0 -253
  319. data/vendor/kreuzberg-tesseract/src/result_iterator.rs +0 -286
  320. data/vendor/kreuzberg-tesseract/src/result_renderer.rs +0 -183
  321. data/vendor/kreuzberg-tesseract/tests/integration_test.rs +0 -211
@@ -19,18 +19,6 @@ use kreuzberg::core::extractor::extract_file_sync;
19
19
 
20
20
  mod helpers;
21
21
 
22
- fn trim_trailing_newlines(value: &str) -> &str {
23
- value.trim_end_matches(['\n', '\r'])
24
- }
25
-
26
- fn assert_text_content(actual: &str, expected: &str) {
27
- assert_eq!(
28
- trim_trailing_newlines(actual),
29
- expected,
30
- "Content mismatch after trimming trailing newlines"
31
- );
32
- }
33
-
34
22
  /// Test that batch extraction processes documents in parallel.
35
23
  ///
36
24
  /// Validates:
@@ -307,13 +295,9 @@ async fn test_batch_bytes_parallel_processing() {
307
295
  .collect();
308
296
 
309
297
  let contents_ref: Vec<(&[u8], &str)> = contents.iter().map(|(bytes, mime)| (bytes.as_slice(), *mime)).collect();
310
- let owned_contents: Vec<(Vec<u8>, String)> = contents_ref
311
- .into_iter()
312
- .map(|(bytes, mime)| (bytes.to_vec(), mime.to_string()))
313
- .collect();
314
298
 
315
299
  let start = Instant::now();
316
- let results = batch_extract_bytes(owned_contents, &config).await;
300
+ let results = batch_extract_bytes(contents_ref, &config).await;
317
301
  let duration = start.elapsed();
318
302
 
319
303
  assert!(results.is_ok());
@@ -321,8 +305,7 @@ async fn test_batch_bytes_parallel_processing() {
321
305
  assert_eq!(results.len(), 30);
322
306
 
323
307
  for (i, result) in results.iter().enumerate() {
324
- let expected = format!("Test content number {}", i);
325
- assert_text_content(&result.content, &expected);
308
+ assert_eq!(result.content, format!("Test content number {}", i));
326
309
  }
327
310
 
328
311
  println!("Batch processed 30 byte arrays in {:?}", duration);
@@ -341,20 +324,15 @@ async fn test_batch_bytes_mixed_valid_invalid() {
341
324
  (b"valid content 3".as_slice(), "text/plain"),
342
325
  ];
343
326
 
344
- let owned_contents: Vec<(Vec<u8>, String)> = contents
345
- .into_iter()
346
- .map(|(bytes, mime)| (bytes.to_vec(), mime.to_string()))
347
- .collect();
348
-
349
- let results = batch_extract_bytes(owned_contents, &config).await;
327
+ let results = batch_extract_bytes(contents, &config).await;
350
328
 
351
329
  assert!(results.is_ok());
352
330
  let results = results.unwrap();
353
331
  assert_eq!(results.len(), 5);
354
332
 
355
- assert_text_content(&results[0].content, "valid content 1");
356
- assert_text_content(&results[2].content, "valid content 2");
357
- assert_text_content(&results[4].content, "valid content 3");
333
+ assert_eq!(results[0].content, "valid content 1");
334
+ assert_eq!(results[2].content, "valid content 2");
335
+ assert_eq!(results[4].content, "valid content 3");
358
336
 
359
337
  assert!(results[1].metadata.error.is_some());
360
338
  assert!(results[3].metadata.error.is_some());
@@ -383,13 +361,9 @@ async fn test_batch_utilizes_multiple_cores() {
383
361
  }
384
362
 
385
363
  let contents_ref: Vec<(&[u8], &str)> = contents.iter().map(|(bytes, mime)| (bytes.as_slice(), *mime)).collect();
386
- let owned_contents: Vec<(Vec<u8>, String)> = contents_ref
387
- .into_iter()
388
- .map(|(bytes, mime)| (bytes.to_vec(), mime.to_string()))
389
- .collect();
390
364
 
391
365
  let start = Instant::now();
392
- let results = batch_extract_bytes(owned_contents, &config).await;
366
+ let results = batch_extract_bytes(contents_ref, &config).await;
393
367
  let duration = start.elapsed();
394
368
 
395
369
  assert!(results.is_ok());
@@ -426,13 +400,9 @@ async fn test_batch_memory_pressure_handling() {
426
400
  }
427
401
 
428
402
  let contents_ref: Vec<(&[u8], &str)> = contents.iter().map(|(bytes, mime)| (bytes.as_slice(), *mime)).collect();
429
- let owned_contents: Vec<(Vec<u8>, String)> = contents_ref
430
- .into_iter()
431
- .map(|(bytes, mime)| (bytes.to_vec(), mime.to_string()))
432
- .collect();
433
403
 
434
404
  let start = Instant::now();
435
- let results = batch_extract_bytes(owned_contents, &config).await;
405
+ let results = batch_extract_bytes(contents_ref, &config).await;
436
406
  let duration = start.elapsed();
437
407
 
438
408
  assert!(results.is_ok());
@@ -462,13 +432,8 @@ async fn test_batch_scales_with_cpu_count() {
462
432
 
463
433
  let contents_ref: Vec<(&[u8], &str)> = contents.iter().map(|(bytes, mime)| (bytes.as_slice(), *mime)).collect();
464
434
 
465
- let owned_contents_1: Vec<(Vec<u8>, String)> = contents_ref
466
- .iter()
467
- .map(|(bytes, mime)| (bytes.to_vec(), mime.to_string()))
468
- .collect();
469
-
470
435
  let start = Instant::now();
471
- let _ = batch_extract_bytes(owned_contents_1, &config_1).await.unwrap();
436
+ let _ = batch_extract_bytes(contents_ref.clone(), &config_1).await.unwrap();
472
437
  let duration_1 = start.elapsed();
473
438
 
474
439
  let config_full = ExtractionConfig {
@@ -476,13 +441,8 @@ async fn test_batch_scales_with_cpu_count() {
476
441
  ..Default::default()
477
442
  };
478
443
 
479
- let owned_contents_full: Vec<(Vec<u8>, String)> = contents_ref
480
- .into_iter()
481
- .map(|(bytes, mime)| (bytes.to_vec(), mime.to_string()))
482
- .collect();
483
-
484
444
  let start = Instant::now();
485
- let _ = batch_extract_bytes(owned_contents_full, &config_full).await.unwrap();
445
+ let _ = batch_extract_bytes(contents_ref, &config_full).await.unwrap();
486
446
  let duration_full = start.elapsed();
487
447
 
488
448
  println!(
@@ -566,20 +526,15 @@ async fn test_batch_accuracy_under_load() {
566
526
  }
567
527
 
568
528
  let contents_ref: Vec<(&[u8], &str)> = contents.iter().map(|(bytes, mime)| (bytes.as_slice(), *mime)).collect();
569
- let owned_contents: Vec<(Vec<u8>, String)> = contents_ref
570
- .into_iter()
571
- .map(|(bytes, mime)| (bytes.to_vec(), mime.to_string()))
572
- .collect();
573
529
 
574
- let results = batch_extract_bytes(owned_contents, &config).await.unwrap();
530
+ let results = batch_extract_bytes(contents_ref, &config).await.unwrap();
575
531
 
576
532
  assert_eq!(results.len(), 100);
577
533
 
578
534
  for (i, result) in results.iter().enumerate() {
579
535
  let expected = format!("Document number {} with unique content", i);
580
536
  assert_eq!(
581
- trim_trailing_newlines(&result.content),
582
- expected,
537
+ result.content, expected,
583
538
  "Document {} content mismatch - possible cross-contamination",
584
539
  i
585
540
  );
@@ -4,29 +4,16 @@
4
4
  //! Validates concurrent processing, error handling, and performance.
5
5
 
6
6
  use kreuzberg::core::config::ExtractionConfig;
7
- #[cfg(feature = "pdf")]
8
- use kreuzberg::core::extractor::batch_extract_file_sync;
9
- use kreuzberg::core::extractor::{batch_extract_bytes, batch_extract_bytes_sync, batch_extract_file};
7
+ use kreuzberg::core::extractor::{
8
+ batch_extract_bytes, batch_extract_bytes_sync, batch_extract_file, batch_extract_file_sync,
9
+ };
10
10
  use std::path::PathBuf;
11
11
 
12
12
  mod helpers;
13
13
  use helpers::{get_test_documents_dir, get_test_file_path, skip_if_missing, test_documents_available};
14
14
 
15
- fn trim_trailing_newlines(value: &str) -> &str {
16
- value.trim_end_matches(['\n', '\r'])
17
- }
18
-
19
- fn assert_text_content(actual: &str, expected: &str) {
20
- assert_eq!(
21
- trim_trailing_newlines(actual),
22
- expected,
23
- "Content mismatch after trimming trailing newlines"
24
- );
25
- }
26
-
27
15
  /// Test batch extraction with multiple file formats (PDF, DOCX, TXT).
28
16
  #[tokio::test]
29
- #[cfg(all(feature = "pdf", feature = "office", feature = "tokio-runtime"))]
30
17
  async fn test_batch_extract_file_multiple_formats() {
31
18
  if !test_documents_available() {
32
19
  println!("Skipping test: test_documents/ directory not found");
@@ -74,7 +61,6 @@ async fn test_batch_extract_file_multiple_formats() {
74
61
 
75
62
  /// Test synchronous batch extraction variant.
76
63
  #[test]
77
- #[cfg(feature = "pdf")]
78
64
  fn test_batch_extract_file_sync_variant() {
79
65
  if !test_documents_available() {
80
66
  println!("Skipping test: test_documents/ directory not found");
@@ -129,19 +115,14 @@ async fn test_batch_extract_bytes_multiple() {
129
115
  (json_bytes.as_slice(), "application/json"),
130
116
  ];
131
117
 
132
- let owned_contents: Vec<(Vec<u8>, String)> = contents
133
- .into_iter()
134
- .map(|(bytes, mime)| (bytes.to_vec(), mime.to_string()))
135
- .collect();
136
-
137
- let results = batch_extract_bytes(owned_contents, &config).await;
118
+ let results = batch_extract_bytes(contents, &config).await;
138
119
 
139
120
  assert!(results.is_ok(), "Batch bytes extraction should succeed");
140
121
  let results = results.unwrap();
141
122
 
142
123
  assert_eq!(results.len(), 3);
143
124
 
144
- assert_text_content(&results[0].content, "This is plain text content");
125
+ assert_eq!(results[0].content, "This is plain text content");
145
126
  assert_eq!(results[0].mime_type, "text/plain");
146
127
 
147
128
  assert!(results[1].content.contains("Markdown Header"));
@@ -311,18 +292,13 @@ fn test_batch_extract_bytes_sync_variant() {
311
292
  (b"# content 3".as_slice(), "text/markdown"),
312
293
  ];
313
294
 
314
- let owned_contents: Vec<(Vec<u8>, String)> = contents
315
- .into_iter()
316
- .map(|(bytes, mime)| (bytes.to_vec(), mime.to_string()))
317
- .collect();
318
-
319
- let results = batch_extract_bytes_sync(owned_contents, &config);
295
+ let results = batch_extract_bytes_sync(contents, &config);
320
296
 
321
297
  assert!(results.is_ok(), "Sync batch bytes extraction should succeed");
322
298
  let results = results.unwrap();
323
299
 
324
300
  assert_eq!(results.len(), 3);
325
- assert_text_content(&results[0].content, "content 1");
326
- assert_text_content(&results[1].content, "content 2");
301
+ assert_eq!(results[0].content, "content 1");
302
+ assert_eq!(results[1].content, "content 2");
327
303
  assert!(results[2].content.contains("content 3"));
328
304
  }
@@ -0,0 +1,92 @@
1
+ #[cfg(feature = "chunking")]
2
+ #[test]
3
+ fn demonstrate_correct_offset_calculation() {
4
+ use kreuzberg::chunking::{ChunkerType, ChunkingConfig, chunk_text};
5
+
6
+ println!("\n=== Demonstrating Correct Chunking Offset Calculation ===\n");
7
+
8
+ let config_with_overlap = ChunkingConfig {
9
+ max_characters: 20,
10
+ overlap: 5,
11
+ trim: false,
12
+ chunker_type: ChunkerType::Text,
13
+ };
14
+
15
+ let text = "AAAAA BBBBB CCCCC DDDDD EEEEE FFFFF";
16
+ println!("Text: \"{}\"", text);
17
+ println!(
18
+ "Max characters: {}, Overlap: {}\n",
19
+ config_with_overlap.max_characters, config_with_overlap.overlap
20
+ );
21
+
22
+ let result = chunk_text(text, &config_with_overlap).unwrap();
23
+
24
+ println!("WITH OVERLAP (5 chars):");
25
+ for (i, chunk) in result.chunks.iter().enumerate() {
26
+ println!(
27
+ " Chunk {}: [{:3} - {:3}] = \"{}\"",
28
+ i,
29
+ chunk.metadata.char_start,
30
+ chunk.metadata.char_end,
31
+ chunk.content.replace('\n', "\\n")
32
+ );
33
+ }
34
+
35
+ println!("\nOverlap verification:");
36
+ for i in 0..result.chunks.len() - 1 {
37
+ let current = &result.chunks[i];
38
+ let next = &result.chunks[i + 1];
39
+ let overlap_size = current.metadata.char_end - next.metadata.char_start;
40
+ println!(
41
+ " Chunks {} and {}: overlap = {} chars (next starts at {} while current ends at {})",
42
+ i,
43
+ i + 1,
44
+ overlap_size,
45
+ next.metadata.char_start,
46
+ current.metadata.char_end
47
+ );
48
+ assert!(
49
+ overlap_size > 0 && overlap_size <= config_with_overlap.overlap + 10,
50
+ "Overlap should exist and be reasonable"
51
+ );
52
+ }
53
+
54
+ println!("\n\n=== Without Overlap ===\n");
55
+ let config_no_overlap = ChunkingConfig {
56
+ max_characters: 20,
57
+ overlap: 0,
58
+ trim: false,
59
+ chunker_type: ChunkerType::Text,
60
+ };
61
+
62
+ let result_no_overlap = chunk_text(text, &config_no_overlap).unwrap();
63
+
64
+ println!("WITHOUT OVERLAP:");
65
+ for (i, chunk) in result_no_overlap.chunks.iter().enumerate() {
66
+ println!(
67
+ " Chunk {}: [{:3} - {:3}] = \"{}\"",
68
+ i,
69
+ chunk.metadata.char_start,
70
+ chunk.metadata.char_end,
71
+ chunk.content.replace('\n', "\\n")
72
+ );
73
+ }
74
+
75
+ println!("\nAdjacency verification:");
76
+ for i in 0..result_no_overlap.chunks.len() - 1 {
77
+ let current = &result_no_overlap.chunks[i];
78
+ let next = &result_no_overlap.chunks[i + 1];
79
+ let gap = next.metadata.char_start as i32 - current.metadata.char_end as i32;
80
+ println!(
81
+ " Chunks {} and {}: gap = {} (next starts at {}, current ends at {})",
82
+ i,
83
+ i + 1,
84
+ gap,
85
+ next.metadata.char_start,
86
+ current.metadata.char_end
87
+ );
88
+ assert!(gap >= 0, "Should have no overlap (gap >= 0)");
89
+ }
90
+
91
+ println!("\n✓ All offset calculations are correct!");
92
+ }
@@ -18,6 +18,7 @@ use kreuzberg::plugins::registry::{get_document_extractor_registry, get_post_pro
18
18
  use kreuzberg::plugins::{Plugin, PostProcessor, ProcessingStage};
19
19
  use kreuzberg::types::{ExtractionResult, Metadata};
20
20
  use std::sync::Arc;
21
+ use std::sync::atomic::{AtomicUsize, Ordering};
21
22
 
22
23
  #[cfg(feature = "ocr")]
23
24
  use kreuzberg::core::config::OcrConfig;
@@ -29,18 +30,6 @@ use tokio::time::timeout;
29
30
 
30
31
  mod helpers;
31
32
 
32
- fn trim_trailing_newlines(value: &str) -> &str {
33
- value.trim_end_matches(['\n', '\r'])
34
- }
35
-
36
- fn assert_text_content(actual: &str, expected: &str) {
37
- assert_eq!(
38
- trim_trailing_newlines(actual),
39
- expected,
40
- "Content mismatch after trimming trailing newlines"
41
- );
42
- }
43
-
44
33
  /// Test many concurrent extractions of different MIME types.
45
34
  ///
46
35
  /// Validates that:
@@ -51,16 +40,13 @@ fn assert_text_content(actual: &str, expected: &str) {
51
40
  async fn test_concurrent_extractions_mixed_formats() {
52
41
  let config = ExtractionConfig::default();
53
42
 
54
- #[allow(unused_mut)]
55
- let mut test_cases = vec![
43
+ let test_cases = vec![
56
44
  (b"Plain text content" as &[u8], "text/plain"),
57
45
  (b"{\"key\": \"value\"}", "application/json"),
46
+ (b"<root><item>XML content</item></root>", "application/xml"),
58
47
  (b"# Markdown\n\nContent here", "text/markdown"),
59
48
  ];
60
49
 
61
- #[cfg(feature = "xml")]
62
- test_cases.push((b"<root><item>XML content</item></root>" as &[u8], "application/xml"));
63
-
64
50
  let mut handles = vec![];
65
51
  for _ in 0..10 {
66
52
  for (data, mime_type) in &test_cases {
@@ -109,11 +95,7 @@ async fn test_concurrent_batch_extractions() {
109
95
 
110
96
  handles.push(tokio::spawn(async move {
111
97
  let data: Vec<(&[u8], &str)> = contents_clone.iter().map(|c| (c.as_slice(), "text/plain")).collect();
112
- let owned_data: Vec<(Vec<u8>, String)> = data
113
- .into_iter()
114
- .map(|(bytes, mime)| (bytes.to_vec(), mime.to_string()))
115
- .collect();
116
- batch_extract_bytes(owned_data, &config).await
98
+ batch_extract_bytes(data, &config).await
117
99
  }));
118
100
  }
119
101
 
@@ -139,8 +121,6 @@ async fn test_concurrent_extractions_with_cache() {
139
121
  enabled: false,
140
122
  enabled_processors: None,
141
123
  disabled_processors: None,
142
- enabled_set: None,
143
- disabled_set: None,
144
124
  }),
145
125
  ..Default::default()
146
126
  };
@@ -164,7 +144,7 @@ async fn test_concurrent_extractions_with_cache() {
164
144
  let result = handle.await.expect("Task should not panic");
165
145
  assert!(result.is_ok(), "Cache read should succeed");
166
146
  let extraction = result.unwrap();
167
- assert_text_content(&extraction.content, expected_content);
147
+ assert_eq!(extraction.content, expected_content);
168
148
  }
169
149
  }
170
150
 
@@ -179,10 +159,6 @@ async fn test_concurrent_extractions_with_cache() {
179
159
  async fn test_concurrent_ocr_processing() {
180
160
  use helpers::{get_test_file_path, skip_if_missing};
181
161
 
182
- if cfg!(windows) {
183
- return;
184
- }
185
-
186
162
  if skip_if_missing("images/ocr_image.jpg") {
187
163
  tracing::debug!("Skipping concurrent OCR test: test file not available");
188
164
  return;
@@ -250,7 +226,6 @@ async fn test_concurrent_ocr_processing() {
250
226
  #[test]
251
227
  fn test_concurrent_ocr_cache_stress() {
252
228
  use helpers::{get_test_file_path, skip_if_missing};
253
- use std::sync::atomic::{AtomicUsize, Ordering};
254
229
 
255
230
  if skip_if_missing("images/ocr_image.jpg") {
256
231
  tracing::debug!("Skipping OCR cache stress test: test file not available");
@@ -358,8 +333,6 @@ async fn test_concurrent_pipeline_processing() {
358
333
  enabled: true,
359
334
  enabled_processors: Some(vec!["concurrent-test".to_string()]),
360
335
  disabled_processors: None,
361
- enabled_set: None,
362
- disabled_set: None,
363
336
  }),
364
337
  ..Default::default()
365
338
  };
@@ -377,7 +350,6 @@ async fn test_concurrent_pipeline_processing() {
377
350
  detected_languages: None,
378
351
  chunks: None,
379
352
  images: None,
380
- pages: None,
381
353
  };
382
354
 
383
355
  run_pipeline(result, &config).await
@@ -498,16 +470,13 @@ async fn test_high_concurrency_stress() {
498
470
  ..Default::default()
499
471
  };
500
472
 
501
- #[allow(unused_mut)]
502
- let mut formats = vec![
473
+ let formats = vec![
503
474
  (b"Text content" as &[u8], "text/plain"),
504
475
  (b"{\"json\": true}", "application/json"),
476
+ (b"<xml><item>content</item></xml>", "application/xml"),
505
477
  (b"# Markdown\n\nContent", "text/markdown"),
506
478
  ];
507
479
 
508
- #[cfg(feature = "xml")]
509
- formats.push((b"<xml><item>content</item></xml>" as &[u8], "application/xml"));
510
-
511
480
  let mut handles = vec![];
512
481
  for _ in 0..100 {
513
482
  for (data, mime_type) in &formats {
@@ -531,10 +500,9 @@ async fn test_high_concurrency_stress() {
531
500
  .await
532
501
  .expect("High-load stress test should complete within 60s");
533
502
 
534
- let expected_successes = 100 * formats.len();
535
503
  let success_count = results.iter().filter(|r| r.is_ok()).count();
536
504
  assert_eq!(
537
- success_count, expected_successes,
505
+ success_count, 400,
538
506
  "All extractions should succeed under stress, got {} successes",
539
507
  success_count
540
508
  );
@@ -3,19 +3,13 @@
3
3
  //! Tests for chunking, language detection, caching, token reduction, and quality processing.
4
4
  //! Validates that configuration options work correctly end-to-end.
5
5
 
6
- #[cfg(feature = "chunking")]
7
- use kreuzberg::core::config::ChunkingConfig;
8
- use kreuzberg::core::config::ExtractionConfig;
9
- #[cfg(feature = "language-detection")]
10
- use kreuzberg::core::config::LanguageDetectionConfig;
11
- use kreuzberg::core::config::TokenReductionConfig;
6
+ use kreuzberg::core::config::{ChunkingConfig, ExtractionConfig, LanguageDetectionConfig, TokenReductionConfig};
12
7
  use kreuzberg::core::extractor::extract_bytes;
13
8
 
14
9
  mod helpers;
15
10
 
16
11
  /// Test chunking enabled - text split into chunks.
17
12
  #[tokio::test]
18
- #[cfg(feature = "chunking")]
19
13
  async fn test_chunking_enabled() {
20
14
  let config = ExtractionConfig {
21
15
  chunking: Some(ChunkingConfig {
@@ -58,7 +52,6 @@ async fn test_chunking_enabled() {
58
52
 
59
53
  /// Test chunking with overlap - overlap preserved between chunks.
60
54
  #[tokio::test]
61
- #[cfg(feature = "chunking")]
62
55
  async fn test_chunking_with_overlap() {
63
56
  let config = ExtractionConfig {
64
57
  chunking: Some(ChunkingConfig {
@@ -98,7 +91,6 @@ async fn test_chunking_with_overlap() {
98
91
 
99
92
  /// Test chunking with custom sizes - custom chunk size and overlap.
100
93
  #[tokio::test]
101
- #[cfg(feature = "chunking")]
102
94
  async fn test_chunking_custom_sizes() {
103
95
  let config = ExtractionConfig {
104
96
  chunking: Some(ChunkingConfig {
@@ -159,7 +151,6 @@ async fn test_chunking_disabled() {
159
151
 
160
152
  /// Test language detection for single language document.
161
153
  #[tokio::test]
162
- #[cfg(feature = "language-detection")]
163
154
  async fn test_language_detection_single() {
164
155
  let config = ExtractionConfig {
165
156
  language_detection: Some(LanguageDetectionConfig {
@@ -186,7 +177,6 @@ async fn test_language_detection_single() {
186
177
  /// Test language detection for multi-language document.
187
178
  #[cfg_attr(coverage, ignore = "coverage instrumentation affects multi-language heuristics")]
188
179
  #[tokio::test]
189
- #[cfg(feature = "language-detection")]
190
180
  async fn test_language_detection_multiple() {
191
181
  let config = ExtractionConfig {
192
182
  language_detection: Some(LanguageDetectionConfig {
@@ -211,7 +201,6 @@ async fn test_language_detection_multiple() {
211
201
 
212
202
  /// Test language detection with confidence threshold.
213
203
  #[tokio::test]
214
- #[cfg(feature = "language-detection")]
215
204
  async fn test_language_detection_confidence() {
216
205
  let config = ExtractionConfig {
217
206
  language_detection: Some(LanguageDetectionConfig {
@@ -236,7 +225,6 @@ async fn test_language_detection_confidence() {
236
225
 
237
226
  /// Test language detection disabled.
238
227
  #[tokio::test]
239
- #[cfg(feature = "language-detection")]
240
228
  async fn test_language_detection_disabled() {
241
229
  let config = ExtractionConfig {
242
230
  language_detection: Some(LanguageDetectionConfig {
@@ -409,7 +397,6 @@ async fn test_token_reduction_disabled() {
409
397
 
410
398
  /// Test quality processing enabled - quality scoring applied.
411
399
  #[tokio::test]
412
- #[cfg(feature = "quality")]
413
400
  async fn test_quality_processing_enabled() {
414
401
  let config = ExtractionConfig {
415
402
  enable_quality_processing: true,
@@ -433,7 +420,6 @@ async fn test_quality_processing_enabled() {
433
420
 
434
421
  /// Test quality processing calculates score for different text quality.
435
422
  #[tokio::test]
436
- #[cfg(feature = "quality")]
437
423
  async fn test_quality_threshold_filtering() {
438
424
  let config = ExtractionConfig {
439
425
  enable_quality_processing: true,
@@ -498,15 +484,8 @@ async fn test_quality_processing_disabled() {
498
484
  }
499
485
 
500
486
  /// Test chunking with embeddings using balanced preset.
501
- ///
502
- /// This test requires ONNX Runtime to be installed as a system dependency.
503
- /// On macOS with Homebrew: `brew install onnxruntime`
504
- /// On Linux: Install via your package manager or download from https://github.com/microsoft/onnxruntime/releases
505
- /// On Windows: Download from https://github.com/microsoft/onnxruntime/releases
506
487
  #[tokio::test]
507
488
  #[cfg(feature = "embeddings")]
508
- #[cfg_attr(target_os = "macos", ignore = "ONNX models not cached on macOS")]
509
- #[cfg_attr(target_os = "windows", ignore = "ONNX models not cached on Windows")]
510
489
  async fn test_chunking_with_embeddings() {
511
490
  use kreuzberg::core::config::EmbeddingConfig;
512
491
 
@@ -564,15 +543,8 @@ async fn test_chunking_with_embeddings() {
564
543
  }
565
544
 
566
545
  /// Test chunking with fast embedding preset.
567
- ///
568
- /// This test requires ONNX Runtime to be installed as a system dependency.
569
- /// On macOS with Homebrew: `brew install onnxruntime`
570
- /// On Linux: Install via your package manager or download from https://github.com/microsoft/onnxruntime/releases
571
- /// On Windows: Download from https://github.com/microsoft/onnxruntime/releases
572
546
  #[tokio::test]
573
547
  #[cfg(feature = "embeddings")]
574
- #[cfg_attr(target_os = "macos", ignore = "ONNX models not cached on macOS")]
575
- #[cfg_attr(target_os = "windows", ignore = "ONNX models not cached on Windows")]
576
548
  async fn test_chunking_with_fast_embeddings() {
577
549
  use kreuzberg::core::config::{EmbeddingConfig, EmbeddingModelType};
578
550
 
@@ -601,10 +573,6 @@ async fn test_chunking_with_fast_embeddings() {
601
573
  let chunks = result.chunks.expect("Should have chunks");
602
574
  assert!(!chunks.is_empty(), "Should have at least one chunk");
603
575
 
604
- if let Some(error) = result.metadata.additional.get("embedding_error") {
605
- panic!("Embedding generation failed: {}", error);
606
- }
607
-
608
576
  for chunk in &chunks {
609
577
  let embedding = chunk.embedding.as_ref().expect("Should have embedding");
610
578
  assert_eq!(embedding.len(), 384, "Fast preset should produce 384-dim embeddings");