kreuzberg 4.0.0.pre.rc.29 → 4.0.0.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (321) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +0 -6
  3. data/.rubocop.yaml +534 -1
  4. data/Gemfile +2 -1
  5. data/Gemfile.lock +28 -116
  6. data/README.md +269 -629
  7. data/Rakefile +0 -9
  8. data/Steepfile +4 -8
  9. data/examples/async_patterns.rb +58 -1
  10. data/ext/kreuzberg_rb/extconf.rb +5 -35
  11. data/ext/kreuzberg_rb/native/Cargo.toml +16 -55
  12. data/ext/kreuzberg_rb/native/build.rs +14 -12
  13. data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
  14. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
  15. data/ext/kreuzberg_rb/native/include/strings.h +2 -2
  16. data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
  17. data/ext/kreuzberg_rb/native/src/lib.rs +34 -897
  18. data/extconf.rb +6 -38
  19. data/kreuzberg.gemspec +20 -114
  20. data/lib/kreuzberg/api_proxy.rb +18 -2
  21. data/lib/kreuzberg/cache_api.rb +0 -22
  22. data/lib/kreuzberg/cli.rb +10 -2
  23. data/lib/kreuzberg/cli_proxy.rb +10 -0
  24. data/lib/kreuzberg/config.rb +22 -274
  25. data/lib/kreuzberg/errors.rb +7 -73
  26. data/lib/kreuzberg/extraction_api.rb +8 -237
  27. data/lib/kreuzberg/mcp_proxy.rb +11 -2
  28. data/lib/kreuzberg/ocr_backend_protocol.rb +73 -0
  29. data/lib/kreuzberg/post_processor_protocol.rb +71 -0
  30. data/lib/kreuzberg/result.rb +33 -151
  31. data/lib/kreuzberg/setup_lib_path.rb +2 -22
  32. data/lib/kreuzberg/validator_protocol.rb +73 -0
  33. data/lib/kreuzberg/version.rb +1 -1
  34. data/lib/kreuzberg.rb +13 -27
  35. data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
  36. data/sig/kreuzberg.rbs +12 -105
  37. data/spec/binding/cache_spec.rb +22 -22
  38. data/spec/binding/cli_proxy_spec.rb +4 -2
  39. data/spec/binding/cli_spec.rb +11 -12
  40. data/spec/binding/config_spec.rb +0 -74
  41. data/spec/binding/config_validation_spec.rb +6 -100
  42. data/spec/binding/error_handling_spec.rb +97 -283
  43. data/spec/binding/plugins/ocr_backend_spec.rb +8 -8
  44. data/spec/binding/plugins/postprocessor_spec.rb +11 -11
  45. data/spec/binding/plugins/validator_spec.rb +13 -12
  46. data/spec/examples.txt +104 -0
  47. data/spec/fixtures/config.toml +1 -0
  48. data/spec/fixtures/config.yaml +1 -0
  49. data/spec/fixtures/invalid_config.toml +1 -0
  50. data/spec/smoke/package_spec.rb +3 -2
  51. data/spec/spec_helper.rb +3 -1
  52. data/vendor/kreuzberg/Cargo.toml +67 -192
  53. data/vendor/kreuzberg/README.md +9 -97
  54. data/vendor/kreuzberg/build.rs +194 -516
  55. data/vendor/kreuzberg/src/api/handlers.rs +9 -130
  56. data/vendor/kreuzberg/src/api/mod.rs +3 -18
  57. data/vendor/kreuzberg/src/api/server.rs +71 -236
  58. data/vendor/kreuzberg/src/api/types.rs +7 -43
  59. data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
  60. data/vendor/kreuzberg/src/cache/mod.rs +3 -27
  61. data/vendor/kreuzberg/src/chunking/mod.rs +79 -1705
  62. data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
  63. data/vendor/kreuzberg/src/core/config.rs +23 -905
  64. data/vendor/kreuzberg/src/core/extractor.rs +106 -403
  65. data/vendor/kreuzberg/src/core/io.rs +2 -4
  66. data/vendor/kreuzberg/src/core/mime.rs +12 -2
  67. data/vendor/kreuzberg/src/core/mod.rs +3 -22
  68. data/vendor/kreuzberg/src/core/pipeline.rs +78 -395
  69. data/vendor/kreuzberg/src/embeddings.rs +21 -169
  70. data/vendor/kreuzberg/src/error.rs +2 -2
  71. data/vendor/kreuzberg/src/extraction/archive.rs +31 -36
  72. data/vendor/kreuzberg/src/extraction/docx.rs +1 -365
  73. data/vendor/kreuzberg/src/extraction/email.rs +11 -12
  74. data/vendor/kreuzberg/src/extraction/excel.rs +129 -138
  75. data/vendor/kreuzberg/src/extraction/html.rs +170 -1447
  76. data/vendor/kreuzberg/src/extraction/image.rs +14 -138
  77. data/vendor/kreuzberg/src/extraction/libreoffice.rs +3 -13
  78. data/vendor/kreuzberg/src/extraction/mod.rs +5 -21
  79. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
  80. data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
  81. data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
  82. data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
  83. data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
  84. data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
  85. data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
  86. data/vendor/kreuzberg/src/extraction/pptx.rs +94 -196
  87. data/vendor/kreuzberg/src/extraction/structured.rs +4 -5
  88. data/vendor/kreuzberg/src/extraction/table.rs +1 -2
  89. data/vendor/kreuzberg/src/extraction/text.rs +10 -18
  90. data/vendor/kreuzberg/src/extractors/archive.rs +0 -22
  91. data/vendor/kreuzberg/src/extractors/docx.rs +148 -69
  92. data/vendor/kreuzberg/src/extractors/email.rs +9 -37
  93. data/vendor/kreuzberg/src/extractors/excel.rs +40 -81
  94. data/vendor/kreuzberg/src/extractors/html.rs +173 -182
  95. data/vendor/kreuzberg/src/extractors/image.rs +8 -32
  96. data/vendor/kreuzberg/src/extractors/mod.rs +10 -171
  97. data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
  98. data/vendor/kreuzberg/src/extractors/pdf.rs +64 -329
  99. data/vendor/kreuzberg/src/extractors/pptx.rs +34 -79
  100. data/vendor/kreuzberg/src/extractors/structured.rs +0 -16
  101. data/vendor/kreuzberg/src/extractors/text.rs +7 -30
  102. data/vendor/kreuzberg/src/extractors/xml.rs +8 -27
  103. data/vendor/kreuzberg/src/keywords/processor.rs +1 -9
  104. data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
  105. data/vendor/kreuzberg/src/language_detection/mod.rs +51 -94
  106. data/vendor/kreuzberg/src/lib.rs +5 -17
  107. data/vendor/kreuzberg/src/mcp/mod.rs +1 -4
  108. data/vendor/kreuzberg/src/mcp/server.rs +21 -145
  109. data/vendor/kreuzberg/src/ocr/mod.rs +0 -2
  110. data/vendor/kreuzberg/src/ocr/processor.rs +8 -19
  111. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +0 -2
  112. data/vendor/kreuzberg/src/pdf/error.rs +1 -93
  113. data/vendor/kreuzberg/src/pdf/metadata.rs +100 -263
  114. data/vendor/kreuzberg/src/pdf/mod.rs +2 -33
  115. data/vendor/kreuzberg/src/pdf/rendering.rs +12 -12
  116. data/vendor/kreuzberg/src/pdf/table.rs +64 -61
  117. data/vendor/kreuzberg/src/pdf/text.rs +24 -416
  118. data/vendor/kreuzberg/src/plugins/extractor.rs +8 -40
  119. data/vendor/kreuzberg/src/plugins/mod.rs +0 -3
  120. data/vendor/kreuzberg/src/plugins/ocr.rs +14 -22
  121. data/vendor/kreuzberg/src/plugins/processor.rs +1 -10
  122. data/vendor/kreuzberg/src/plugins/registry.rs +0 -15
  123. data/vendor/kreuzberg/src/plugins/validator.rs +8 -20
  124. data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
  125. data/vendor/kreuzberg/src/text/mod.rs +0 -8
  126. data/vendor/kreuzberg/src/text/quality.rs +15 -28
  127. data/vendor/kreuzberg/src/text/string_utils.rs +10 -22
  128. data/vendor/kreuzberg/src/text/token_reduction/core.rs +50 -86
  129. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +16 -37
  130. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +1 -2
  131. data/vendor/kreuzberg/src/types.rs +67 -907
  132. data/vendor/kreuzberg/src/utils/mod.rs +0 -14
  133. data/vendor/kreuzberg/src/utils/quality.rs +3 -12
  134. data/vendor/kreuzberg/tests/api_tests.rs +0 -506
  135. data/vendor/kreuzberg/tests/archive_integration.rs +0 -2
  136. data/vendor/kreuzberg/tests/batch_orchestration.rs +12 -57
  137. data/vendor/kreuzberg/tests/batch_processing.rs +8 -32
  138. data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
  139. data/vendor/kreuzberg/tests/concurrency_stress.rs +8 -40
  140. data/vendor/kreuzberg/tests/config_features.rs +1 -33
  141. data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -16
  142. data/vendor/kreuzberg/tests/core_integration.rs +9 -35
  143. data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
  144. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
  145. data/vendor/kreuzberg/tests/email_integration.rs +1 -3
  146. data/vendor/kreuzberg/tests/error_handling.rs +34 -43
  147. data/vendor/kreuzberg/tests/format_integration.rs +1 -7
  148. data/vendor/kreuzberg/tests/helpers/mod.rs +0 -60
  149. data/vendor/kreuzberg/tests/image_integration.rs +0 -2
  150. data/vendor/kreuzberg/tests/mime_detection.rs +16 -17
  151. data/vendor/kreuzberg/tests/ocr_configuration.rs +0 -4
  152. data/vendor/kreuzberg/tests/ocr_errors.rs +0 -22
  153. data/vendor/kreuzberg/tests/ocr_quality.rs +0 -2
  154. data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
  155. data/vendor/kreuzberg/tests/pdf_integration.rs +0 -2
  156. data/vendor/kreuzberg/tests/pipeline_integration.rs +2 -36
  157. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +0 -5
  158. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -17
  159. data/vendor/kreuzberg/tests/plugin_system.rs +0 -6
  160. data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -2
  161. data/vendor/kreuzberg/tests/security_validation.rs +1 -13
  162. data/vendor/kreuzberg/tests/test_fastembed.rs +23 -45
  163. metadata +25 -171
  164. data/.rubocop.yml +0 -543
  165. data/ext/kreuzberg_rb/native/.cargo/config.toml +0 -23
  166. data/ext/kreuzberg_rb/native/Cargo.lock +0 -7619
  167. data/lib/kreuzberg/error_context.rb +0 -136
  168. data/lib/kreuzberg/types.rb +0 -170
  169. data/lib/libpdfium.so +0 -0
  170. data/spec/binding/async_operations_spec.rb +0 -473
  171. data/spec/binding/batch_operations_spec.rb +0 -595
  172. data/spec/binding/batch_spec.rb +0 -359
  173. data/spec/binding/config_result_spec.rb +0 -377
  174. data/spec/binding/embeddings_spec.rb +0 -816
  175. data/spec/binding/error_recovery_spec.rb +0 -488
  176. data/spec/binding/font_config_spec.rb +0 -220
  177. data/spec/binding/images_spec.rb +0 -738
  178. data/spec/binding/keywords_extraction_spec.rb +0 -600
  179. data/spec/binding/metadata_types_spec.rb +0 -1228
  180. data/spec/binding/pages_extraction_spec.rb +0 -471
  181. data/spec/binding/tables_spec.rb +0 -641
  182. data/spec/unit/config/chunking_config_spec.rb +0 -213
  183. data/spec/unit/config/embedding_config_spec.rb +0 -343
  184. data/spec/unit/config/extraction_config_spec.rb +0 -438
  185. data/spec/unit/config/font_config_spec.rb +0 -285
  186. data/spec/unit/config/hierarchy_config_spec.rb +0 -314
  187. data/spec/unit/config/image_extraction_config_spec.rb +0 -209
  188. data/spec/unit/config/image_preprocessing_config_spec.rb +0 -249
  189. data/spec/unit/config/keyword_config_spec.rb +0 -229
  190. data/spec/unit/config/language_detection_config_spec.rb +0 -258
  191. data/spec/unit/config/ocr_config_spec.rb +0 -171
  192. data/spec/unit/config/page_config_spec.rb +0 -221
  193. data/spec/unit/config/pdf_config_spec.rb +0 -267
  194. data/spec/unit/config/postprocessor_config_spec.rb +0 -290
  195. data/spec/unit/config/tesseract_config_spec.rb +0 -181
  196. data/spec/unit/config/token_reduction_config_spec.rb +0 -251
  197. data/test/metadata_types_test.rb +0 -959
  198. data/vendor/Cargo.toml +0 -61
  199. data/vendor/kreuzberg/examples/bench_fixes.rs +0 -71
  200. data/vendor/kreuzberg/examples/test_pdfium_fork.rs +0 -62
  201. data/vendor/kreuzberg/src/chunking/processor.rs +0 -219
  202. data/vendor/kreuzberg/src/core/batch_optimizations.rs +0 -385
  203. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  204. data/vendor/kreuzberg/src/core/formats.rs +0 -235
  205. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  206. data/vendor/kreuzberg/src/extraction/capacity.rs +0 -263
  207. data/vendor/kreuzberg/src/extraction/markdown.rs +0 -216
  208. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -284
  209. data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -470
  210. data/vendor/kreuzberg/src/extractors/docbook.rs +0 -504
  211. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  212. data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -492
  213. data/vendor/kreuzberg/src/extractors/jats.rs +0 -1054
  214. data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -368
  215. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  216. data/vendor/kreuzberg/src/extractors/markdown.rs +0 -701
  217. data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
  218. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  219. data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -529
  220. data/vendor/kreuzberg/src/extractors/rst.rs +0 -577
  221. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  222. data/vendor/kreuzberg/src/extractors/security.rs +0 -484
  223. data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
  224. data/vendor/kreuzberg/src/extractors/typst.rs +0 -651
  225. data/vendor/kreuzberg/src/language_detection/processor.rs +0 -218
  226. data/vendor/kreuzberg/src/ocr/language_registry.rs +0 -520
  227. data/vendor/kreuzberg/src/panic_context.rs +0 -154
  228. data/vendor/kreuzberg/src/pdf/bindings.rs +0 -306
  229. data/vendor/kreuzberg/src/pdf/bundled.rs +0 -408
  230. data/vendor/kreuzberg/src/pdf/fonts.rs +0 -358
  231. data/vendor/kreuzberg/src/pdf/hierarchy.rs +0 -903
  232. data/vendor/kreuzberg/src/text/quality_processor.rs +0 -231
  233. data/vendor/kreuzberg/src/text/utf8_validation.rs +0 -193
  234. data/vendor/kreuzberg/src/utils/pool.rs +0 -503
  235. data/vendor/kreuzberg/src/utils/pool_sizing.rs +0 -364
  236. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -761
  237. data/vendor/kreuzberg/tests/api_embed.rs +0 -360
  238. data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
  239. data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +0 -471
  240. data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +0 -289
  241. data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +0 -154
  242. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
  243. data/vendor/kreuzberg/tests/config_integration_test.rs +0 -753
  244. data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +0 -294
  245. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -500
  246. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
  247. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
  248. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
  249. data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
  250. data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
  251. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
  252. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
  253. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
  254. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
  255. data/vendor/kreuzberg/tests/ocr_language_registry.rs +0 -191
  256. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -674
  257. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
  258. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
  259. data/vendor/kreuzberg/tests/page_markers.rs +0 -297
  260. data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +0 -301
  261. data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +0 -589
  262. data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +0 -301
  263. data/vendor/kreuzberg/tests/pdf_text_merging.rs +0 -475
  264. data/vendor/kreuzberg/tests/pdfium_linking.rs +0 -340
  265. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -694
  266. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -775
  267. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1260
  268. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -648
  269. data/vendor/kreuzberg-ffi/Cargo.toml +0 -67
  270. data/vendor/kreuzberg-ffi/README.md +0 -851
  271. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +0 -227
  272. data/vendor/kreuzberg-ffi/build.rs +0 -168
  273. data/vendor/kreuzberg-ffi/cbindgen.toml +0 -37
  274. data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +0 -12
  275. data/vendor/kreuzberg-ffi/kreuzberg.h +0 -3012
  276. data/vendor/kreuzberg-ffi/src/batch_streaming.rs +0 -588
  277. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
  278. data/vendor/kreuzberg-ffi/src/error.rs +0 -901
  279. data/vendor/kreuzberg-ffi/src/extraction.rs +0 -555
  280. data/vendor/kreuzberg-ffi/src/helpers.rs +0 -879
  281. data/vendor/kreuzberg-ffi/src/lib.rs +0 -977
  282. data/vendor/kreuzberg-ffi/src/memory.rs +0 -493
  283. data/vendor/kreuzberg-ffi/src/mime.rs +0 -329
  284. data/vendor/kreuzberg-ffi/src/panic_shield.rs +0 -265
  285. data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +0 -442
  286. data/vendor/kreuzberg-ffi/src/plugins/mod.rs +0 -14
  287. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +0 -628
  288. data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +0 -438
  289. data/vendor/kreuzberg-ffi/src/plugins/validator.rs +0 -329
  290. data/vendor/kreuzberg-ffi/src/result.rs +0 -510
  291. data/vendor/kreuzberg-ffi/src/result_pool.rs +0 -639
  292. data/vendor/kreuzberg-ffi/src/result_view.rs +0 -773
  293. data/vendor/kreuzberg-ffi/src/string_intern.rs +0 -568
  294. data/vendor/kreuzberg-ffi/src/types.rs +0 -363
  295. data/vendor/kreuzberg-ffi/src/util.rs +0 -210
  296. data/vendor/kreuzberg-ffi/src/validation.rs +0 -848
  297. data/vendor/kreuzberg-ffi/tests.disabled/README.md +0 -48
  298. data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +0 -299
  299. data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +0 -346
  300. data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +0 -232
  301. data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +0 -470
  302. data/vendor/kreuzberg-tesseract/.commitlintrc.json +0 -13
  303. data/vendor/kreuzberg-tesseract/.crate-ignore +0 -2
  304. data/vendor/kreuzberg-tesseract/Cargo.lock +0 -2933
  305. data/vendor/kreuzberg-tesseract/Cargo.toml +0 -57
  306. data/vendor/kreuzberg-tesseract/LICENSE +0 -22
  307. data/vendor/kreuzberg-tesseract/README.md +0 -399
  308. data/vendor/kreuzberg-tesseract/build.rs +0 -1127
  309. data/vendor/kreuzberg-tesseract/patches/README.md +0 -71
  310. data/vendor/kreuzberg-tesseract/patches/tesseract.diff +0 -199
  311. data/vendor/kreuzberg-tesseract/src/api.rs +0 -1371
  312. data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +0 -77
  313. data/vendor/kreuzberg-tesseract/src/enums.rs +0 -297
  314. data/vendor/kreuzberg-tesseract/src/error.rs +0 -81
  315. data/vendor/kreuzberg-tesseract/src/lib.rs +0 -145
  316. data/vendor/kreuzberg-tesseract/src/monitor.rs +0 -57
  317. data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +0 -197
  318. data/vendor/kreuzberg-tesseract/src/page_iterator.rs +0 -253
  319. data/vendor/kreuzberg-tesseract/src/result_iterator.rs +0 -286
  320. data/vendor/kreuzberg-tesseract/src/result_renderer.rs +0 -183
  321. data/vendor/kreuzberg-tesseract/tests/integration_test.rs +0 -211
@@ -7,7 +7,7 @@
7
7
 
8
8
  use async_trait::async_trait;
9
9
  use kreuzberg::core::config::{ExtractionConfig, PostProcessorConfig};
10
- use kreuzberg::core::pipeline::{clear_processor_cache, run_pipeline};
10
+ use kreuzberg::core::pipeline::run_pipeline;
11
11
  use kreuzberg::plugins::registry::get_post_processor_registry;
12
12
  use kreuzberg::plugins::{Plugin, PostProcessor, ProcessingStage};
13
13
  use kreuzberg::types::{ExtractionResult, Metadata};
@@ -123,8 +123,6 @@ fn clear_processor_registry() {
123
123
  .write()
124
124
  .expect("Failed to acquire write lock on registry in test");
125
125
  let _ = reg.shutdown_all();
126
- drop(reg);
127
- let _ = clear_processor_cache();
128
126
  }
129
127
 
130
128
  #[tokio::test]
@@ -140,7 +138,6 @@ async fn test_pipeline_empty_no_processors() {
140
138
  detected_languages: None,
141
139
  chunks: None,
142
140
  images: None,
143
- pages: None,
144
141
  };
145
142
  let config = ExtractionConfig::default();
146
143
 
@@ -185,7 +182,6 @@ async fn test_pipeline_single_processor_per_stage() {
185
182
  detected_languages: None,
186
183
  chunks: None,
187
184
  images: None,
188
- pages: None,
189
185
  };
190
186
  let config = ExtractionConfig::default();
191
187
 
@@ -230,7 +226,6 @@ async fn test_pipeline_multiple_processors_per_stage() {
230
226
  detected_languages: None,
231
227
  chunks: None,
232
228
  images: None,
233
- pages: None,
234
229
  };
235
230
  let config = ExtractionConfig::default();
236
231
 
@@ -266,7 +261,6 @@ async fn test_pipeline_all_stages_enabled() {
266
261
  detected_languages: None,
267
262
  chunks: None,
268
263
  images: None,
269
- pages: None,
270
264
  };
271
265
  let config = ExtractionConfig::default();
272
266
 
@@ -300,15 +294,12 @@ async fn test_pipeline_postprocessing_disabled() {
300
294
  detected_languages: None,
301
295
  chunks: None,
302
296
  images: None,
303
- pages: None,
304
297
  };
305
298
  let config = ExtractionConfig {
306
299
  postprocessor: Some(PostProcessorConfig {
307
300
  enabled: false,
308
301
  enabled_processors: None,
309
302
  disabled_processors: None,
310
- enabled_set: None,
311
- disabled_set: None,
312
303
  }),
313
304
  ..Default::default()
314
305
  };
@@ -349,7 +340,6 @@ async fn test_pipeline_early_stage_runs_first() {
349
340
  detected_languages: None,
350
341
  chunks: None,
351
342
  images: None,
352
- pages: None,
353
343
  };
354
344
  let config = ExtractionConfig::default();
355
345
 
@@ -389,7 +379,6 @@ async fn test_pipeline_middle_stage_runs_second() {
389
379
  detected_languages: None,
390
380
  chunks: None,
391
381
  images: None,
392
- pages: None,
393
382
  };
394
383
  let config = ExtractionConfig::default();
395
384
 
@@ -425,7 +414,6 @@ async fn test_pipeline_late_stage_runs_last() {
425
414
  detected_languages: None,
426
415
  chunks: None,
427
416
  images: None,
428
- pages: None,
429
417
  };
430
418
  let config = ExtractionConfig::default();
431
419
 
@@ -461,7 +449,6 @@ async fn test_pipeline_within_stage_priority_order() {
461
449
  detected_languages: None,
462
450
  chunks: None,
463
451
  images: None,
464
- pages: None,
465
452
  };
466
453
  let config = ExtractionConfig::default();
467
454
 
@@ -526,7 +513,6 @@ async fn test_pipeline_cross_stage_data_flow() {
526
513
  detected_languages: None,
527
514
  chunks: None,
528
515
  images: None,
529
- pages: None,
530
516
  };
531
517
  let config = ExtractionConfig::default();
532
518
 
@@ -583,7 +569,6 @@ async fn test_pipeline_early_stage_error_recorded() {
583
569
  detected_languages: None,
584
570
  chunks: None,
585
571
  images: None,
586
- pages: None,
587
572
  };
588
573
  let config = ExtractionConfig::default();
589
574
 
@@ -625,7 +610,6 @@ async fn test_pipeline_middle_stage_error_propagation() {
625
610
  detected_languages: None,
626
611
  chunks: None,
627
612
  images: None,
628
- pages: None,
629
613
  };
630
614
  let config = ExtractionConfig::default();
631
615
 
@@ -697,7 +681,6 @@ async fn test_pipeline_late_stage_error_doesnt_affect_earlier_stages() {
697
681
  detected_languages: None,
698
682
  chunks: None,
699
683
  images: None,
700
- pages: None,
701
684
  };
702
685
  let config = ExtractionConfig::default();
703
686
 
@@ -785,7 +768,6 @@ async fn test_pipeline_processor_error_doesnt_stop_other_processors() {
785
768
  detected_languages: None,
786
769
  chunks: None,
787
770
  images: None,
788
- pages: None,
789
771
  };
790
772
  let config = ExtractionConfig::default();
791
773
 
@@ -863,12 +845,12 @@ async fn test_pipeline_multiple_processor_errors() {
863
845
  detected_languages: None,
864
846
  chunks: None,
865
847
  images: None,
866
- pages: None,
867
848
  };
868
849
  let config = ExtractionConfig::default();
869
850
 
870
851
  let result = run_pipeline(result, &config).await;
871
852
  assert!(result.is_err(), "Expected pipeline to return error");
853
+ // First failing processor (fail1 in Early stage) will cause pipeline to fail
872
854
  match result {
873
855
  Err(KreuzbergError::Plugin { message, plugin_name }) => {
874
856
  assert_eq!(message, "fail1 error");
@@ -905,7 +887,6 @@ async fn test_pipeline_error_context_preservation() {
905
887
  detected_languages: None,
906
888
  chunks: None,
907
889
  images: None,
908
- pages: None,
909
890
  };
910
891
  let config = ExtractionConfig::default();
911
892
 
@@ -977,7 +958,6 @@ async fn test_pipeline_metadata_added_in_early_visible_in_middle() {
977
958
  detected_languages: None,
978
959
  chunks: None,
979
960
  images: None,
980
- pages: None,
981
961
  };
982
962
  let config = ExtractionConfig::default();
983
963
 
@@ -1048,7 +1028,6 @@ async fn test_pipeline_content_modified_in_middle_visible_in_late() {
1048
1028
  detected_languages: None,
1049
1029
  chunks: None,
1050
1030
  images: None,
1051
- pages: None,
1052
1031
  };
1053
1032
  let config = ExtractionConfig::default();
1054
1033
 
@@ -1117,7 +1096,6 @@ async fn test_pipeline_multiple_processors_modifying_same_metadata() {
1117
1096
  detected_languages: None,
1118
1097
  chunks: None,
1119
1098
  images: None,
1120
- pages: None,
1121
1099
  };
1122
1100
  let config = ExtractionConfig::default();
1123
1101
 
@@ -1205,7 +1183,6 @@ async fn test_pipeline_processors_reading_previous_output() {
1205
1183
  detected_languages: None,
1206
1184
  chunks: None,
1207
1185
  images: None,
1208
- pages: None,
1209
1186
  };
1210
1187
  let config = ExtractionConfig::default();
1211
1188
 
@@ -1260,7 +1237,6 @@ async fn test_pipeline_large_content_modification() {
1260
1237
  detected_languages: None,
1261
1238
  chunks: None,
1262
1239
  images: None,
1263
- pages: None,
1264
1240
  };
1265
1241
  let config = ExtractionConfig::default();
1266
1242
 
@@ -1296,15 +1272,12 @@ async fn test_pipeline_enabled_processors_whitelist() {
1296
1272
  detected_languages: None,
1297
1273
  chunks: None,
1298
1274
  images: None,
1299
- pages: None,
1300
1275
  };
1301
1276
  let config = ExtractionConfig {
1302
1277
  postprocessor: Some(PostProcessorConfig {
1303
1278
  enabled: true,
1304
1279
  enabled_processors: Some(vec!["proc1".to_string(), "proc3".to_string()]),
1305
1280
  disabled_processors: None,
1306
- enabled_set: None,
1307
- disabled_set: None,
1308
1281
  }),
1309
1282
  ..Default::default()
1310
1283
  };
@@ -1343,15 +1316,12 @@ async fn test_pipeline_disabled_processors_blacklist() {
1343
1316
  detected_languages: None,
1344
1317
  chunks: None,
1345
1318
  images: None,
1346
- pages: None,
1347
1319
  };
1348
1320
  let config = ExtractionConfig {
1349
1321
  postprocessor: Some(PostProcessorConfig {
1350
1322
  enabled: true,
1351
1323
  enabled_processors: None,
1352
1324
  disabled_processors: Some(vec!["proc2".to_string()]),
1353
- enabled_set: None,
1354
- disabled_set: None,
1355
1325
  }),
1356
1326
  ..Default::default()
1357
1327
  };
@@ -1390,7 +1360,6 @@ async fn test_pipeline_no_filtering_runs_all() {
1390
1360
  detected_languages: None,
1391
1361
  chunks: None,
1392
1362
  images: None,
1393
- pages: None,
1394
1363
  };
1395
1364
  let config = ExtractionConfig::default();
1396
1365
 
@@ -1428,15 +1397,12 @@ async fn test_pipeline_empty_whitelist_runs_none() {
1428
1397
  detected_languages: None,
1429
1398
  chunks: None,
1430
1399
  images: None,
1431
- pages: None,
1432
1400
  };
1433
1401
  let config = ExtractionConfig {
1434
1402
  postprocessor: Some(PostProcessorConfig {
1435
1403
  enabled: true,
1436
1404
  enabled_processors: Some(vec![]),
1437
1405
  disabled_processors: None,
1438
- enabled_set: None,
1439
- disabled_set: None,
1440
1406
  }),
1441
1407
  ..Default::default()
1442
1408
  };
@@ -3,8 +3,6 @@
3
3
  //! Tests custom OCR backend registration, execution, parameter passing,
4
4
  //! error handling, and backend switching with real image extraction.
5
5
 
6
- #![cfg(feature = "ocr")]
7
-
8
6
  use async_trait::async_trait;
9
7
  use kreuzberg::core::config::{ExtractionConfig, OcrConfig};
10
8
  use kreuzberg::plugins::registry::get_ocr_backend_registry;
@@ -62,7 +60,6 @@ impl OcrBackend for MockOcrBackend {
62
60
  detected_languages: None,
63
61
  chunks: None,
64
62
  images: None,
65
- pages: None,
66
63
  })
67
64
  }
68
65
 
@@ -158,7 +155,6 @@ impl OcrBackend for ValidatingOcrBackend {
158
155
  detected_languages: None,
159
156
  chunks: None,
160
157
  images: None,
161
- pages: None,
162
158
  })
163
159
  }
164
160
 
@@ -215,7 +211,6 @@ impl OcrBackend for MetadataOcrBackend {
215
211
  detected_languages: None,
216
212
  chunks: None,
217
213
  images: None,
218
- pages: None,
219
214
  })
220
215
  }
221
216
 
@@ -5,7 +5,6 @@
5
5
 
6
6
  use async_trait::async_trait;
7
7
  use kreuzberg::core::config::ExtractionConfig;
8
- use kreuzberg::core::pipeline::clear_processor_cache;
9
8
  use kreuzberg::plugins::registry::get_post_processor_registry;
10
9
  use kreuzberg::plugins::{Plugin, PostProcessor, ProcessingStage};
11
10
  use kreuzberg::types::ExtractionResult;
@@ -165,16 +164,6 @@ impl PostProcessor for FailingProcessor {
165
164
  }
166
165
  }
167
166
 
168
- fn clear_processor_registry_and_cache() {
169
- let registry = get_post_processor_registry();
170
- let mut reg = registry
171
- .write()
172
- .expect("Failed to acquire write lock on registry in test");
173
- let _ = reg.shutdown_all();
174
- drop(reg);
175
- let _ = clear_processor_cache();
176
- }
177
-
178
167
  #[serial]
179
168
  #[test]
180
169
  fn test_register_custom_postprocessor() {
@@ -213,7 +202,6 @@ fn test_register_custom_postprocessor() {
213
202
  #[serial]
214
203
  #[test]
215
204
  fn test_postprocessor_called_during_extraction() {
216
- clear_processor_registry_and_cache();
217
205
  let test_file = "../../test_documents/text/fake_text.txt";
218
206
  let registry = get_post_processor_registry();
219
207
 
@@ -256,7 +244,6 @@ fn test_postprocessor_called_during_extraction() {
256
244
  #[serial]
257
245
  #[test]
258
246
  fn test_postprocessor_modifies_content() {
259
- clear_processor_registry_and_cache();
260
247
  let test_file = "../../test_documents/text/fake_text.txt";
261
248
  let registry = get_post_processor_registry();
262
249
 
@@ -288,7 +275,6 @@ fn test_postprocessor_modifies_content() {
288
275
  #[serial]
289
276
  #[test]
290
277
  fn test_postprocessor_adds_metadata() {
291
- clear_processor_registry_and_cache();
292
278
  let test_file = "../../test_documents/text/fake_text.txt";
293
279
  let registry = get_post_processor_registry();
294
280
 
@@ -431,7 +417,6 @@ fn test_clear_all_postprocessors() {
431
417
  #[serial]
432
418
  #[test]
433
419
  fn test_postprocessor_error_handling() {
434
- clear_processor_registry_and_cache();
435
420
  let test_file = "../../test_documents/text/fake_text.txt";
436
421
  let registry = get_post_processor_registry();
437
422
 
@@ -448,6 +433,7 @@ fn test_postprocessor_error_handling() {
448
433
  let result = extract_file_sync(test_file, None, &config);
449
434
 
450
435
  // NOTE: Plugin errors now bubble up and fail the extraction (design change)
436
+ // Other error types (non-IO, non-Plugin) are caught and recorded in metadata
451
437
  assert!(
452
438
  result.is_err(),
453
439
  "Extraction should fail when postprocessor returns Plugin error"
@@ -500,7 +486,6 @@ fn test_postprocessor_invalid_name() {
500
486
  #[serial]
501
487
  #[test]
502
488
  fn test_multiple_postprocessors_execution_order() {
503
- clear_processor_registry_and_cache();
504
489
  let test_file = "../../test_documents/text/fake_text.txt";
505
490
  let registry = get_post_processor_registry();
506
491
 
@@ -546,7 +531,6 @@ fn test_multiple_postprocessors_execution_order() {
546
531
  #[serial]
547
532
  #[test]
548
533
  fn test_postprocessor_preserves_mime_type() {
549
- clear_processor_registry_and_cache();
550
534
  let test_file = "../../test_documents/text/fake_text.txt";
551
535
  let registry = get_post_processor_registry();
552
536
 
@@ -58,7 +58,6 @@ impl DocumentExtractor for FailingExtractor {
58
58
  detected_languages: None,
59
59
  chunks: None,
60
60
  images: None,
61
- pages: None,
62
61
  })
63
62
  }
64
63
  }
@@ -303,7 +302,6 @@ fn test_extractor_priority_ordering_complex() {
303
302
  detected_languages: None,
304
303
  chunks: None,
305
304
  images: None,
306
- pages: None,
307
305
  })
308
306
  }
309
307
  fn supported_mime_types(&self) -> &[&str] {
@@ -463,7 +461,6 @@ async fn test_processor_execution_order_within_stage() {
463
461
  detected_languages: None,
464
462
  chunks: None,
465
463
  images: None,
466
- pages: None,
467
464
  };
468
465
 
469
466
  let config = ExtractionConfig::default();
@@ -495,7 +492,6 @@ async fn test_processor_error_propagation() {
495
492
  detected_languages: None,
496
493
  chunks: None,
497
494
  images: None,
498
- pages: None,
499
495
  };
500
496
 
501
497
  let config = ExtractionConfig::default();
@@ -667,7 +663,6 @@ async fn test_validator_content_validation() {
667
663
  detected_languages: None,
668
664
  chunks: None,
669
665
  images: None,
670
- pages: None,
671
666
  };
672
667
 
673
668
  let validation = validators[0].validate(&short_result, &config).await;
@@ -681,7 +676,6 @@ async fn test_validator_content_validation() {
681
676
  detected_languages: None,
682
677
  chunks: None,
683
678
  images: None,
684
- pages: None,
685
679
  };
686
680
 
687
681
  let validation = validators[0].validate(&long_result, &config).await;
@@ -15,6 +15,8 @@ use kreuzberg::{KreuzbergError, Result};
15
15
  use std::path::Path;
16
16
  use std::sync::Arc;
17
17
 
18
+ // ===== Mock Validators =====
19
+
18
20
  struct MockValidator {
19
21
  name: String,
20
22
  should_fail: bool,
@@ -85,6 +87,8 @@ impl Validator for FailingInitValidator {
85
87
  }
86
88
  }
87
89
 
90
+ // ===== Mock Extractors =====
91
+
88
92
  struct MockExtractor {
89
93
  name: String,
90
94
  mime_types: Vec<&'static str>,
@@ -125,7 +129,6 @@ impl DocumentExtractor for MockExtractor {
125
129
  detected_languages: None,
126
130
  chunks: None,
127
131
  images: None,
128
- pages: None,
129
132
  })
130
133
  }
131
134
 
@@ -143,6 +146,8 @@ impl DocumentExtractor for MockExtractor {
143
146
  }
144
147
  }
145
148
 
149
+ // ===== Validator Registry Tests =====
150
+
146
151
  /// Test validator registration and listing.
147
152
  #[test]
148
153
  fn test_validator_registration_succeeds() {
@@ -275,10 +280,13 @@ fn test_validator_registration_with_failed_init_fails() {
275
280
  assert!(result.is_err(), "Registration with failed init should fail");
276
281
 
277
282
  match result {
278
- Err(KreuzbergError::Plugin { .. }) => {}
283
+ Err(KreuzbergError::Plugin { .. }) => {
284
+ // Expected error type
285
+ }
279
286
  _ => panic!("Expected Plugin error"),
280
287
  }
281
288
 
289
+ // Validator should not be in the list
282
290
  assert_eq!(registry.list().len(), 0, "Failed validator should not be registered");
283
291
  }
284
292
 
@@ -287,6 +295,7 @@ fn test_validator_registration_with_failed_init_fails() {
287
295
  fn test_clear_validators_succeeds() {
288
296
  let mut registry = ValidatorRegistry::new();
289
297
 
298
+ // Register multiple validators
290
299
  let v1 = Arc::new(MockValidator {
291
300
  name: "validator-1".to_string(),
292
301
  should_fail: false,
@@ -300,6 +309,7 @@ fn test_clear_validators_succeeds() {
300
309
  registry.register(v2).unwrap();
301
310
  assert_eq!(registry.list().len(), 2);
302
311
 
312
+ // Clear all
303
313
  let result = registry.shutdown_all();
304
314
  assert!(result.is_ok(), "Clear should succeed");
305
315
  assert_eq!(registry.list().len(), 0, "Registry should be empty after clear");
@@ -360,11 +370,14 @@ fn test_get_all_validators_respects_priority() {
360
370
  let all = registry.get_all();
361
371
  assert_eq!(all.len(), 3, "Should have three validators");
362
372
 
373
+ // Should be in descending priority order
363
374
  assert_eq!(all[0].name(), "high-priority");
364
375
  assert_eq!(all[1].name(), "medium-priority");
365
376
  assert_eq!(all[2].name(), "low-priority");
366
377
  }
367
378
 
379
+ // ===== Extractor Registry Tests =====
380
+
368
381
  /// Test extractor registration and retrieval.
369
382
  #[test]
370
383
  fn test_extractor_registration_succeeds() {
@@ -438,6 +451,7 @@ fn test_extractor_priority_selection() {
438
451
  registry.register(low_priority).unwrap();
439
452
  registry.register(high_priority).unwrap();
440
453
 
454
+ // Should get the high priority extractor
441
455
  let result = registry.get("text/plain").unwrap();
442
456
  assert_eq!(
443
457
  result.name(),
@@ -459,14 +473,17 @@ fn test_extractor_wildcard_mime_matching() {
459
473
 
460
474
  registry.register(extractor).unwrap();
461
475
 
476
+ // Should match text/plain
462
477
  let result = registry.get("text/plain");
463
478
  assert!(result.is_ok(), "Should match text/plain with text/*");
464
479
  assert_eq!(result.unwrap().name(), "text-extractor");
465
480
 
481
+ // Should match text/html
466
482
  let result = registry.get("text/html");
467
483
  assert!(result.is_ok(), "Should match text/html with text/*");
468
484
  assert_eq!(result.unwrap().name(), "text-extractor");
469
485
 
486
+ // Should not match application/pdf
470
487
  let result = registry.get("application/pdf");
471
488
  assert!(result.is_err(), "Should not match application/pdf with text/*");
472
489
  }
@@ -489,6 +506,7 @@ fn test_extractor_unregistration_succeeds() {
489
506
  assert!(result.is_ok(), "Unregistration should succeed");
490
507
  assert_eq!(registry.list().len(), 0, "Registry should be empty after removal");
491
508
 
509
+ // Should no longer find extractor for MIME type
492
510
  let lookup_result = registry.get("text/plain");
493
511
  assert!(lookup_result.is_err(), "Should not find extractor after removal");
494
512
  }
@@ -506,10 +524,12 @@ fn test_extractor_multiple_mime_types() {
506
524
 
507
525
  registry.register(extractor).unwrap();
508
526
 
527
+ // Should find for all MIME types
509
528
  assert!(registry.get("application/pdf").is_ok());
510
529
  assert!(registry.get("application/vnd.ms-excel").is_ok());
511
530
  assert!(registry.get("text/csv").is_ok());
512
531
 
532
+ // All should return the same extractor
513
533
  assert_eq!(
514
534
  registry.get("application/pdf").unwrap().name(),
515
535
  "multi-format-extractor"
@@ -12,17 +12,6 @@ use kreuzberg::core::extractor::{extract_bytes_sync, extract_file_sync};
12
12
  use std::io::Write;
13
13
  use tempfile::NamedTempFile;
14
14
 
15
- fn trim_trailing_newlines(value: &str) -> &str {
16
- value.trim_end_matches(['\n', '\r'])
17
- }
18
-
19
- fn assert_text_content(actual: &str, expected: &str) {
20
- assert_eq!(
21
- trim_trailing_newlines(actual),
22
- expected,
23
- "Content mismatch after trimming trailing newlines"
24
- );
25
- }
26
15
  #[test]
27
16
  fn test_archive_zip_bomb_detection() {
28
17
  let mut cursor = std::io::Cursor::new(Vec::new());
@@ -140,7 +129,6 @@ fn test_archive_deeply_nested_directories() {
140
129
  }
141
130
 
142
131
  #[test]
143
- #[cfg(feature = "archives")]
144
132
  fn test_archive_many_small_files() {
145
133
  let mut cursor = std::io::Cursor::new(Vec::new());
146
134
  {
@@ -278,7 +266,7 @@ fn test_resource_single_byte_file() {
278
266
 
279
267
  assert!(result.is_ok());
280
268
  if let Ok(extracted) = result {
281
- assert_text_content(&extracted.content, "a");
269
+ assert_eq!(extracted.content, "a");
282
270
  }
283
271
  }
284
272