kreuzberg 4.0.0.pre.rc.29 → 4.0.0.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (321)
  1. checksums.yaml +4 -4
  2. data/.gitignore +0 -6
  3. data/.rubocop.yaml +534 -1
  4. data/Gemfile +2 -1
  5. data/Gemfile.lock +28 -116
  6. data/README.md +269 -629
  7. data/Rakefile +0 -9
  8. data/Steepfile +4 -8
  9. data/examples/async_patterns.rb +58 -1
  10. data/ext/kreuzberg_rb/extconf.rb +5 -35
  11. data/ext/kreuzberg_rb/native/Cargo.toml +16 -55
  12. data/ext/kreuzberg_rb/native/build.rs +14 -12
  13. data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
  14. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
  15. data/ext/kreuzberg_rb/native/include/strings.h +2 -2
  16. data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
  17. data/ext/kreuzberg_rb/native/src/lib.rs +34 -897
  18. data/extconf.rb +6 -38
  19. data/kreuzberg.gemspec +20 -114
  20. data/lib/kreuzberg/api_proxy.rb +18 -2
  21. data/lib/kreuzberg/cache_api.rb +0 -22
  22. data/lib/kreuzberg/cli.rb +10 -2
  23. data/lib/kreuzberg/cli_proxy.rb +10 -0
  24. data/lib/kreuzberg/config.rb +22 -274
  25. data/lib/kreuzberg/errors.rb +7 -73
  26. data/lib/kreuzberg/extraction_api.rb +8 -237
  27. data/lib/kreuzberg/mcp_proxy.rb +11 -2
  28. data/lib/kreuzberg/ocr_backend_protocol.rb +73 -0
  29. data/lib/kreuzberg/post_processor_protocol.rb +71 -0
  30. data/lib/kreuzberg/result.rb +33 -151
  31. data/lib/kreuzberg/setup_lib_path.rb +2 -22
  32. data/lib/kreuzberg/validator_protocol.rb +73 -0
  33. data/lib/kreuzberg/version.rb +1 -1
  34. data/lib/kreuzberg.rb +13 -27
  35. data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
  36. data/sig/kreuzberg.rbs +12 -105
  37. data/spec/binding/cache_spec.rb +22 -22
  38. data/spec/binding/cli_proxy_spec.rb +4 -2
  39. data/spec/binding/cli_spec.rb +11 -12
  40. data/spec/binding/config_spec.rb +0 -74
  41. data/spec/binding/config_validation_spec.rb +6 -100
  42. data/spec/binding/error_handling_spec.rb +97 -283
  43. data/spec/binding/plugins/ocr_backend_spec.rb +8 -8
  44. data/spec/binding/plugins/postprocessor_spec.rb +11 -11
  45. data/spec/binding/plugins/validator_spec.rb +13 -12
  46. data/spec/examples.txt +104 -0
  47. data/spec/fixtures/config.toml +1 -0
  48. data/spec/fixtures/config.yaml +1 -0
  49. data/spec/fixtures/invalid_config.toml +1 -0
  50. data/spec/smoke/package_spec.rb +3 -2
  51. data/spec/spec_helper.rb +3 -1
  52. data/vendor/kreuzberg/Cargo.toml +67 -192
  53. data/vendor/kreuzberg/README.md +9 -97
  54. data/vendor/kreuzberg/build.rs +194 -516
  55. data/vendor/kreuzberg/src/api/handlers.rs +9 -130
  56. data/vendor/kreuzberg/src/api/mod.rs +3 -18
  57. data/vendor/kreuzberg/src/api/server.rs +71 -236
  58. data/vendor/kreuzberg/src/api/types.rs +7 -43
  59. data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
  60. data/vendor/kreuzberg/src/cache/mod.rs +3 -27
  61. data/vendor/kreuzberg/src/chunking/mod.rs +79 -1705
  62. data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
  63. data/vendor/kreuzberg/src/core/config.rs +23 -905
  64. data/vendor/kreuzberg/src/core/extractor.rs +106 -403
  65. data/vendor/kreuzberg/src/core/io.rs +2 -4
  66. data/vendor/kreuzberg/src/core/mime.rs +12 -2
  67. data/vendor/kreuzberg/src/core/mod.rs +3 -22
  68. data/vendor/kreuzberg/src/core/pipeline.rs +78 -395
  69. data/vendor/kreuzberg/src/embeddings.rs +21 -169
  70. data/vendor/kreuzberg/src/error.rs +2 -2
  71. data/vendor/kreuzberg/src/extraction/archive.rs +31 -36
  72. data/vendor/kreuzberg/src/extraction/docx.rs +1 -365
  73. data/vendor/kreuzberg/src/extraction/email.rs +11 -12
  74. data/vendor/kreuzberg/src/extraction/excel.rs +129 -138
  75. data/vendor/kreuzberg/src/extraction/html.rs +170 -1447
  76. data/vendor/kreuzberg/src/extraction/image.rs +14 -138
  77. data/vendor/kreuzberg/src/extraction/libreoffice.rs +3 -13
  78. data/vendor/kreuzberg/src/extraction/mod.rs +5 -21
  79. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
  80. data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
  81. data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
  82. data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
  83. data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
  84. data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
  85. data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
  86. data/vendor/kreuzberg/src/extraction/pptx.rs +94 -196
  87. data/vendor/kreuzberg/src/extraction/structured.rs +4 -5
  88. data/vendor/kreuzberg/src/extraction/table.rs +1 -2
  89. data/vendor/kreuzberg/src/extraction/text.rs +10 -18
  90. data/vendor/kreuzberg/src/extractors/archive.rs +0 -22
  91. data/vendor/kreuzberg/src/extractors/docx.rs +148 -69
  92. data/vendor/kreuzberg/src/extractors/email.rs +9 -37
  93. data/vendor/kreuzberg/src/extractors/excel.rs +40 -81
  94. data/vendor/kreuzberg/src/extractors/html.rs +173 -182
  95. data/vendor/kreuzberg/src/extractors/image.rs +8 -32
  96. data/vendor/kreuzberg/src/extractors/mod.rs +10 -171
  97. data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
  98. data/vendor/kreuzberg/src/extractors/pdf.rs +64 -329
  99. data/vendor/kreuzberg/src/extractors/pptx.rs +34 -79
  100. data/vendor/kreuzberg/src/extractors/structured.rs +0 -16
  101. data/vendor/kreuzberg/src/extractors/text.rs +7 -30
  102. data/vendor/kreuzberg/src/extractors/xml.rs +8 -27
  103. data/vendor/kreuzberg/src/keywords/processor.rs +1 -9
  104. data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
  105. data/vendor/kreuzberg/src/language_detection/mod.rs +51 -94
  106. data/vendor/kreuzberg/src/lib.rs +5 -17
  107. data/vendor/kreuzberg/src/mcp/mod.rs +1 -4
  108. data/vendor/kreuzberg/src/mcp/server.rs +21 -145
  109. data/vendor/kreuzberg/src/ocr/mod.rs +0 -2
  110. data/vendor/kreuzberg/src/ocr/processor.rs +8 -19
  111. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +0 -2
  112. data/vendor/kreuzberg/src/pdf/error.rs +1 -93
  113. data/vendor/kreuzberg/src/pdf/metadata.rs +100 -263
  114. data/vendor/kreuzberg/src/pdf/mod.rs +2 -33
  115. data/vendor/kreuzberg/src/pdf/rendering.rs +12 -12
  116. data/vendor/kreuzberg/src/pdf/table.rs +64 -61
  117. data/vendor/kreuzberg/src/pdf/text.rs +24 -416
  118. data/vendor/kreuzberg/src/plugins/extractor.rs +8 -40
  119. data/vendor/kreuzberg/src/plugins/mod.rs +0 -3
  120. data/vendor/kreuzberg/src/plugins/ocr.rs +14 -22
  121. data/vendor/kreuzberg/src/plugins/processor.rs +1 -10
  122. data/vendor/kreuzberg/src/plugins/registry.rs +0 -15
  123. data/vendor/kreuzberg/src/plugins/validator.rs +8 -20
  124. data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
  125. data/vendor/kreuzberg/src/text/mod.rs +0 -8
  126. data/vendor/kreuzberg/src/text/quality.rs +15 -28
  127. data/vendor/kreuzberg/src/text/string_utils.rs +10 -22
  128. data/vendor/kreuzberg/src/text/token_reduction/core.rs +50 -86
  129. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +16 -37
  130. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +1 -2
  131. data/vendor/kreuzberg/src/types.rs +67 -907
  132. data/vendor/kreuzberg/src/utils/mod.rs +0 -14
  133. data/vendor/kreuzberg/src/utils/quality.rs +3 -12
  134. data/vendor/kreuzberg/tests/api_tests.rs +0 -506
  135. data/vendor/kreuzberg/tests/archive_integration.rs +0 -2
  136. data/vendor/kreuzberg/tests/batch_orchestration.rs +12 -57
  137. data/vendor/kreuzberg/tests/batch_processing.rs +8 -32
  138. data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
  139. data/vendor/kreuzberg/tests/concurrency_stress.rs +8 -40
  140. data/vendor/kreuzberg/tests/config_features.rs +1 -33
  141. data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -16
  142. data/vendor/kreuzberg/tests/core_integration.rs +9 -35
  143. data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
  144. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
  145. data/vendor/kreuzberg/tests/email_integration.rs +1 -3
  146. data/vendor/kreuzberg/tests/error_handling.rs +34 -43
  147. data/vendor/kreuzberg/tests/format_integration.rs +1 -7
  148. data/vendor/kreuzberg/tests/helpers/mod.rs +0 -60
  149. data/vendor/kreuzberg/tests/image_integration.rs +0 -2
  150. data/vendor/kreuzberg/tests/mime_detection.rs +16 -17
  151. data/vendor/kreuzberg/tests/ocr_configuration.rs +0 -4
  152. data/vendor/kreuzberg/tests/ocr_errors.rs +0 -22
  153. data/vendor/kreuzberg/tests/ocr_quality.rs +0 -2
  154. data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
  155. data/vendor/kreuzberg/tests/pdf_integration.rs +0 -2
  156. data/vendor/kreuzberg/tests/pipeline_integration.rs +2 -36
  157. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +0 -5
  158. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -17
  159. data/vendor/kreuzberg/tests/plugin_system.rs +0 -6
  160. data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -2
  161. data/vendor/kreuzberg/tests/security_validation.rs +1 -13
  162. data/vendor/kreuzberg/tests/test_fastembed.rs +23 -45
  163. metadata +25 -171
  164. data/.rubocop.yml +0 -543
  165. data/ext/kreuzberg_rb/native/.cargo/config.toml +0 -23
  166. data/ext/kreuzberg_rb/native/Cargo.lock +0 -7619
  167. data/lib/kreuzberg/error_context.rb +0 -136
  168. data/lib/kreuzberg/types.rb +0 -170
  169. data/lib/libpdfium.so +0 -0
  170. data/spec/binding/async_operations_spec.rb +0 -473
  171. data/spec/binding/batch_operations_spec.rb +0 -595
  172. data/spec/binding/batch_spec.rb +0 -359
  173. data/spec/binding/config_result_spec.rb +0 -377
  174. data/spec/binding/embeddings_spec.rb +0 -816
  175. data/spec/binding/error_recovery_spec.rb +0 -488
  176. data/spec/binding/font_config_spec.rb +0 -220
  177. data/spec/binding/images_spec.rb +0 -738
  178. data/spec/binding/keywords_extraction_spec.rb +0 -600
  179. data/spec/binding/metadata_types_spec.rb +0 -1228
  180. data/spec/binding/pages_extraction_spec.rb +0 -471
  181. data/spec/binding/tables_spec.rb +0 -641
  182. data/spec/unit/config/chunking_config_spec.rb +0 -213
  183. data/spec/unit/config/embedding_config_spec.rb +0 -343
  184. data/spec/unit/config/extraction_config_spec.rb +0 -438
  185. data/spec/unit/config/font_config_spec.rb +0 -285
  186. data/spec/unit/config/hierarchy_config_spec.rb +0 -314
  187. data/spec/unit/config/image_extraction_config_spec.rb +0 -209
  188. data/spec/unit/config/image_preprocessing_config_spec.rb +0 -249
  189. data/spec/unit/config/keyword_config_spec.rb +0 -229
  190. data/spec/unit/config/language_detection_config_spec.rb +0 -258
  191. data/spec/unit/config/ocr_config_spec.rb +0 -171
  192. data/spec/unit/config/page_config_spec.rb +0 -221
  193. data/spec/unit/config/pdf_config_spec.rb +0 -267
  194. data/spec/unit/config/postprocessor_config_spec.rb +0 -290
  195. data/spec/unit/config/tesseract_config_spec.rb +0 -181
  196. data/spec/unit/config/token_reduction_config_spec.rb +0 -251
  197. data/test/metadata_types_test.rb +0 -959
  198. data/vendor/Cargo.toml +0 -61
  199. data/vendor/kreuzberg/examples/bench_fixes.rs +0 -71
  200. data/vendor/kreuzberg/examples/test_pdfium_fork.rs +0 -62
  201. data/vendor/kreuzberg/src/chunking/processor.rs +0 -219
  202. data/vendor/kreuzberg/src/core/batch_optimizations.rs +0 -385
  203. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  204. data/vendor/kreuzberg/src/core/formats.rs +0 -235
  205. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  206. data/vendor/kreuzberg/src/extraction/capacity.rs +0 -263
  207. data/vendor/kreuzberg/src/extraction/markdown.rs +0 -216
  208. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -284
  209. data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -470
  210. data/vendor/kreuzberg/src/extractors/docbook.rs +0 -504
  211. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  212. data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -492
  213. data/vendor/kreuzberg/src/extractors/jats.rs +0 -1054
  214. data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -368
  215. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  216. data/vendor/kreuzberg/src/extractors/markdown.rs +0 -701
  217. data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
  218. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  219. data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -529
  220. data/vendor/kreuzberg/src/extractors/rst.rs +0 -577
  221. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  222. data/vendor/kreuzberg/src/extractors/security.rs +0 -484
  223. data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
  224. data/vendor/kreuzberg/src/extractors/typst.rs +0 -651
  225. data/vendor/kreuzberg/src/language_detection/processor.rs +0 -218
  226. data/vendor/kreuzberg/src/ocr/language_registry.rs +0 -520
  227. data/vendor/kreuzberg/src/panic_context.rs +0 -154
  228. data/vendor/kreuzberg/src/pdf/bindings.rs +0 -306
  229. data/vendor/kreuzberg/src/pdf/bundled.rs +0 -408
  230. data/vendor/kreuzberg/src/pdf/fonts.rs +0 -358
  231. data/vendor/kreuzberg/src/pdf/hierarchy.rs +0 -903
  232. data/vendor/kreuzberg/src/text/quality_processor.rs +0 -231
  233. data/vendor/kreuzberg/src/text/utf8_validation.rs +0 -193
  234. data/vendor/kreuzberg/src/utils/pool.rs +0 -503
  235. data/vendor/kreuzberg/src/utils/pool_sizing.rs +0 -364
  236. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -761
  237. data/vendor/kreuzberg/tests/api_embed.rs +0 -360
  238. data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
  239. data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +0 -471
  240. data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +0 -289
  241. data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +0 -154
  242. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
  243. data/vendor/kreuzberg/tests/config_integration_test.rs +0 -753
  244. data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +0 -294
  245. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -500
  246. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
  247. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
  248. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
  249. data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
  250. data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
  251. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
  252. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
  253. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
  254. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
  255. data/vendor/kreuzberg/tests/ocr_language_registry.rs +0 -191
  256. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -674
  257. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
  258. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
  259. data/vendor/kreuzberg/tests/page_markers.rs +0 -297
  260. data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +0 -301
  261. data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +0 -589
  262. data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +0 -301
  263. data/vendor/kreuzberg/tests/pdf_text_merging.rs +0 -475
  264. data/vendor/kreuzberg/tests/pdfium_linking.rs +0 -340
  265. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -694
  266. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -775
  267. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1260
  268. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -648
  269. data/vendor/kreuzberg-ffi/Cargo.toml +0 -67
  270. data/vendor/kreuzberg-ffi/README.md +0 -851
  271. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +0 -227
  272. data/vendor/kreuzberg-ffi/build.rs +0 -168
  273. data/vendor/kreuzberg-ffi/cbindgen.toml +0 -37
  274. data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +0 -12
  275. data/vendor/kreuzberg-ffi/kreuzberg.h +0 -3012
  276. data/vendor/kreuzberg-ffi/src/batch_streaming.rs +0 -588
  277. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
  278. data/vendor/kreuzberg-ffi/src/error.rs +0 -901
  279. data/vendor/kreuzberg-ffi/src/extraction.rs +0 -555
  280. data/vendor/kreuzberg-ffi/src/helpers.rs +0 -879
  281. data/vendor/kreuzberg-ffi/src/lib.rs +0 -977
  282. data/vendor/kreuzberg-ffi/src/memory.rs +0 -493
  283. data/vendor/kreuzberg-ffi/src/mime.rs +0 -329
  284. data/vendor/kreuzberg-ffi/src/panic_shield.rs +0 -265
  285. data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +0 -442
  286. data/vendor/kreuzberg-ffi/src/plugins/mod.rs +0 -14
  287. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +0 -628
  288. data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +0 -438
  289. data/vendor/kreuzberg-ffi/src/plugins/validator.rs +0 -329
  290. data/vendor/kreuzberg-ffi/src/result.rs +0 -510
  291. data/vendor/kreuzberg-ffi/src/result_pool.rs +0 -639
  292. data/vendor/kreuzberg-ffi/src/result_view.rs +0 -773
  293. data/vendor/kreuzberg-ffi/src/string_intern.rs +0 -568
  294. data/vendor/kreuzberg-ffi/src/types.rs +0 -363
  295. data/vendor/kreuzberg-ffi/src/util.rs +0 -210
  296. data/vendor/kreuzberg-ffi/src/validation.rs +0 -848
  297. data/vendor/kreuzberg-ffi/tests.disabled/README.md +0 -48
  298. data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +0 -299
  299. data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +0 -346
  300. data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +0 -232
  301. data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +0 -470
  302. data/vendor/kreuzberg-tesseract/.commitlintrc.json +0 -13
  303. data/vendor/kreuzberg-tesseract/.crate-ignore +0 -2
  304. data/vendor/kreuzberg-tesseract/Cargo.lock +0 -2933
  305. data/vendor/kreuzberg-tesseract/Cargo.toml +0 -57
  306. data/vendor/kreuzberg-tesseract/LICENSE +0 -22
  307. data/vendor/kreuzberg-tesseract/README.md +0 -399
  308. data/vendor/kreuzberg-tesseract/build.rs +0 -1127
  309. data/vendor/kreuzberg-tesseract/patches/README.md +0 -71
  310. data/vendor/kreuzberg-tesseract/patches/tesseract.diff +0 -199
  311. data/vendor/kreuzberg-tesseract/src/api.rs +0 -1371
  312. data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +0 -77
  313. data/vendor/kreuzberg-tesseract/src/enums.rs +0 -297
  314. data/vendor/kreuzberg-tesseract/src/error.rs +0 -81
  315. data/vendor/kreuzberg-tesseract/src/lib.rs +0 -145
  316. data/vendor/kreuzberg-tesseract/src/monitor.rs +0 -57
  317. data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +0 -197
  318. data/vendor/kreuzberg-tesseract/src/page_iterator.rs +0 -253
  319. data/vendor/kreuzberg-tesseract/src/result_iterator.rs +0 -286
  320. data/vendor/kreuzberg-tesseract/src/result_renderer.rs +0 -183
  321. data/vendor/kreuzberg-tesseract/tests/integration_test.rs +0 -211
@@ -16,7 +16,7 @@
16
16
  //! use kreuzberg::mcp::start_mcp_server;
17
17
  //!
18
18
  //! #[tokio::main]
19
- //! async fn main() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
19
+ //! async fn main() -> anyhow::Result<()> {
20
20
  //! start_mcp_server().await?;
21
21
  //! Ok(())
22
22
  //! }
@@ -26,9 +26,6 @@ mod server;
26
26
 
27
27
  pub use server::{start_mcp_server, start_mcp_server_with_config};
28
28
 
29
- #[cfg(feature = "mcp-http")]
30
- pub use server::{start_mcp_server_http, start_mcp_server_http_with_config};
31
-
32
29
  pub use server::{BatchExtractFilesParams, DetectMimeTypeParams, ExtractBytesParams, ExtractFileParams, KreuzbergMcp};
33
30
 
34
31
  #[doc(hidden)]
@@ -12,9 +12,6 @@ use rmcp::{
12
12
  transport::stdio,
13
13
  };
14
14
 
15
- #[cfg(feature = "mcp-http")]
16
- use rmcp::transport::streamable_http_server::{StreamableHttpService, session::local::LocalSessionManager};
17
-
18
15
  use crate::{
19
16
  ExtractionConfig, ExtractionResult as KreuzbergResult, KreuzbergError, batch_extract_file, batch_extract_file_sync,
20
17
  cache, detect_mime_type, extract_bytes, extract_bytes_sync, extract_file, extract_file_sync,
@@ -229,8 +226,7 @@ impl KreuzbergMcp {
229
226
  /// This tool extracts text, metadata, and tables from documents in various formats
230
227
  /// including PDFs, Word documents, Excel spreadsheets, images (with OCR), and more.
231
228
  #[tool(
232
- description = "Extract content from a file by path. Supports PDFs, Word, Excel, images (with OCR), HTML, and more.",
233
- annotations(title = "Extract File", read_only_hint = true, idempotent_hint = true)
229
+ description = "Extract content from a file by path. Supports PDFs, Word, Excel, images (with OCR), HTML, and more."
234
230
  )]
235
231
  async fn extract_file(
236
232
  &self,
@@ -254,8 +250,7 @@ impl KreuzbergMcp {
254
250
  ///
255
251
  /// This tool extracts text, metadata, and tables from base64-encoded document data.
256
252
  #[tool(
257
- description = "Extract content from base64-encoded file data. Returns extracted text, metadata, and tables.",
258
- annotations(title = "Extract Bytes", read_only_hint = true, idempotent_hint = true)
253
+ description = "Extract content from base64-encoded file data. Returns extracted text, metadata, and tables."
259
254
  )]
260
255
  async fn extract_bytes(
261
256
  &self,
@@ -284,10 +279,7 @@ impl KreuzbergMcp {
284
279
  /// Extract content from multiple files in parallel.
285
280
  ///
286
281
  /// This tool efficiently processes multiple documents simultaneously, useful for batch operations.
287
- #[tool(
288
- description = "Extract content from multiple files in parallel. Returns results for all files.",
289
- annotations(title = "Batch Extract Files", read_only_hint = true, idempotent_hint = true)
290
- )]
282
+ #[tool(description = "Extract content from multiple files in parallel. Returns results for all files.")]
291
283
  async fn batch_extract_files(
292
284
  &self,
293
285
  Parameters(params): Parameters<BatchExtractFilesParams>,
@@ -315,10 +307,7 @@ impl KreuzbergMcp {
315
307
  /// Detect the MIME type of a file.
316
308
  ///
317
309
  /// This tool identifies the file format, useful for determining which extractor to use.
318
- #[tool(
319
- description = "Detect the MIME type of a file. Returns the detected MIME type string.",
320
- annotations(title = "Detect MIME Type", read_only_hint = true, idempotent_hint = true)
321
- )]
310
+ #[tool(description = "Detect the MIME type of a file. Returns the detected MIME type string.")]
322
311
  fn detect_mime_type(
323
312
  &self,
324
313
  Parameters(params): Parameters<DetectMimeTypeParams>,
@@ -331,10 +320,7 @@ impl KreuzbergMcp {
331
320
  /// Get cache statistics.
332
321
  ///
333
322
  /// This tool returns statistics about the cache including total files, size, and disk space.
334
- #[tool(
335
- description = "Get cache statistics including total files, size, and available disk space.",
336
- annotations(title = "Cache Stats", read_only_hint = true, idempotent_hint = true)
337
- )]
323
+ #[tool(description = "Get cache statistics including total files, size, and available disk space.")]
338
324
  fn cache_stats(&self, Parameters(_): Parameters<()>) -> Result<CallToolResult, McpError> {
339
325
  let cache_dir = std::env::current_dir()
340
326
  .unwrap_or_else(|_| std::path::PathBuf::from("."))
@@ -365,10 +351,7 @@ impl KreuzbergMcp {
365
351
  /// Clear the cache.
366
352
  ///
367
353
  /// This tool removes all cached files and returns the number of files removed and space freed.
368
- #[tool(
369
- description = "Clear all cached files. Returns the number of files removed and space freed in MB.",
370
- annotations(title = "Clear Cache", destructive_hint = true)
371
- )]
354
+ #[tool(description = "Clear all cached files. Returns the number of files removed and space freed in MB.")]
372
355
  fn cache_clear(&self, Parameters(_): Parameters<()>) -> Result<CallToolResult, McpError> {
373
356
  let cache_dir = std::env::current_dir()
374
357
  .unwrap_or_else(|_| std::path::PathBuf::from("."))
@@ -445,12 +428,12 @@ impl Default for KreuzbergMcp {
445
428
  /// use kreuzberg::mcp::start_mcp_server;
446
429
  ///
447
430
  /// #[tokio::main]
448
- /// async fn main() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
431
+ /// async fn main() -> anyhow::Result<()> {
449
432
  /// start_mcp_server().await?;
450
433
  /// Ok(())
451
434
  /// }
452
435
  /// ```
453
- pub async fn start_mcp_server() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
436
+ pub async fn start_mcp_server() -> Result<(), Box<dyn std::error::Error>> {
454
437
  let service = KreuzbergMcp::new()?.serve(stdio()).await?;
455
438
 
456
439
  service.waiting().await?;
@@ -461,118 +444,13 @@ pub async fn start_mcp_server() -> Result<(), Box<dyn std::error::Error + Send +
461
444
  ///
462
445
  /// This variant allows specifying a custom extraction configuration
463
446
  /// (e.g., loaded from a file) instead of using defaults.
464
- pub async fn start_mcp_server_with_config(
465
- config: ExtractionConfig,
466
- ) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
447
+ pub async fn start_mcp_server_with_config(config: ExtractionConfig) -> Result<(), Box<dyn std::error::Error>> {
467
448
  let service = KreuzbergMcp::with_config(config).serve(stdio()).await?;
468
449
 
469
450
  service.waiting().await?;
470
451
  Ok(())
471
452
  }
472
453
 
473
- /// Start MCP server with HTTP Stream transport.
474
- ///
475
- /// Uses rmcp's built-in StreamableHttpService for HTTP/SSE support per MCP spec.
476
- ///
477
- /// # Arguments
478
- ///
479
- /// * `host` - Host to bind to (e.g., "127.0.0.1" or "0.0.0.0")
480
- /// * `port` - Port number (e.g., 8001)
481
- ///
482
- /// # Example
483
- ///
484
- /// ```no_run
485
- /// use kreuzberg::mcp::start_mcp_server_http;
486
- ///
487
- /// #[tokio::main]
488
- /// async fn main() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
489
- /// start_mcp_server_http("127.0.0.1", 8001).await?;
490
- /// Ok(())
491
- /// }
492
- /// ```
493
- #[cfg(feature = "mcp-http")]
494
- pub async fn start_mcp_server_http(
495
- host: impl AsRef<str>,
496
- port: u16,
497
- ) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
498
- use axum::Router;
499
- use std::net::SocketAddr;
500
-
501
- let http_service = StreamableHttpService::new(
502
- || KreuzbergMcp::new().map_err(|e| std::io::Error::other(e.to_string())),
503
- LocalSessionManager::default().into(),
504
- Default::default(),
505
- );
506
-
507
- let router = Router::new().nest_service("/mcp", http_service);
508
-
509
- let addr: SocketAddr = format!("{}:{}", host.as_ref(), port)
510
- .parse()
511
- .map_err(|e| format!("Invalid address: {}", e))?;
512
-
513
- #[cfg(feature = "api")]
514
- tracing::info!("Starting MCP HTTP server on http://{}", addr);
515
-
516
- let listener = tokio::net::TcpListener::bind(addr).await?;
517
- axum::serve(listener, router).await?;
518
-
519
- Ok(())
520
- }
521
-
522
- /// Start MCP HTTP server with custom extraction config.
523
- ///
524
- /// This variant allows specifying a custom extraction configuration
525
- /// while using HTTP Stream transport.
526
- ///
527
- /// # Arguments
528
- ///
529
- /// * `host` - Host to bind to (e.g., "127.0.0.1" or "0.0.0.0")
530
- /// * `port` - Port number (e.g., 8001)
531
- /// * `config` - Custom extraction configuration
532
- ///
533
- /// # Example
534
- ///
535
- /// ```no_run
536
- /// use kreuzberg::mcp::start_mcp_server_http_with_config;
537
- /// use kreuzberg::ExtractionConfig;
538
- ///
539
- /// #[tokio::main]
540
- /// async fn main() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
541
- /// let config = ExtractionConfig::default();
542
- /// start_mcp_server_http_with_config("127.0.0.1", 8001, config).await?;
543
- /// Ok(())
544
- /// }
545
- /// ```
546
- #[cfg(feature = "mcp-http")]
547
- pub async fn start_mcp_server_http_with_config(
548
- host: impl AsRef<str>,
549
- port: u16,
550
- config: ExtractionConfig,
551
- ) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
552
- use axum::Router;
553
- use std::net::SocketAddr;
554
-
555
- let http_service = StreamableHttpService::new(
556
- move || Ok(KreuzbergMcp::with_config(config.clone())),
557
- LocalSessionManager::default().into(),
558
- Default::default(),
559
- );
560
-
561
- let router = Router::new().nest_service("/mcp", http_service);
562
-
563
- let addr: SocketAddr = format!("{}:{}", host.as_ref(), port)
564
- .parse()
565
- .map_err(|e| format!("Invalid address: {}", e))?;
566
-
567
- #[cfg(feature = "api")]
568
- tracing::info!("Starting MCP HTTP server on http://{}", addr);
569
-
570
- let listener = tokio::net::TcpListener::bind(addr).await?;
571
- axum::serve(listener, router).await?;
572
-
573
- Ok(())
574
- }
575
-
576
454
  /// Build extraction config from MCP parameters.
577
455
  ///
578
456
  /// Starts with the default config and overlays OCR settings from request parameters.
@@ -870,7 +748,6 @@ mod tests {
870
748
  detected_languages: None,
871
749
  chunks: None,
872
750
  images: None,
873
- pages: None,
874
751
  };
875
752
 
876
753
  let formatted = format_extraction_result(&result);
@@ -907,7 +784,6 @@ mod tests {
907
784
  detected_languages: None,
908
785
  chunks: None,
909
786
  images: None,
910
- pages: None,
911
787
  };
912
788
 
913
789
  let formatted = format_extraction_result(&result);
@@ -929,7 +805,6 @@ mod tests {
929
805
  detected_languages: None,
930
806
  chunks: None,
931
807
  images: None,
932
- pages: None,
933
808
  };
934
809
 
935
810
  let formatted = format_extraction_result(&result);
@@ -948,7 +823,6 @@ mod tests {
948
823
  detected_languages: None,
949
824
  chunks: None,
950
825
  images: None,
951
- pages: None,
952
826
  };
953
827
 
954
828
  let formatted = format_extraction_result(&result);
@@ -1746,17 +1620,19 @@ mod tests {
1746
1620
 
1747
1621
  let result = server.batch_extract_files(Parameters(params)).await;
1748
1622
 
1749
- if let Ok(call_result) = result
1750
- && let Some(content) = call_result.content.first()
1751
- && let RawContent::Text(text) = &content.raw
1752
- {
1753
- assert!(text.text.contains("Document 1"));
1754
- assert!(text.text.contains("Document 2"));
1623
+ if result.is_ok() {
1624
+ let call_result = result.unwrap();
1625
+ if let Some(content) = call_result.content.first()
1626
+ && let RawContent::Text(text) = &content.raw
1627
+ {
1628
+ assert!(text.text.contains("Document 1"));
1629
+ assert!(text.text.contains("Document 2"));
1755
1630
 
1756
- let doc1_pos = text.text.find("Document 1");
1757
- let doc2_pos = text.text.find("Document 2");
1758
- if let (Some(pos1), Some(pos2)) = (doc1_pos, doc2_pos) {
1759
- assert!(pos1 < pos2, "Documents should be in order");
1631
+ let doc1_pos = text.text.find("Document 1");
1632
+ let doc2_pos = text.text.find("Document 2");
1633
+ if let (Some(pos1), Some(pos2)) = (doc1_pos, doc2_pos) {
1634
+ assert!(pos1 < pos2, "Documents should be in order");
1635
+ }
1760
1636
  }
1761
1637
  }
1762
1638
  }
@@ -40,7 +40,6 @@
40
40
  pub mod cache;
41
41
  pub mod error;
42
42
  pub mod hocr;
43
- pub mod language_registry;
44
43
  pub mod processor;
45
44
  pub mod table;
46
45
  pub mod tesseract_backend;
@@ -51,7 +50,6 @@ pub mod validation;
51
50
  pub use cache::{OcrCache, OcrCacheStats};
52
51
  pub use error::OcrError;
53
52
  pub use hocr::convert_hocr_to_markdown;
54
- pub use language_registry::LanguageRegistry;
55
53
  pub use processor::OcrProcessor;
56
54
  pub use table::{HocrWord, extract_words_from_tsv, reconstruct_table, table_to_markdown};
57
55
  pub use tesseract_backend::TesseractBackend;
@@ -51,14 +51,6 @@ impl OcrProcessor {
51
51
  Ok(Self { cache })
52
52
  }
53
53
 
54
- #[cfg_attr(feature = "otel", tracing::instrument(
55
- skip(self, image_bytes),
56
- fields(
57
- ocr.backend = "tesseract",
58
- ocr.language = %config.language,
59
- image.size_bytes = image_bytes.len(),
60
- )
61
- ))]
62
54
  pub fn process_image(&self, image_bytes: &[u8], config: &TesseractConfig) -> Result<OcrExtractionResult, OcrError> {
63
55
  config.validate().map_err(OcrError::InvalidConfiguration)?;
64
56
 
@@ -72,14 +64,9 @@ impl OcrProcessor {
72
64
  if config.use_cache
73
65
  && let Some(cached_result) = self.cache.get_cached_result(&image_hash, "tesseract", &config_str)?
74
66
  {
75
- #[cfg(feature = "otel")]
76
- tracing::Span::current().record("cache.hit", true);
77
67
  return Ok(cached_result);
78
68
  }
79
69
 
80
- #[cfg(feature = "otel")]
81
- tracing::Span::current().record("cache.hit", false);
82
-
83
70
  let result = self.perform_ocr(image_bytes, config)?;
84
71
 
85
72
  if config.use_cache {
@@ -241,6 +228,7 @@ impl OcrProcessor {
241
228
  });
242
229
 
243
230
  // Validate language before initializing to prevent segfault ~keep
231
+ // tesseract-rs can crash on empty language or missing language files
244
232
  if config.language.trim().is_empty() {
245
233
  return Err(OcrError::TesseractInitializationFailed(
246
234
  "Language cannot be empty. Please specify a valid language code (e.g., 'eng')".to_string(),
@@ -248,6 +236,7 @@ impl OcrProcessor {
248
236
  }
249
237
 
250
238
  // Validate language file exists before initializing to prevent segfault ~keep
239
+ // tesseract-rs can crash if language file is missing instead of returning error
251
240
  if !tessdata_path.is_empty() {
252
241
  let languages: Vec<&str> = config.language.split('+').collect();
253
242
  for lang in languages {
@@ -373,11 +362,6 @@ impl OcrProcessor {
373
362
  )
374
363
  });
375
364
 
376
- api.recognize()
377
- .map_err(|e| OcrError::ProcessingFailed(format!("Failed to recognize text: {}", e)))?;
378
-
379
- log_ci_debug(ci_debug_enabled, "recognize", || "completed".to_string());
380
-
381
365
  let tsv_data_for_tables = if config.enable_table_detection || config.output_format == "tsv" {
382
366
  Some(
383
367
  api.get_tsv_text(0)
@@ -453,7 +437,12 @@ impl OcrProcessor {
453
437
  let words = extract_words_from_tsv(&tsv_data, config.table_min_confidence)?;
454
438
 
455
439
  if !words.is_empty() {
456
- let table = reconstruct_table(&words, config.table_column_threshold, config.table_row_threshold_ratio);
440
+ let table = reconstruct_table(
441
+ &words,
442
+ config.table_column_threshold,
443
+ config.table_row_threshold_ratio,
444
+ true,
445
+ );
457
446
  if !table.is_empty() {
458
447
  metadata.insert("table_count".to_string(), serde_json::Value::String("1".to_string()));
459
448
  metadata.insert(
@@ -161,7 +161,6 @@ impl OcrBackend for TesseractBackend {
161
161
  content: ocr_result.content,
162
162
  mime_type: ocr_result.mime_type,
163
163
  metadata,
164
- pages: None,
165
164
  tables: ocr_result
166
165
  .tables
167
166
  .into_iter()
@@ -215,7 +214,6 @@ impl OcrBackend for TesseractBackend {
215
214
  content: ocr_result.content,
216
215
  mime_type: ocr_result.mime_type,
217
216
  metadata,
218
- pages: None,
219
217
  tables: ocr_result
220
218
  .tables
221
219
  .into_iter()
@@ -10,8 +10,6 @@ pub enum PdfError {
10
10
  TextExtractionFailed(String),
11
11
  RenderingFailed(String),
12
12
  MetadataExtractionFailed(String),
13
- ExtractionFailed(String),
14
- FontLoadingFailed(String),
15
13
  IOError(String),
16
14
  }
17
15
 
@@ -30,8 +28,6 @@ impl fmt::Display for PdfError {
30
28
  PdfError::MetadataExtractionFailed(msg) => {
31
29
  write!(f, "Metadata extraction failed: {}", msg)
32
30
  }
33
- PdfError::ExtractionFailed(msg) => write!(f, "Extraction failed: {}", msg),
34
- PdfError::FontLoadingFailed(msg) => write!(f, "Font loading failed: {}", msg),
35
31
  PdfError::IOError(msg) => write!(f, "I/O error: {}", msg),
36
32
  }
37
33
  }
@@ -44,7 +40,7 @@ impl std::error::Error for PdfError {}
44
40
  impl From<lopdf::Error> for PdfError {
45
41
  fn from(err: lopdf::Error) -> Self {
46
42
  match err {
47
- lopdf::Error::IO(io_err) => PdfError::IOError(io_err.to_string()),
43
+ lopdf::Error::IO(_) => panic!("lopdf IO errors should not be converted to PdfError - let them bubble up"),
48
44
  _ => PdfError::InvalidPdf(err.to_string()),
49
45
  }
50
46
  }
@@ -52,30 +48,6 @@ impl From<lopdf::Error> for PdfError {
52
48
 
53
49
  pub type Result<T> = std::result::Result<T, PdfError>;
54
50
 
55
- /// Format a pdfium error for display.
56
- ///
57
- /// The kreuzberg-pdfium-render fork's error type doesn't implement Display,
58
- /// so Debug formatting produces messages like "PdfiumLibraryInternalError(FormatError,)"
59
- /// with trailing commas and parentheses. This function cleans up the formatting.
60
- pub(crate) fn format_pdfium_error<E: std::fmt::Debug>(error: E) -> String {
61
- let debug_msg = format!("{:?}", error);
62
-
63
- if let Some(paren_idx) = debug_msg.find('(') {
64
- let variant = &debug_msg[..paren_idx];
65
- let inner = &debug_msg[paren_idx + 1..];
66
-
67
- let inner_clean = inner.trim_end_matches(')').trim_end_matches(',');
68
-
69
- if inner_clean.is_empty() {
70
- variant.to_string()
71
- } else {
72
- format!("{}: {}", variant, inner_clean)
73
- }
74
- } else {
75
- debug_msg
76
- }
77
- }
78
-
79
51
  #[cfg(test)]
80
52
  mod tests {
81
53
  use super::*;
@@ -147,68 +119,4 @@ mod tests {
147
119
  let err2 = err1.clone();
148
120
  assert_eq!(err1.to_string(), err2.to_string());
149
121
  }
150
-
151
- #[test]
152
- fn test_extraction_failed_error() {
153
- let err = PdfError::ExtractionFailed("page data mismatch".to_string());
154
- assert_eq!(err.to_string(), "Extraction failed: page data mismatch");
155
- }
156
-
157
- #[test]
158
- fn test_font_loading_failed_error() {
159
- let err = PdfError::FontLoadingFailed("missing font file".to_string());
160
- assert_eq!(err.to_string(), "Font loading failed: missing font file");
161
- }
162
-
163
- #[test]
164
- fn test_format_pdfium_error_with_inner_value() {
165
- #[derive(Debug)]
166
- #[allow(dead_code)]
167
- struct MockError(String);
168
-
169
- let error = MockError("FormatError,".to_string());
170
- let formatted = format_pdfium_error(error);
171
- assert!(formatted.contains("MockError"));
172
- assert!(formatted.contains("FormatError"));
173
- }
174
-
175
- #[test]
176
- fn test_format_pdfium_error_simple() {
177
- #[derive(Debug)]
178
- struct SimpleError;
179
-
180
- let formatted = format_pdfium_error(SimpleError);
181
- assert_eq!(formatted, "SimpleError");
182
- }
183
-
184
- #[test]
185
- fn test_format_pdfium_error_empty_inner() {
186
- #[derive(Debug)]
187
- struct EmptyInner;
188
-
189
- let formatted = format_pdfium_error(EmptyInner);
190
- assert_eq!(formatted, "EmptyInner");
191
- }
192
-
193
- #[test]
194
- fn test_format_pdfium_error_cleans_trailing_comma() {
195
- #[derive(Debug)]
196
- #[allow(dead_code)]
197
- enum PdfiumError {
198
- PdfiumLibraryInternalError(InternalError),
199
- }
200
-
201
- #[derive(Debug)]
202
- #[allow(dead_code)]
203
- enum InternalError {
204
- FormatError,
205
- }
206
-
207
- let error = PdfiumError::PdfiumLibraryInternalError(InternalError::FormatError);
208
- let formatted = format_pdfium_error(error);
209
-
210
- assert!(!formatted.contains(",)"));
211
- assert!(formatted.contains("PdfiumLibraryInternalError"));
212
- assert!(formatted.contains("FormatError"));
213
- }
214
122
  }