kreuzberg 4.0.0.pre.rc.29 → 4.0.0.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (321):
  1. checksums.yaml +4 -4
  2. data/.gitignore +0 -6
  3. data/.rubocop.yaml +534 -1
  4. data/Gemfile +2 -1
  5. data/Gemfile.lock +28 -116
  6. data/README.md +269 -629
  7. data/Rakefile +0 -9
  8. data/Steepfile +4 -8
  9. data/examples/async_patterns.rb +58 -1
  10. data/ext/kreuzberg_rb/extconf.rb +5 -35
  11. data/ext/kreuzberg_rb/native/Cargo.toml +16 -55
  12. data/ext/kreuzberg_rb/native/build.rs +14 -12
  13. data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
  14. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
  15. data/ext/kreuzberg_rb/native/include/strings.h +2 -2
  16. data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
  17. data/ext/kreuzberg_rb/native/src/lib.rs +34 -897
  18. data/extconf.rb +6 -38
  19. data/kreuzberg.gemspec +20 -114
  20. data/lib/kreuzberg/api_proxy.rb +18 -2
  21. data/lib/kreuzberg/cache_api.rb +0 -22
  22. data/lib/kreuzberg/cli.rb +10 -2
  23. data/lib/kreuzberg/cli_proxy.rb +10 -0
  24. data/lib/kreuzberg/config.rb +22 -274
  25. data/lib/kreuzberg/errors.rb +7 -73
  26. data/lib/kreuzberg/extraction_api.rb +8 -237
  27. data/lib/kreuzberg/mcp_proxy.rb +11 -2
  28. data/lib/kreuzberg/ocr_backend_protocol.rb +73 -0
  29. data/lib/kreuzberg/post_processor_protocol.rb +71 -0
  30. data/lib/kreuzberg/result.rb +33 -151
  31. data/lib/kreuzberg/setup_lib_path.rb +2 -22
  32. data/lib/kreuzberg/validator_protocol.rb +73 -0
  33. data/lib/kreuzberg/version.rb +1 -1
  34. data/lib/kreuzberg.rb +13 -27
  35. data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
  36. data/sig/kreuzberg.rbs +12 -105
  37. data/spec/binding/cache_spec.rb +22 -22
  38. data/spec/binding/cli_proxy_spec.rb +4 -2
  39. data/spec/binding/cli_spec.rb +11 -12
  40. data/spec/binding/config_spec.rb +0 -74
  41. data/spec/binding/config_validation_spec.rb +6 -100
  42. data/spec/binding/error_handling_spec.rb +97 -283
  43. data/spec/binding/plugins/ocr_backend_spec.rb +8 -8
  44. data/spec/binding/plugins/postprocessor_spec.rb +11 -11
  45. data/spec/binding/plugins/validator_spec.rb +13 -12
  46. data/spec/examples.txt +104 -0
  47. data/spec/fixtures/config.toml +1 -0
  48. data/spec/fixtures/config.yaml +1 -0
  49. data/spec/fixtures/invalid_config.toml +1 -0
  50. data/spec/smoke/package_spec.rb +3 -2
  51. data/spec/spec_helper.rb +3 -1
  52. data/vendor/kreuzberg/Cargo.toml +67 -192
  53. data/vendor/kreuzberg/README.md +9 -97
  54. data/vendor/kreuzberg/build.rs +194 -516
  55. data/vendor/kreuzberg/src/api/handlers.rs +9 -130
  56. data/vendor/kreuzberg/src/api/mod.rs +3 -18
  57. data/vendor/kreuzberg/src/api/server.rs +71 -236
  58. data/vendor/kreuzberg/src/api/types.rs +7 -43
  59. data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
  60. data/vendor/kreuzberg/src/cache/mod.rs +3 -27
  61. data/vendor/kreuzberg/src/chunking/mod.rs +79 -1705
  62. data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
  63. data/vendor/kreuzberg/src/core/config.rs +23 -905
  64. data/vendor/kreuzberg/src/core/extractor.rs +106 -403
  65. data/vendor/kreuzberg/src/core/io.rs +2 -4
  66. data/vendor/kreuzberg/src/core/mime.rs +12 -2
  67. data/vendor/kreuzberg/src/core/mod.rs +3 -22
  68. data/vendor/kreuzberg/src/core/pipeline.rs +78 -395
  69. data/vendor/kreuzberg/src/embeddings.rs +21 -169
  70. data/vendor/kreuzberg/src/error.rs +2 -2
  71. data/vendor/kreuzberg/src/extraction/archive.rs +31 -36
  72. data/vendor/kreuzberg/src/extraction/docx.rs +1 -365
  73. data/vendor/kreuzberg/src/extraction/email.rs +11 -12
  74. data/vendor/kreuzberg/src/extraction/excel.rs +129 -138
  75. data/vendor/kreuzberg/src/extraction/html.rs +170 -1447
  76. data/vendor/kreuzberg/src/extraction/image.rs +14 -138
  77. data/vendor/kreuzberg/src/extraction/libreoffice.rs +3 -13
  78. data/vendor/kreuzberg/src/extraction/mod.rs +5 -21
  79. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
  80. data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
  81. data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
  82. data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
  83. data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
  84. data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
  85. data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
  86. data/vendor/kreuzberg/src/extraction/pptx.rs +94 -196
  87. data/vendor/kreuzberg/src/extraction/structured.rs +4 -5
  88. data/vendor/kreuzberg/src/extraction/table.rs +1 -2
  89. data/vendor/kreuzberg/src/extraction/text.rs +10 -18
  90. data/vendor/kreuzberg/src/extractors/archive.rs +0 -22
  91. data/vendor/kreuzberg/src/extractors/docx.rs +148 -69
  92. data/vendor/kreuzberg/src/extractors/email.rs +9 -37
  93. data/vendor/kreuzberg/src/extractors/excel.rs +40 -81
  94. data/vendor/kreuzberg/src/extractors/html.rs +173 -182
  95. data/vendor/kreuzberg/src/extractors/image.rs +8 -32
  96. data/vendor/kreuzberg/src/extractors/mod.rs +10 -171
  97. data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
  98. data/vendor/kreuzberg/src/extractors/pdf.rs +64 -329
  99. data/vendor/kreuzberg/src/extractors/pptx.rs +34 -79
  100. data/vendor/kreuzberg/src/extractors/structured.rs +0 -16
  101. data/vendor/kreuzberg/src/extractors/text.rs +7 -30
  102. data/vendor/kreuzberg/src/extractors/xml.rs +8 -27
  103. data/vendor/kreuzberg/src/keywords/processor.rs +1 -9
  104. data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
  105. data/vendor/kreuzberg/src/language_detection/mod.rs +51 -94
  106. data/vendor/kreuzberg/src/lib.rs +5 -17
  107. data/vendor/kreuzberg/src/mcp/mod.rs +1 -4
  108. data/vendor/kreuzberg/src/mcp/server.rs +21 -145
  109. data/vendor/kreuzberg/src/ocr/mod.rs +0 -2
  110. data/vendor/kreuzberg/src/ocr/processor.rs +8 -19
  111. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +0 -2
  112. data/vendor/kreuzberg/src/pdf/error.rs +1 -93
  113. data/vendor/kreuzberg/src/pdf/metadata.rs +100 -263
  114. data/vendor/kreuzberg/src/pdf/mod.rs +2 -33
  115. data/vendor/kreuzberg/src/pdf/rendering.rs +12 -12
  116. data/vendor/kreuzberg/src/pdf/table.rs +64 -61
  117. data/vendor/kreuzberg/src/pdf/text.rs +24 -416
  118. data/vendor/kreuzberg/src/plugins/extractor.rs +8 -40
  119. data/vendor/kreuzberg/src/plugins/mod.rs +0 -3
  120. data/vendor/kreuzberg/src/plugins/ocr.rs +14 -22
  121. data/vendor/kreuzberg/src/plugins/processor.rs +1 -10
  122. data/vendor/kreuzberg/src/plugins/registry.rs +0 -15
  123. data/vendor/kreuzberg/src/plugins/validator.rs +8 -20
  124. data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
  125. data/vendor/kreuzberg/src/text/mod.rs +0 -8
  126. data/vendor/kreuzberg/src/text/quality.rs +15 -28
  127. data/vendor/kreuzberg/src/text/string_utils.rs +10 -22
  128. data/vendor/kreuzberg/src/text/token_reduction/core.rs +50 -86
  129. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +16 -37
  130. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +1 -2
  131. data/vendor/kreuzberg/src/types.rs +67 -907
  132. data/vendor/kreuzberg/src/utils/mod.rs +0 -14
  133. data/vendor/kreuzberg/src/utils/quality.rs +3 -12
  134. data/vendor/kreuzberg/tests/api_tests.rs +0 -506
  135. data/vendor/kreuzberg/tests/archive_integration.rs +0 -2
  136. data/vendor/kreuzberg/tests/batch_orchestration.rs +12 -57
  137. data/vendor/kreuzberg/tests/batch_processing.rs +8 -32
  138. data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
  139. data/vendor/kreuzberg/tests/concurrency_stress.rs +8 -40
  140. data/vendor/kreuzberg/tests/config_features.rs +1 -33
  141. data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -16
  142. data/vendor/kreuzberg/tests/core_integration.rs +9 -35
  143. data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
  144. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
  145. data/vendor/kreuzberg/tests/email_integration.rs +1 -3
  146. data/vendor/kreuzberg/tests/error_handling.rs +34 -43
  147. data/vendor/kreuzberg/tests/format_integration.rs +1 -7
  148. data/vendor/kreuzberg/tests/helpers/mod.rs +0 -60
  149. data/vendor/kreuzberg/tests/image_integration.rs +0 -2
  150. data/vendor/kreuzberg/tests/mime_detection.rs +16 -17
  151. data/vendor/kreuzberg/tests/ocr_configuration.rs +0 -4
  152. data/vendor/kreuzberg/tests/ocr_errors.rs +0 -22
  153. data/vendor/kreuzberg/tests/ocr_quality.rs +0 -2
  154. data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
  155. data/vendor/kreuzberg/tests/pdf_integration.rs +0 -2
  156. data/vendor/kreuzberg/tests/pipeline_integration.rs +2 -36
  157. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +0 -5
  158. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -17
  159. data/vendor/kreuzberg/tests/plugin_system.rs +0 -6
  160. data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -2
  161. data/vendor/kreuzberg/tests/security_validation.rs +1 -13
  162. data/vendor/kreuzberg/tests/test_fastembed.rs +23 -45
  163. metadata +25 -171
  164. data/.rubocop.yml +0 -543
  165. data/ext/kreuzberg_rb/native/.cargo/config.toml +0 -23
  166. data/ext/kreuzberg_rb/native/Cargo.lock +0 -7619
  167. data/lib/kreuzberg/error_context.rb +0 -136
  168. data/lib/kreuzberg/types.rb +0 -170
  169. data/lib/libpdfium.so +0 -0
  170. data/spec/binding/async_operations_spec.rb +0 -473
  171. data/spec/binding/batch_operations_spec.rb +0 -595
  172. data/spec/binding/batch_spec.rb +0 -359
  173. data/spec/binding/config_result_spec.rb +0 -377
  174. data/spec/binding/embeddings_spec.rb +0 -816
  175. data/spec/binding/error_recovery_spec.rb +0 -488
  176. data/spec/binding/font_config_spec.rb +0 -220
  177. data/spec/binding/images_spec.rb +0 -738
  178. data/spec/binding/keywords_extraction_spec.rb +0 -600
  179. data/spec/binding/metadata_types_spec.rb +0 -1228
  180. data/spec/binding/pages_extraction_spec.rb +0 -471
  181. data/spec/binding/tables_spec.rb +0 -641
  182. data/spec/unit/config/chunking_config_spec.rb +0 -213
  183. data/spec/unit/config/embedding_config_spec.rb +0 -343
  184. data/spec/unit/config/extraction_config_spec.rb +0 -438
  185. data/spec/unit/config/font_config_spec.rb +0 -285
  186. data/spec/unit/config/hierarchy_config_spec.rb +0 -314
  187. data/spec/unit/config/image_extraction_config_spec.rb +0 -209
  188. data/spec/unit/config/image_preprocessing_config_spec.rb +0 -249
  189. data/spec/unit/config/keyword_config_spec.rb +0 -229
  190. data/spec/unit/config/language_detection_config_spec.rb +0 -258
  191. data/spec/unit/config/ocr_config_spec.rb +0 -171
  192. data/spec/unit/config/page_config_spec.rb +0 -221
  193. data/spec/unit/config/pdf_config_spec.rb +0 -267
  194. data/spec/unit/config/postprocessor_config_spec.rb +0 -290
  195. data/spec/unit/config/tesseract_config_spec.rb +0 -181
  196. data/spec/unit/config/token_reduction_config_spec.rb +0 -251
  197. data/test/metadata_types_test.rb +0 -959
  198. data/vendor/Cargo.toml +0 -61
  199. data/vendor/kreuzberg/examples/bench_fixes.rs +0 -71
  200. data/vendor/kreuzberg/examples/test_pdfium_fork.rs +0 -62
  201. data/vendor/kreuzberg/src/chunking/processor.rs +0 -219
  202. data/vendor/kreuzberg/src/core/batch_optimizations.rs +0 -385
  203. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  204. data/vendor/kreuzberg/src/core/formats.rs +0 -235
  205. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  206. data/vendor/kreuzberg/src/extraction/capacity.rs +0 -263
  207. data/vendor/kreuzberg/src/extraction/markdown.rs +0 -216
  208. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -284
  209. data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -470
  210. data/vendor/kreuzberg/src/extractors/docbook.rs +0 -504
  211. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  212. data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -492
  213. data/vendor/kreuzberg/src/extractors/jats.rs +0 -1054
  214. data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -368
  215. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  216. data/vendor/kreuzberg/src/extractors/markdown.rs +0 -701
  217. data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
  218. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  219. data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -529
  220. data/vendor/kreuzberg/src/extractors/rst.rs +0 -577
  221. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  222. data/vendor/kreuzberg/src/extractors/security.rs +0 -484
  223. data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
  224. data/vendor/kreuzberg/src/extractors/typst.rs +0 -651
  225. data/vendor/kreuzberg/src/language_detection/processor.rs +0 -218
  226. data/vendor/kreuzberg/src/ocr/language_registry.rs +0 -520
  227. data/vendor/kreuzberg/src/panic_context.rs +0 -154
  228. data/vendor/kreuzberg/src/pdf/bindings.rs +0 -306
  229. data/vendor/kreuzberg/src/pdf/bundled.rs +0 -408
  230. data/vendor/kreuzberg/src/pdf/fonts.rs +0 -358
  231. data/vendor/kreuzberg/src/pdf/hierarchy.rs +0 -903
  232. data/vendor/kreuzberg/src/text/quality_processor.rs +0 -231
  233. data/vendor/kreuzberg/src/text/utf8_validation.rs +0 -193
  234. data/vendor/kreuzberg/src/utils/pool.rs +0 -503
  235. data/vendor/kreuzberg/src/utils/pool_sizing.rs +0 -364
  236. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -761
  237. data/vendor/kreuzberg/tests/api_embed.rs +0 -360
  238. data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
  239. data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +0 -471
  240. data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +0 -289
  241. data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +0 -154
  242. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
  243. data/vendor/kreuzberg/tests/config_integration_test.rs +0 -753
  244. data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +0 -294
  245. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -500
  246. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
  247. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
  248. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
  249. data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
  250. data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
  251. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
  252. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
  253. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
  254. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
  255. data/vendor/kreuzberg/tests/ocr_language_registry.rs +0 -191
  256. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -674
  257. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
  258. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
  259. data/vendor/kreuzberg/tests/page_markers.rs +0 -297
  260. data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +0 -301
  261. data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +0 -589
  262. data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +0 -301
  263. data/vendor/kreuzberg/tests/pdf_text_merging.rs +0 -475
  264. data/vendor/kreuzberg/tests/pdfium_linking.rs +0 -340
  265. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -694
  266. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -775
  267. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1260
  268. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -648
  269. data/vendor/kreuzberg-ffi/Cargo.toml +0 -67
  270. data/vendor/kreuzberg-ffi/README.md +0 -851
  271. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +0 -227
  272. data/vendor/kreuzberg-ffi/build.rs +0 -168
  273. data/vendor/kreuzberg-ffi/cbindgen.toml +0 -37
  274. data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +0 -12
  275. data/vendor/kreuzberg-ffi/kreuzberg.h +0 -3012
  276. data/vendor/kreuzberg-ffi/src/batch_streaming.rs +0 -588
  277. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
  278. data/vendor/kreuzberg-ffi/src/error.rs +0 -901
  279. data/vendor/kreuzberg-ffi/src/extraction.rs +0 -555
  280. data/vendor/kreuzberg-ffi/src/helpers.rs +0 -879
  281. data/vendor/kreuzberg-ffi/src/lib.rs +0 -977
  282. data/vendor/kreuzberg-ffi/src/memory.rs +0 -493
  283. data/vendor/kreuzberg-ffi/src/mime.rs +0 -329
  284. data/vendor/kreuzberg-ffi/src/panic_shield.rs +0 -265
  285. data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +0 -442
  286. data/vendor/kreuzberg-ffi/src/plugins/mod.rs +0 -14
  287. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +0 -628
  288. data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +0 -438
  289. data/vendor/kreuzberg-ffi/src/plugins/validator.rs +0 -329
  290. data/vendor/kreuzberg-ffi/src/result.rs +0 -510
  291. data/vendor/kreuzberg-ffi/src/result_pool.rs +0 -639
  292. data/vendor/kreuzberg-ffi/src/result_view.rs +0 -773
  293. data/vendor/kreuzberg-ffi/src/string_intern.rs +0 -568
  294. data/vendor/kreuzberg-ffi/src/types.rs +0 -363
  295. data/vendor/kreuzberg-ffi/src/util.rs +0 -210
  296. data/vendor/kreuzberg-ffi/src/validation.rs +0 -848
  297. data/vendor/kreuzberg-ffi/tests.disabled/README.md +0 -48
  298. data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +0 -299
  299. data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +0 -346
  300. data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +0 -232
  301. data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +0 -470
  302. data/vendor/kreuzberg-tesseract/.commitlintrc.json +0 -13
  303. data/vendor/kreuzberg-tesseract/.crate-ignore +0 -2
  304. data/vendor/kreuzberg-tesseract/Cargo.lock +0 -2933
  305. data/vendor/kreuzberg-tesseract/Cargo.toml +0 -57
  306. data/vendor/kreuzberg-tesseract/LICENSE +0 -22
  307. data/vendor/kreuzberg-tesseract/README.md +0 -399
  308. data/vendor/kreuzberg-tesseract/build.rs +0 -1127
  309. data/vendor/kreuzberg-tesseract/patches/README.md +0 -71
  310. data/vendor/kreuzberg-tesseract/patches/tesseract.diff +0 -199
  311. data/vendor/kreuzberg-tesseract/src/api.rs +0 -1371
  312. data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +0 -77
  313. data/vendor/kreuzberg-tesseract/src/enums.rs +0 -297
  314. data/vendor/kreuzberg-tesseract/src/error.rs +0 -81
  315. data/vendor/kreuzberg-tesseract/src/lib.rs +0 -145
  316. data/vendor/kreuzberg-tesseract/src/monitor.rs +0 -57
  317. data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +0 -197
  318. data/vendor/kreuzberg-tesseract/src/page_iterator.rs +0 -253
  319. data/vendor/kreuzberg-tesseract/src/result_iterator.rs +0 -286
  320. data/vendor/kreuzberg-tesseract/src/result_renderer.rs +0 -183
  321. data/vendor/kreuzberg-tesseract/tests/integration_test.rs +0 -211
@@ -3,28 +3,19 @@
3
3
  use crate::Result;
4
4
  use crate::core::config::ExtractionConfig;
5
5
  use crate::plugins::{DocumentExtractor, Plugin};
6
- use crate::types::{ExtractionResult, Metadata, PageContent};
6
+ use crate::types::{ExtractionResult, Metadata};
7
7
  use async_trait::async_trait;
8
- #[cfg(feature = "tokio-runtime")]
9
8
  use std::path::Path;
10
9
 
11
10
  #[cfg(feature = "pdf")]
12
11
  use crate::pdf::error::PdfError;
13
12
  #[cfg(feature = "ocr")]
14
13
  use crate::pdf::rendering::{PageRenderOptions, PdfRenderer};
15
- #[cfg(feature = "pdf")]
14
+ #[cfg(all(feature = "pdf", feature = "ocr"))]
16
15
  use crate::types::Table;
17
16
  #[cfg(feature = "pdf")]
18
17
  use pdfium_render::prelude::*;
19
18
 
20
- #[cfg(feature = "pdf")]
21
- type PdfExtractionPhaseResult = (
22
- crate::pdf::metadata::PdfExtractionMetadata,
23
- String,
24
- Vec<Table>,
25
- Option<Vec<PageContent>>,
26
- );
27
-
28
19
  #[cfg(feature = "ocr")]
29
20
  const MIN_TOTAL_NON_WHITESPACE: usize = 64;
30
21
  #[cfg(feature = "ocr")]
@@ -146,37 +137,41 @@ fn evaluate_native_text_for_ocr(native_text: &str, page_count: Option<usize>) ->
146
137
  ///
147
138
  /// This function converts PDF character positions to HocrWord format,
148
139
  /// then uses the existing table reconstruction logic to detect tables.
149
- ///
150
- /// Uses the shared PdfDocument reference (wrapped in Arc<RwLock<>> for thread-safety).
151
140
  #[cfg(all(feature = "pdf", feature = "ocr"))]
152
141
  fn extract_tables_from_document(
153
142
  document: &PdfDocument,
154
- _metadata: &crate::pdf::metadata::PdfExtractionMetadata,
143
+ _metadata: &crate::pdf::metadata::PdfMetadata,
155
144
  ) -> Result<Vec<Table>> {
156
145
  use crate::ocr::table::{reconstruct_table, table_to_markdown};
157
146
  use crate::pdf::table::extract_words_from_page;
158
147
 
159
148
  let mut all_tables = Vec::new();
160
149
 
150
+ // Process each page
161
151
  for (page_index, page) in document.pages().iter().enumerate() {
162
- let words = extract_words_from_page(&page, 0.0)?;
152
+ // Extract words with positions from the page
153
+ let words = extract_words_from_page(&page, 0.0)?; // Use 0.0 confidence for PDF (always high quality)
163
154
 
164
155
  if words.is_empty() {
165
156
  continue;
166
157
  }
167
158
 
159
+ // Use existing table reconstruction logic
160
+ // These thresholds match the defaults from TesseractConfig
168
161
  let column_threshold = 50;
169
162
  let row_threshold_ratio = 0.5;
170
163
 
171
- let table_cells = reconstruct_table(&words, column_threshold, row_threshold_ratio);
164
+ // Reconstruct table from positioned words
165
+ let table_cells = reconstruct_table(&words, column_threshold, row_threshold_ratio, true);
172
166
 
173
167
  if !table_cells.is_empty() {
168
+ // Generate markdown representation
174
169
  let markdown = table_to_markdown(&table_cells);
175
170
 
176
171
  all_tables.push(Table {
177
172
  cells: table_cells,
178
173
  markdown,
179
- page_number: page_index + 1,
174
+ page_number: page_index + 1, // 1-indexed
180
175
  });
181
176
  }
182
177
  }
@@ -188,47 +183,11 @@ fn extract_tables_from_document(
188
183
  #[cfg(all(feature = "pdf", not(feature = "ocr")))]
189
184
  fn extract_tables_from_document(
190
185
  _document: &PdfDocument,
191
- _metadata: &crate::pdf::metadata::PdfExtractionMetadata,
186
+ _metadata: &crate::pdf::metadata::PdfMetadata,
192
187
  ) -> Result<Vec<crate::types::Table>> {
193
188
  Ok(vec![])
194
189
  }
195
190
 
196
- /// Helper function to assign tables and images to pages.
197
- ///
198
- /// If page_contents is None, returns None (no per-page tracking enabled).
199
- /// Otherwise, iterates through tables and images, assigning them to pages based on page_number.
200
- ///
201
- /// # Performance
202
- ///
203
- /// Uses Arc::new to wrap tables and images, avoiding expensive copies.
204
- /// This reduces memory overhead by enabling zero-copy sharing of table/image data
205
- /// across multiple references (e.g., when the same table appears on multiple pages).
206
- fn assign_tables_and_images_to_pages(
207
- mut page_contents: Option<Vec<PageContent>>,
208
- tables: &[crate::types::Table],
209
- images: &[crate::types::ExtractedImage],
210
- ) -> Option<Vec<PageContent>> {
211
- let pages = page_contents.take()?;
212
-
213
- let mut updated_pages = pages;
214
-
215
- for table in tables {
216
- if let Some(page) = updated_pages.iter_mut().find(|p| p.page_number == table.page_number) {
217
- page.tables.push(std::sync::Arc::new(table.clone()));
218
- }
219
- }
220
-
221
- for image in images {
222
- if let Some(page_num) = image.page_number
223
- && let Some(page) = updated_pages.iter_mut().find(|p| p.page_number == page_num)
224
- {
225
- page.images.push(std::sync::Arc::new(image.clone()));
226
- }
227
- }
228
-
229
- Some(updated_pages)
230
- }
231
-
232
191
  /// PDF document extractor using pypdfium2 and playa-pdf.
233
192
  pub struct PdfExtractor;
234
193
 
@@ -243,41 +202,6 @@ impl PdfExtractor {
243
202
  Self
244
203
  }
245
204
 
246
- /// Extract text, metadata, and tables from a PDF document using a single shared instance.
247
- ///
248
- /// This method consolidates all PDF extraction phases (text, metadata, tables) into a single
249
- /// operation using a single PdfDocument instance. This avoids redundant document parsing
250
- /// and pdfium initialization overhead.
251
- ///
252
- /// # Performance
253
- ///
254
- /// By reusing a single document instance across all extraction phases, we eliminate:
255
- /// - Duplicate document parsing overhead (25-40ms saved)
256
- /// - Redundant pdfium bindings initialization
257
- /// - Multiple page tree traversals
258
- ///
259
- /// Expected improvement: 20-30% faster PDF processing.
260
- ///
261
- /// # Returns
262
- ///
263
- /// A tuple containing:
264
- /// - PDF metadata (title, authors, dates, page structure, etc.)
265
- /// - Native extracted text (or empty if using OCR)
266
- /// - Extracted tables (if OCR feature enabled)
267
- /// - Per-page content (if page extraction configured)
268
- #[cfg(feature = "pdf")]
269
- fn extract_all_from_document(
270
- document: &PdfDocument,
271
- config: &ExtractionConfig,
272
- ) -> Result<PdfExtractionPhaseResult> {
273
- let (native_text, _boundaries, page_contents, pdf_metadata) =
274
- crate::pdf::text::extract_text_and_metadata_from_pdf_document(document, Some(config))?;
275
-
276
- let tables = extract_tables_from_document(document, &pdf_metadata)?;
277
-
278
- Ok((pdf_metadata, native_text, tables, page_contents))
279
- }
280
-
281
205
  /// Extract text from PDF using OCR.
282
206
  ///
283
207
  /// Renders all pages to images and processes them with OCR.
@@ -363,13 +287,6 @@ impl Plugin for PdfExtractor {
363
287
 
364
288
  #[async_trait]
365
289
  impl DocumentExtractor for PdfExtractor {
366
- #[cfg_attr(feature = "otel", tracing::instrument(
367
- skip(self, content, config),
368
- fields(
369
- extractor.name = self.name(),
370
- content.size_bytes = content.len(),
371
- )
372
- ))]
373
290
  async fn extract_bytes(
374
291
  &self,
375
292
  content: &[u8],
@@ -377,26 +294,18 @@ impl DocumentExtractor for PdfExtractor {
377
294
  config: &ExtractionConfig,
378
295
  ) -> Result<ExtractionResult> {
379
296
  #[cfg(feature = "pdf")]
380
- let (pdf_metadata, native_text, tables, page_contents) = {
381
- #[cfg(target_arch = "wasm32")]
382
- {
383
- let pdfium = crate::pdf::bindings::bind_pdfium(PdfError::MetadataExtractionFailed, "initialize Pdfium")
384
- .map_err(|pdf_err| {
385
- if pdf_err.to_string().contains("WASM") || pdf_err.to_string().contains("Module") {
386
- crate::error::KreuzbergError::Parsing {
387
- message: "PDF extraction requires proper WASM module initialization. \
388
- Ensure your WASM environment is set up with PDFium support. \
389
- See: https://docs.kreuzberg.dev/wasm/pdf"
390
- .to_string(),
391
- source: None,
392
- }
393
- } else {
394
- pdf_err.into()
395
- }
396
- })?;
397
-
398
- let document = pdfium.load_pdf_from_byte_slice(content, None).map_err(|e| {
399
- let err_msg = crate::pdf::error::format_pdfium_error(e);
297
+ let (pdf_metadata, native_text, tables) = if crate::core::batch_mode::is_batch_mode() {
298
+ // Batch mode: Move PDF extraction to blocking thread pool to enable parallelism
299
+ let content_owned = content.to_vec();
300
+ tokio::task::spawn_blocking(move || {
301
+ let bindings = Pdfium::bind_to_library(Pdfium::pdfium_platform_library_name_at_path("./"))
302
+ .or_else(|_| Pdfium::bind_to_system_library())
303
+ .map_err(|e| PdfError::MetadataExtractionFailed(format!("Failed to initialize Pdfium: {}", e)))?;
304
+
305
+ let pdfium = Pdfium::new(bindings);
306
+
307
+ let document = pdfium.load_pdf_from_byte_slice(&content_owned, None).map_err(|e| {
308
+ let err_msg = e.to_string();
400
309
  if err_msg.contains("password") || err_msg.contains("Password") {
401
310
  PdfError::PasswordRequired
402
311
  } else {
@@ -404,79 +313,40 @@ impl DocumentExtractor for PdfExtractor {
404
313
  }
405
314
  })?;
406
315
 
407
- Self::extract_all_from_document(&document, config)?
408
- }
409
- #[cfg(all(not(target_arch = "wasm32"), feature = "tokio-runtime"))]
410
- {
411
- if crate::core::batch_mode::is_batch_mode() {
412
- let content_owned = content.to_vec();
413
- let span = tracing::Span::current();
414
- let config_owned = config.clone();
415
- tokio::task::spawn_blocking(move || {
416
- let _guard = span.entered();
417
-
418
- let pdfium =
419
- crate::pdf::bindings::bind_pdfium(PdfError::MetadataExtractionFailed, "initialize Pdfium")?;
420
-
421
- let document = pdfium.load_pdf_from_byte_slice(&content_owned, None).map_err(|e| {
422
- let err_msg = crate::pdf::error::format_pdfium_error(e);
423
- if err_msg.contains("password") || err_msg.contains("Password") {
424
- PdfError::PasswordRequired
425
- } else {
426
- PdfError::InvalidPdf(err_msg)
427
- }
428
- })?;
429
-
430
- let (pdf_metadata, native_text, tables, page_contents) =
431
- Self::extract_all_from_document(&document, &config_owned)?;
432
-
433
- if let Some(page_cfg) = config_owned.pages.as_ref()
434
- && page_cfg.extract_pages
435
- && page_contents.is_none()
436
- {
437
- return Err(PdfError::ExtractionFailed(
438
- "Page extraction was configured but no page data was extracted in batch mode"
439
- .to_string(),
440
- )
441
- .into());
442
- }
443
-
444
- Ok::<_, crate::error::KreuzbergError>((pdf_metadata, native_text, tables, page_contents))
445
- })
446
- .await
447
- .map_err(|e| crate::error::KreuzbergError::Other(format!("PDF extraction task failed: {}", e)))??
316
+ let metadata = crate::pdf::metadata::extract_metadata_from_document(&document)?;
317
+ let native_text = crate::pdf::text::extract_text_from_pdf_document(&document)?;
318
+
319
+ // Extract tables from native PDF text (when not using OCR)
320
+ let tables = extract_tables_from_document(&document, &metadata)?;
321
+
322
+ Ok::<_, crate::error::KreuzbergError>((metadata, native_text, tables))
323
+ })
324
+ .await
325
+ .map_err(|e| crate::error::KreuzbergError::Other(format!("PDF extraction task failed: {}", e)))??
326
+ } else {
327
+ // Single-file mode: Direct extraction (no spawn overhead)
328
+ let bindings = Pdfium::bind_to_library(Pdfium::pdfium_platform_library_name_at_path("./"))
329
+ .or_else(|_| Pdfium::bind_to_system_library())
330
+ .map_err(|e| PdfError::MetadataExtractionFailed(format!("Failed to initialize Pdfium: {}", e)))?;
331
+
332
+ let pdfium = Pdfium::new(bindings);
333
+
334
+ let document = pdfium.load_pdf_from_byte_slice(content, None).map_err(|e| {
335
+ let err_msg = e.to_string();
336
+ if err_msg.contains("password") || err_msg.contains("Password") {
337
+ PdfError::PasswordRequired
448
338
  } else {
449
- let pdfium =
450
- crate::pdf::bindings::bind_pdfium(PdfError::MetadataExtractionFailed, "initialize Pdfium")?;
451
-
452
- let document = pdfium.load_pdf_from_byte_slice(content, None).map_err(|e| {
453
- let err_msg = crate::pdf::error::format_pdfium_error(e);
454
- if err_msg.contains("password") || err_msg.contains("Password") {
455
- PdfError::PasswordRequired
456
- } else {
457
- PdfError::InvalidPdf(err_msg)
458
- }
459
- })?;
460
-
461
- Self::extract_all_from_document(&document, config)?
339
+ PdfError::InvalidPdf(err_msg)
462
340
  }
463
- }
464
- #[cfg(all(not(target_arch = "wasm32"), not(feature = "tokio-runtime")))]
465
- {
466
- let pdfium =
467
- crate::pdf::bindings::bind_pdfium(PdfError::MetadataExtractionFailed, "initialize Pdfium")?;
341
+ })?;
468
342
 
469
- let document = pdfium.load_pdf_from_byte_slice(content, None).map_err(|e| {
470
- let err_msg = crate::pdf::error::format_pdfium_error(e);
471
- if err_msg.contains("password") || err_msg.contains("Password") {
472
- PdfError::PasswordRequired
473
- } else {
474
- PdfError::InvalidPdf(err_msg)
475
- }
476
- })?;
343
+ let metadata = crate::pdf::metadata::extract_metadata_from_document(&document)?;
344
+ let native_text = crate::pdf::text::extract_text_from_pdf_document(&document)?;
477
345
 
478
- Self::extract_all_from_document(&document, config)?
479
- }
346
+ // Extract tables from native PDF text (when not using OCR)
347
+ let tables = extract_tables_from_document(&document, &metadata)?;
348
+
349
+ (metadata, native_text, tables)
480
350
  };
481
351
 
482
352
  #[cfg(feature = "ocr")]
@@ -487,19 +357,20 @@ impl DocumentExtractor for PdfExtractor {
487
357
  native_text
488
358
  }
489
359
  } else if config.ocr.is_some() {
490
- let decision = evaluate_native_text_for_ocr(&native_text, None);
360
+ let decision = evaluate_native_text_for_ocr(&native_text, pdf_metadata.page_count);
491
361
 
492
362
  if std::env::var("KREUZBERG_DEBUG_OCR").is_ok() {
493
363
  eprintln!(
494
364
  "[kreuzberg::pdf::ocr] fallback={} non_whitespace={} alnum={} meaningful_words={} \
495
- avg_non_whitespace={:.2} avg_alnum={:.2} alnum_ratio={:.3}",
365
+ avg_non_whitespace={:.2} avg_alnum={:.2} alnum_ratio={:.3} pages={}",
496
366
  decision.fallback,
497
367
  decision.stats.non_whitespace,
498
368
  decision.stats.alnum,
499
369
  decision.stats.meaningful_words,
500
370
  decision.avg_non_whitespace,
501
371
  decision.avg_alnum,
502
- decision.stats.alnum_ratio
372
+ decision.stats.alnum_ratio,
373
+ pdf_metadata.page_count.unwrap_or(0)
503
374
  );
504
375
  }
505
376
 
@@ -515,22 +386,7 @@ impl DocumentExtractor for PdfExtractor {
515
386
  #[cfg(not(feature = "ocr"))]
516
387
  let text = native_text;
517
388
 
518
- #[cfg(feature = "pdf")]
519
- if let Some(ref page_cfg) = config.pages
520
- && page_cfg.insert_page_markers
521
- {
522
- let marker_placeholder = page_cfg.marker_format.replace("{page_num}", "");
523
- if !marker_placeholder.is_empty() && !text.contains(&marker_placeholder) {
524
- #[cfg(feature = "otel")]
525
- tracing::warn!(
526
- "Page markers were configured but none found in extracted content. \
527
- This may indicate very short documents or incomplete extraction."
528
- );
529
- }
530
- }
531
-
532
- let images = if config.images.as_ref().map(|c| c.extract_images).unwrap_or(false) {
533
- // Image extraction is enabled, extract images if present
389
+ let images = if config.images.is_some() {
534
390
  match crate::pdf::images::extract_images_from_pdf(content) {
535
391
  Ok(pdf_images) => Some(
536
392
  pdf_images
@@ -554,41 +410,23 @@ impl DocumentExtractor for PdfExtractor {
554
410
  })
555
411
  .collect(),
556
412
  ),
557
- // If extraction fails, return empty vector instead of None
558
- Err(_) => Some(vec![]),
413
+ Err(_) => None,
559
414
  }
560
415
  } else {
561
- // Image extraction is not enabled
562
416
  None
563
417
  };
564
418
 
565
- let final_pages = assign_tables_and_images_to_pages(page_contents, &tables, images.as_deref().unwrap_or(&[]));
419
+ // Tables were extracted during metadata/text extraction phase
420
+ // (see extract_tables_from_document function below)
566
421
 
567
422
  Ok(ExtractionResult {
568
423
  content: text,
569
424
  mime_type: mime_type.to_string(),
570
425
  metadata: Metadata {
571
426
  #[cfg(feature = "pdf")]
572
- title: pdf_metadata.title.clone(),
573
- #[cfg(feature = "pdf")]
574
- subject: pdf_metadata.subject.clone(),
575
- #[cfg(feature = "pdf")]
576
- authors: pdf_metadata.authors.clone(),
577
- #[cfg(feature = "pdf")]
578
- keywords: pdf_metadata.keywords.clone(),
579
- #[cfg(feature = "pdf")]
580
- created_at: pdf_metadata.created_at.clone(),
581
- #[cfg(feature = "pdf")]
582
- modified_at: pdf_metadata.modified_at.clone(),
583
- #[cfg(feature = "pdf")]
584
- created_by: pdf_metadata.created_by.clone(),
585
- #[cfg(feature = "pdf")]
586
- pages: pdf_metadata.page_structure.clone(),
587
- #[cfg(feature = "pdf")]
588
- format: Some(crate::types::FormatMetadata::Pdf(pdf_metadata.pdf_specific)),
427
+ format: Some(crate::types::FormatMetadata::Pdf(pdf_metadata)),
589
428
  ..Default::default()
590
429
  },
591
- pages: final_pages,
592
430
  tables,
593
431
  detected_languages: None,
594
432
  chunks: None,
@@ -596,7 +434,6 @@ impl DocumentExtractor for PdfExtractor {
596
434
  })
597
435
  }
598
436
 
599
- #[cfg(feature = "tokio-runtime")]
600
437
  async fn extract_file(&self, path: &Path, mime_type: &str, config: &ExtractionConfig) -> Result<ExtractionResult> {
601
438
  let bytes = tokio::fs::read(path).await?;
602
439
  self.extract_bytes(&bytes, mime_type, config).await
@@ -656,106 +493,4 @@ mod tests {
656
493
  let sample = " . , ; : -- -- ";
657
494
  assert!(evaluate_native_text_for_ocr(sample, Some(2)).fallback);
658
495
  }
659
-
660
- #[tokio::test]
661
- #[cfg(feature = "pdf")]
662
- async fn test_pdf_batch_mode_validates_page_config_enabled() {
663
- use crate::core::config::PageConfig;
664
-
665
- let extractor = PdfExtractor::new();
666
-
667
- let config = ExtractionConfig {
668
- pages: Some(PageConfig {
669
- extract_pages: true,
670
- insert_page_markers: false,
671
- marker_format: "<!-- PAGE {page_num} -->".to_string(),
672
- }),
673
- ..Default::default()
674
- };
675
-
676
- let pdf_path =
677
- std::path::Path::new(env!("CARGO_MANIFEST_DIR")).join("../../test_documents/pdfs/google_doc_document.pdf");
678
- if let Ok(content) = std::fs::read(pdf_path) {
679
- let result = extractor.extract_bytes(&content, "application/pdf", &config).await;
680
- assert!(
681
- result.is_ok(),
682
- "Failed to extract PDF with page config: {:?}",
683
- result.err()
684
- );
685
-
686
- let extraction_result = result.unwrap();
687
- assert!(
688
- extraction_result.pages.is_some(),
689
- "Pages should be extracted when extract_pages is true"
690
- );
691
- }
692
- }
693
-
694
- #[tokio::test]
695
- #[cfg(feature = "pdf")]
696
- async fn test_pdf_batch_mode_validates_page_config_disabled() {
697
- let extractor = PdfExtractor::new();
698
- let config = ExtractionConfig::default();
699
-
700
- let pdf_path =
701
- std::path::Path::new(env!("CARGO_MANIFEST_DIR")).join("../../test_documents/pdfs/google_doc_document.pdf");
702
- if let Ok(content) = std::fs::read(pdf_path) {
703
- let result = extractor.extract_bytes(&content, "application/pdf", &config).await;
704
- assert!(
705
- result.is_ok(),
706
- "Failed to extract PDF without page config: {:?}",
707
- result.err()
708
- );
709
-
710
- let extraction_result = result.unwrap();
711
- assert!(
712
- extraction_result.pages.is_none(),
713
- "Pages should not be extracted when pages config is None"
714
- );
715
- }
716
- }
717
-
718
- #[tokio::test]
719
- #[cfg(feature = "pdf")]
720
- async fn test_pdf_page_marker_validation() {
721
- use crate::core::config::PageConfig;
722
-
723
- let extractor = PdfExtractor::new();
724
-
725
- let config = ExtractionConfig {
726
- pages: Some(PageConfig {
727
- extract_pages: true,
728
- insert_page_markers: true,
729
- marker_format: "\n\n<!-- PAGE {page_num} -->\n\n".to_string(),
730
- }),
731
- ..Default::default()
732
- };
733
-
734
- let pdf_path =
735
- std::path::Path::new(env!("CARGO_MANIFEST_DIR")).join("../../test_documents/pdfs/multi_page.pdf");
736
- if let Ok(content) = std::fs::read(pdf_path) {
737
- let result = extractor.extract_bytes(&content, "application/pdf", &config).await;
738
- assert!(
739
- result.is_ok(),
740
- "Failed to extract PDF with page markers: {:?}",
741
- result.err()
742
- );
743
-
744
- let extraction_result = result.unwrap();
745
- let marker_placeholder = "<!-- PAGE ";
746
- if extraction_result.content.len() > 100 {
747
- assert!(
748
- extraction_result.content.contains(marker_placeholder),
749
- "Page markers should be inserted when configured and document has multiple pages"
750
- );
751
- }
752
- }
753
- }
754
-
755
- #[test]
756
- #[cfg(feature = "pdf")]
757
- fn test_pdf_extractor_without_feature_pdf() {
758
- let extractor = PdfExtractor::new();
759
- assert_eq!(extractor.name(), "pdf-extractor");
760
- }
761
496
  }