kreuzberg 4.0.0.pre.rc.29 → 4.0.0.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (321)
  1. checksums.yaml +4 -4
  2. data/.gitignore +0 -6
  3. data/.rubocop.yaml +534 -1
  4. data/Gemfile +2 -1
  5. data/Gemfile.lock +28 -116
  6. data/README.md +269 -629
  7. data/Rakefile +0 -9
  8. data/Steepfile +4 -8
  9. data/examples/async_patterns.rb +58 -1
  10. data/ext/kreuzberg_rb/extconf.rb +5 -35
  11. data/ext/kreuzberg_rb/native/Cargo.toml +16 -55
  12. data/ext/kreuzberg_rb/native/build.rs +14 -12
  13. data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
  14. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
  15. data/ext/kreuzberg_rb/native/include/strings.h +2 -2
  16. data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
  17. data/ext/kreuzberg_rb/native/src/lib.rs +34 -897
  18. data/extconf.rb +6 -38
  19. data/kreuzberg.gemspec +20 -114
  20. data/lib/kreuzberg/api_proxy.rb +18 -2
  21. data/lib/kreuzberg/cache_api.rb +0 -22
  22. data/lib/kreuzberg/cli.rb +10 -2
  23. data/lib/kreuzberg/cli_proxy.rb +10 -0
  24. data/lib/kreuzberg/config.rb +22 -274
  25. data/lib/kreuzberg/errors.rb +7 -73
  26. data/lib/kreuzberg/extraction_api.rb +8 -237
  27. data/lib/kreuzberg/mcp_proxy.rb +11 -2
  28. data/lib/kreuzberg/ocr_backend_protocol.rb +73 -0
  29. data/lib/kreuzberg/post_processor_protocol.rb +71 -0
  30. data/lib/kreuzberg/result.rb +33 -151
  31. data/lib/kreuzberg/setup_lib_path.rb +2 -22
  32. data/lib/kreuzberg/validator_protocol.rb +73 -0
  33. data/lib/kreuzberg/version.rb +1 -1
  34. data/lib/kreuzberg.rb +13 -27
  35. data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
  36. data/sig/kreuzberg.rbs +12 -105
  37. data/spec/binding/cache_spec.rb +22 -22
  38. data/spec/binding/cli_proxy_spec.rb +4 -2
  39. data/spec/binding/cli_spec.rb +11 -12
  40. data/spec/binding/config_spec.rb +0 -74
  41. data/spec/binding/config_validation_spec.rb +6 -100
  42. data/spec/binding/error_handling_spec.rb +97 -283
  43. data/spec/binding/plugins/ocr_backend_spec.rb +8 -8
  44. data/spec/binding/plugins/postprocessor_spec.rb +11 -11
  45. data/spec/binding/plugins/validator_spec.rb +13 -12
  46. data/spec/examples.txt +104 -0
  47. data/spec/fixtures/config.toml +1 -0
  48. data/spec/fixtures/config.yaml +1 -0
  49. data/spec/fixtures/invalid_config.toml +1 -0
  50. data/spec/smoke/package_spec.rb +3 -2
  51. data/spec/spec_helper.rb +3 -1
  52. data/vendor/kreuzberg/Cargo.toml +67 -192
  53. data/vendor/kreuzberg/README.md +9 -97
  54. data/vendor/kreuzberg/build.rs +194 -516
  55. data/vendor/kreuzberg/src/api/handlers.rs +9 -130
  56. data/vendor/kreuzberg/src/api/mod.rs +3 -18
  57. data/vendor/kreuzberg/src/api/server.rs +71 -236
  58. data/vendor/kreuzberg/src/api/types.rs +7 -43
  59. data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
  60. data/vendor/kreuzberg/src/cache/mod.rs +3 -27
  61. data/vendor/kreuzberg/src/chunking/mod.rs +79 -1705
  62. data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
  63. data/vendor/kreuzberg/src/core/config.rs +23 -905
  64. data/vendor/kreuzberg/src/core/extractor.rs +106 -403
  65. data/vendor/kreuzberg/src/core/io.rs +2 -4
  66. data/vendor/kreuzberg/src/core/mime.rs +12 -2
  67. data/vendor/kreuzberg/src/core/mod.rs +3 -22
  68. data/vendor/kreuzberg/src/core/pipeline.rs +78 -395
  69. data/vendor/kreuzberg/src/embeddings.rs +21 -169
  70. data/vendor/kreuzberg/src/error.rs +2 -2
  71. data/vendor/kreuzberg/src/extraction/archive.rs +31 -36
  72. data/vendor/kreuzberg/src/extraction/docx.rs +1 -365
  73. data/vendor/kreuzberg/src/extraction/email.rs +11 -12
  74. data/vendor/kreuzberg/src/extraction/excel.rs +129 -138
  75. data/vendor/kreuzberg/src/extraction/html.rs +170 -1447
  76. data/vendor/kreuzberg/src/extraction/image.rs +14 -138
  77. data/vendor/kreuzberg/src/extraction/libreoffice.rs +3 -13
  78. data/vendor/kreuzberg/src/extraction/mod.rs +5 -21
  79. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
  80. data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
  81. data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
  82. data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
  83. data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
  84. data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
  85. data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
  86. data/vendor/kreuzberg/src/extraction/pptx.rs +94 -196
  87. data/vendor/kreuzberg/src/extraction/structured.rs +4 -5
  88. data/vendor/kreuzberg/src/extraction/table.rs +1 -2
  89. data/vendor/kreuzberg/src/extraction/text.rs +10 -18
  90. data/vendor/kreuzberg/src/extractors/archive.rs +0 -22
  91. data/vendor/kreuzberg/src/extractors/docx.rs +148 -69
  92. data/vendor/kreuzberg/src/extractors/email.rs +9 -37
  93. data/vendor/kreuzberg/src/extractors/excel.rs +40 -81
  94. data/vendor/kreuzberg/src/extractors/html.rs +173 -182
  95. data/vendor/kreuzberg/src/extractors/image.rs +8 -32
  96. data/vendor/kreuzberg/src/extractors/mod.rs +10 -171
  97. data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
  98. data/vendor/kreuzberg/src/extractors/pdf.rs +64 -329
  99. data/vendor/kreuzberg/src/extractors/pptx.rs +34 -79
  100. data/vendor/kreuzberg/src/extractors/structured.rs +0 -16
  101. data/vendor/kreuzberg/src/extractors/text.rs +7 -30
  102. data/vendor/kreuzberg/src/extractors/xml.rs +8 -27
  103. data/vendor/kreuzberg/src/keywords/processor.rs +1 -9
  104. data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
  105. data/vendor/kreuzberg/src/language_detection/mod.rs +51 -94
  106. data/vendor/kreuzberg/src/lib.rs +5 -17
  107. data/vendor/kreuzberg/src/mcp/mod.rs +1 -4
  108. data/vendor/kreuzberg/src/mcp/server.rs +21 -145
  109. data/vendor/kreuzberg/src/ocr/mod.rs +0 -2
  110. data/vendor/kreuzberg/src/ocr/processor.rs +8 -19
  111. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +0 -2
  112. data/vendor/kreuzberg/src/pdf/error.rs +1 -93
  113. data/vendor/kreuzberg/src/pdf/metadata.rs +100 -263
  114. data/vendor/kreuzberg/src/pdf/mod.rs +2 -33
  115. data/vendor/kreuzberg/src/pdf/rendering.rs +12 -12
  116. data/vendor/kreuzberg/src/pdf/table.rs +64 -61
  117. data/vendor/kreuzberg/src/pdf/text.rs +24 -416
  118. data/vendor/kreuzberg/src/plugins/extractor.rs +8 -40
  119. data/vendor/kreuzberg/src/plugins/mod.rs +0 -3
  120. data/vendor/kreuzberg/src/plugins/ocr.rs +14 -22
  121. data/vendor/kreuzberg/src/plugins/processor.rs +1 -10
  122. data/vendor/kreuzberg/src/plugins/registry.rs +0 -15
  123. data/vendor/kreuzberg/src/plugins/validator.rs +8 -20
  124. data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
  125. data/vendor/kreuzberg/src/text/mod.rs +0 -8
  126. data/vendor/kreuzberg/src/text/quality.rs +15 -28
  127. data/vendor/kreuzberg/src/text/string_utils.rs +10 -22
  128. data/vendor/kreuzberg/src/text/token_reduction/core.rs +50 -86
  129. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +16 -37
  130. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +1 -2
  131. data/vendor/kreuzberg/src/types.rs +67 -907
  132. data/vendor/kreuzberg/src/utils/mod.rs +0 -14
  133. data/vendor/kreuzberg/src/utils/quality.rs +3 -12
  134. data/vendor/kreuzberg/tests/api_tests.rs +0 -506
  135. data/vendor/kreuzberg/tests/archive_integration.rs +0 -2
  136. data/vendor/kreuzberg/tests/batch_orchestration.rs +12 -57
  137. data/vendor/kreuzberg/tests/batch_processing.rs +8 -32
  138. data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
  139. data/vendor/kreuzberg/tests/concurrency_stress.rs +8 -40
  140. data/vendor/kreuzberg/tests/config_features.rs +1 -33
  141. data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -16
  142. data/vendor/kreuzberg/tests/core_integration.rs +9 -35
  143. data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
  144. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
  145. data/vendor/kreuzberg/tests/email_integration.rs +1 -3
  146. data/vendor/kreuzberg/tests/error_handling.rs +34 -43
  147. data/vendor/kreuzberg/tests/format_integration.rs +1 -7
  148. data/vendor/kreuzberg/tests/helpers/mod.rs +0 -60
  149. data/vendor/kreuzberg/tests/image_integration.rs +0 -2
  150. data/vendor/kreuzberg/tests/mime_detection.rs +16 -17
  151. data/vendor/kreuzberg/tests/ocr_configuration.rs +0 -4
  152. data/vendor/kreuzberg/tests/ocr_errors.rs +0 -22
  153. data/vendor/kreuzberg/tests/ocr_quality.rs +0 -2
  154. data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
  155. data/vendor/kreuzberg/tests/pdf_integration.rs +0 -2
  156. data/vendor/kreuzberg/tests/pipeline_integration.rs +2 -36
  157. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +0 -5
  158. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -17
  159. data/vendor/kreuzberg/tests/plugin_system.rs +0 -6
  160. data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -2
  161. data/vendor/kreuzberg/tests/security_validation.rs +1 -13
  162. data/vendor/kreuzberg/tests/test_fastembed.rs +23 -45
  163. metadata +25 -171
  164. data/.rubocop.yml +0 -543
  165. data/ext/kreuzberg_rb/native/.cargo/config.toml +0 -23
  166. data/ext/kreuzberg_rb/native/Cargo.lock +0 -7619
  167. data/lib/kreuzberg/error_context.rb +0 -136
  168. data/lib/kreuzberg/types.rb +0 -170
  169. data/lib/libpdfium.so +0 -0
  170. data/spec/binding/async_operations_spec.rb +0 -473
  171. data/spec/binding/batch_operations_spec.rb +0 -595
  172. data/spec/binding/batch_spec.rb +0 -359
  173. data/spec/binding/config_result_spec.rb +0 -377
  174. data/spec/binding/embeddings_spec.rb +0 -816
  175. data/spec/binding/error_recovery_spec.rb +0 -488
  176. data/spec/binding/font_config_spec.rb +0 -220
  177. data/spec/binding/images_spec.rb +0 -738
  178. data/spec/binding/keywords_extraction_spec.rb +0 -600
  179. data/spec/binding/metadata_types_spec.rb +0 -1228
  180. data/spec/binding/pages_extraction_spec.rb +0 -471
  181. data/spec/binding/tables_spec.rb +0 -641
  182. data/spec/unit/config/chunking_config_spec.rb +0 -213
  183. data/spec/unit/config/embedding_config_spec.rb +0 -343
  184. data/spec/unit/config/extraction_config_spec.rb +0 -438
  185. data/spec/unit/config/font_config_spec.rb +0 -285
  186. data/spec/unit/config/hierarchy_config_spec.rb +0 -314
  187. data/spec/unit/config/image_extraction_config_spec.rb +0 -209
  188. data/spec/unit/config/image_preprocessing_config_spec.rb +0 -249
  189. data/spec/unit/config/keyword_config_spec.rb +0 -229
  190. data/spec/unit/config/language_detection_config_spec.rb +0 -258
  191. data/spec/unit/config/ocr_config_spec.rb +0 -171
  192. data/spec/unit/config/page_config_spec.rb +0 -221
  193. data/spec/unit/config/pdf_config_spec.rb +0 -267
  194. data/spec/unit/config/postprocessor_config_spec.rb +0 -290
  195. data/spec/unit/config/tesseract_config_spec.rb +0 -181
  196. data/spec/unit/config/token_reduction_config_spec.rb +0 -251
  197. data/test/metadata_types_test.rb +0 -959
  198. data/vendor/Cargo.toml +0 -61
  199. data/vendor/kreuzberg/examples/bench_fixes.rs +0 -71
  200. data/vendor/kreuzberg/examples/test_pdfium_fork.rs +0 -62
  201. data/vendor/kreuzberg/src/chunking/processor.rs +0 -219
  202. data/vendor/kreuzberg/src/core/batch_optimizations.rs +0 -385
  203. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  204. data/vendor/kreuzberg/src/core/formats.rs +0 -235
  205. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  206. data/vendor/kreuzberg/src/extraction/capacity.rs +0 -263
  207. data/vendor/kreuzberg/src/extraction/markdown.rs +0 -216
  208. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -284
  209. data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -470
  210. data/vendor/kreuzberg/src/extractors/docbook.rs +0 -504
  211. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  212. data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -492
  213. data/vendor/kreuzberg/src/extractors/jats.rs +0 -1054
  214. data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -368
  215. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  216. data/vendor/kreuzberg/src/extractors/markdown.rs +0 -701
  217. data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
  218. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  219. data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -529
  220. data/vendor/kreuzberg/src/extractors/rst.rs +0 -577
  221. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  222. data/vendor/kreuzberg/src/extractors/security.rs +0 -484
  223. data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
  224. data/vendor/kreuzberg/src/extractors/typst.rs +0 -651
  225. data/vendor/kreuzberg/src/language_detection/processor.rs +0 -218
  226. data/vendor/kreuzberg/src/ocr/language_registry.rs +0 -520
  227. data/vendor/kreuzberg/src/panic_context.rs +0 -154
  228. data/vendor/kreuzberg/src/pdf/bindings.rs +0 -306
  229. data/vendor/kreuzberg/src/pdf/bundled.rs +0 -408
  230. data/vendor/kreuzberg/src/pdf/fonts.rs +0 -358
  231. data/vendor/kreuzberg/src/pdf/hierarchy.rs +0 -903
  232. data/vendor/kreuzberg/src/text/quality_processor.rs +0 -231
  233. data/vendor/kreuzberg/src/text/utf8_validation.rs +0 -193
  234. data/vendor/kreuzberg/src/utils/pool.rs +0 -503
  235. data/vendor/kreuzberg/src/utils/pool_sizing.rs +0 -364
  236. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -761
  237. data/vendor/kreuzberg/tests/api_embed.rs +0 -360
  238. data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
  239. data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +0 -471
  240. data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +0 -289
  241. data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +0 -154
  242. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
  243. data/vendor/kreuzberg/tests/config_integration_test.rs +0 -753
  244. data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +0 -294
  245. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -500
  246. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
  247. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
  248. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
  249. data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
  250. data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
  251. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
  252. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
  253. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
  254. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
  255. data/vendor/kreuzberg/tests/ocr_language_registry.rs +0 -191
  256. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -674
  257. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
  258. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
  259. data/vendor/kreuzberg/tests/page_markers.rs +0 -297
  260. data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +0 -301
  261. data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +0 -589
  262. data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +0 -301
  263. data/vendor/kreuzberg/tests/pdf_text_merging.rs +0 -475
  264. data/vendor/kreuzberg/tests/pdfium_linking.rs +0 -340
  265. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -694
  266. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -775
  267. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1260
  268. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -648
  269. data/vendor/kreuzberg-ffi/Cargo.toml +0 -67
  270. data/vendor/kreuzberg-ffi/README.md +0 -851
  271. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +0 -227
  272. data/vendor/kreuzberg-ffi/build.rs +0 -168
  273. data/vendor/kreuzberg-ffi/cbindgen.toml +0 -37
  274. data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +0 -12
  275. data/vendor/kreuzberg-ffi/kreuzberg.h +0 -3012
  276. data/vendor/kreuzberg-ffi/src/batch_streaming.rs +0 -588
  277. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
  278. data/vendor/kreuzberg-ffi/src/error.rs +0 -901
  279. data/vendor/kreuzberg-ffi/src/extraction.rs +0 -555
  280. data/vendor/kreuzberg-ffi/src/helpers.rs +0 -879
  281. data/vendor/kreuzberg-ffi/src/lib.rs +0 -977
  282. data/vendor/kreuzberg-ffi/src/memory.rs +0 -493
  283. data/vendor/kreuzberg-ffi/src/mime.rs +0 -329
  284. data/vendor/kreuzberg-ffi/src/panic_shield.rs +0 -265
  285. data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +0 -442
  286. data/vendor/kreuzberg-ffi/src/plugins/mod.rs +0 -14
  287. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +0 -628
  288. data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +0 -438
  289. data/vendor/kreuzberg-ffi/src/plugins/validator.rs +0 -329
  290. data/vendor/kreuzberg-ffi/src/result.rs +0 -510
  291. data/vendor/kreuzberg-ffi/src/result_pool.rs +0 -639
  292. data/vendor/kreuzberg-ffi/src/result_view.rs +0 -773
  293. data/vendor/kreuzberg-ffi/src/string_intern.rs +0 -568
  294. data/vendor/kreuzberg-ffi/src/types.rs +0 -363
  295. data/vendor/kreuzberg-ffi/src/util.rs +0 -210
  296. data/vendor/kreuzberg-ffi/src/validation.rs +0 -848
  297. data/vendor/kreuzberg-ffi/tests.disabled/README.md +0 -48
  298. data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +0 -299
  299. data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +0 -346
  300. data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +0 -232
  301. data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +0 -470
  302. data/vendor/kreuzberg-tesseract/.commitlintrc.json +0 -13
  303. data/vendor/kreuzberg-tesseract/.crate-ignore +0 -2
  304. data/vendor/kreuzberg-tesseract/Cargo.lock +0 -2933
  305. data/vendor/kreuzberg-tesseract/Cargo.toml +0 -57
  306. data/vendor/kreuzberg-tesseract/LICENSE +0 -22
  307. data/vendor/kreuzberg-tesseract/README.md +0 -399
  308. data/vendor/kreuzberg-tesseract/build.rs +0 -1127
  309. data/vendor/kreuzberg-tesseract/patches/README.md +0 -71
  310. data/vendor/kreuzberg-tesseract/patches/tesseract.diff +0 -199
  311. data/vendor/kreuzberg-tesseract/src/api.rs +0 -1371
  312. data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +0 -77
  313. data/vendor/kreuzberg-tesseract/src/enums.rs +0 -297
  314. data/vendor/kreuzberg-tesseract/src/error.rs +0 -81
  315. data/vendor/kreuzberg-tesseract/src/lib.rs +0 -145
  316. data/vendor/kreuzberg-tesseract/src/monitor.rs +0 -57
  317. data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +0 -197
  318. data/vendor/kreuzberg-tesseract/src/page_iterator.rs +0 -253
  319. data/vendor/kreuzberg-tesseract/src/result_iterator.rs +0 -286
  320. data/vendor/kreuzberg-tesseract/src/result_renderer.rs +0 -183
  321. data/vendor/kreuzberg-tesseract/tests/integration_test.rs +0 -211
@@ -2,22 +2,17 @@
2
2
  //!
3
3
  //! This module converts pdfium character data to HocrWord format,
4
4
  //! allowing us to reuse the existing table reconstruction logic.
5
- //!
6
- //! Note: Table extraction requires the "ocr" feature and is not available in WASM builds.
7
5
 
8
6
  use super::error::{PdfError, Result};
9
- #[cfg(feature = "ocr")]
10
7
  use crate::ocr::table::HocrWord;
11
8
  use pdfium_render::prelude::*;
12
9
 
13
10
  /// Spacing threshold for word boundary detection (in PDF units).
14
11
  ///
15
12
  /// Characters separated by more than this distance are considered separate words.
16
- #[cfg(feature = "ocr")]
17
13
  const WORD_SPACING_THRESHOLD: f32 = 3.0;
18
14
 
19
15
  /// Minimum word length for table detection (filter out noise).
20
- #[cfg(feature = "ocr")]
21
16
  const MIN_WORD_LENGTH: usize = 1;
22
17
 
23
18
  /// Extract words with positions from PDF page for table detection.
@@ -34,55 +29,37 @@ const MIN_WORD_LENGTH: usize = 1;
34
29
  ///
35
30
  /// Vector of HocrWord objects with text and bounding box information.
36
31
  ///
37
- /// # Note
38
- /// This function requires the "ocr" feature to be enabled. Without it, returns an error.
39
- ///
40
32
  /// # Example
41
33
  ///
42
34
  /// ```rust,no_run
43
- /// # #[cfg(feature = "ocr")]
44
- /// # {
45
35
  /// use kreuzberg::pdf::table::extract_words_from_page;
46
36
  /// use pdfium_render::prelude::*;
47
37
  ///
48
- /// # fn main() -> Result<(), Box<dyn std::error::Error>> {
49
38
  /// let pdfium = Pdfium::default();
50
39
  /// let document = pdfium.load_pdf_from_file("example.pdf", None)?;
51
40
  /// let page = document.pages().get(0)?;
52
41
  /// let words = extract_words_from_page(&page, 90.0)?;
53
- /// # Ok(())
54
- /// # }
55
- /// # }
56
42
  /// ```
57
- #[cfg(feature = "ocr")]
58
43
  pub fn extract_words_from_page(page: &PdfPage, min_confidence: f64) -> Result<Vec<HocrWord>> {
44
+ // Get page dimensions for coordinate system
59
45
  let page_width = page.width().value as i32;
60
46
  let page_height = page.height().value as i32;
61
47
 
48
+ // Get all text from page
62
49
  let page_text = page
63
50
  .text()
64
51
  .map_err(|e| PdfError::TextExtractionFailed(format!("Failed to get page text: {}", e)))?;
65
52
 
53
+ // Extract character-level information
66
54
  let chars = page_text.chars();
67
55
 
56
+ // Group characters into words based on spacing
68
57
  let words = group_chars_into_words(chars, page_width, page_height, min_confidence)?;
69
58
 
70
59
  Ok(words)
71
60
  }
72
61
 
73
- /// Fallback implementation when OCR feature is disabled.
74
- ///
75
- /// # Errors
76
- /// Always returns an error indicating that the OCR feature is required.
77
- #[cfg(not(feature = "ocr"))]
78
- pub fn extract_words_from_page(_page: &PdfPage, _min_confidence: f64) -> Result<Vec<()>> {
79
- Err(PdfError::TextExtractionFailed(
80
- "PDF table extraction requires the 'ocr' feature to be enabled".to_string(),
81
- ))
82
- }
83
-
84
62
  /// Character with position information extracted from PDF.
85
- #[cfg(feature = "ocr")]
86
63
  #[derive(Debug, Clone)]
87
64
  struct CharInfo {
88
65
  text: char,
@@ -104,7 +81,6 @@ struct CharInfo {
104
81
  /// * `page_width` - Page width in PDF units
105
82
  /// * `page_height` - Page height in PDF units
106
83
  /// * `min_confidence` - Minimum confidence threshold (PDF text uses 95.0)
107
- #[cfg(feature = "ocr")]
108
84
  fn group_chars_into_words(
109
85
  chars: PdfPageTextChars,
110
86
  _page_width: i32,
@@ -115,22 +91,26 @@ fn group_chars_into_words(
115
91
  let mut current_word_chars: Vec<CharInfo> = Vec::new();
116
92
 
117
93
  for pdf_char in chars.iter() {
94
+ // Get character bounds (use loose_bounds for table detection)
118
95
  let bounds = pdf_char
119
96
  .loose_bounds()
120
97
  .map_err(|e| PdfError::TextExtractionFailed(format!("Failed to get char bounds: {}", e)))?;
121
98
 
99
+ // Get unicode character (skip if invalid)
122
100
  let Some(ch) = pdf_char.unicode_char() else {
123
101
  continue;
124
102
  };
125
103
 
104
+ // Extract character information
126
105
  let char_info = CharInfo {
127
106
  text: ch,
128
107
  x: bounds.left().value,
129
- y: bounds.bottom().value,
108
+ y: bounds.bottom().value, // PDF coordinates: bottom-left origin
130
109
  width: bounds.width().value,
131
110
  height: bounds.height().value,
132
111
  };
133
112
 
113
+ // Skip whitespace characters (they're used for word boundaries)
134
114
  if char_info.text.is_whitespace() {
135
115
  if !current_word_chars.is_empty() {
136
116
  if let Some(word) = finalize_word(&current_word_chars, page_height, min_confidence) {
@@ -141,6 +121,7 @@ fn group_chars_into_words(
141
121
  continue;
142
122
  }
143
123
 
124
+ // Check if this character should start a new word
144
125
  if should_start_new_word(&current_word_chars, &char_info) && !current_word_chars.is_empty() {
145
126
  if let Some(word) = finalize_word(&current_word_chars, page_height, min_confidence) {
146
127
  words.push(word);
@@ -151,10 +132,11 @@ fn group_chars_into_words(
151
132
  current_word_chars.push(char_info);
152
133
  }
153
134
 
154
- if !current_word_chars.is_empty()
155
- && let Some(word) = finalize_word(&current_word_chars, page_height, min_confidence)
156
- {
157
- words.push(word);
135
+ // Finalize last word
136
+ if !current_word_chars.is_empty() {
137
+ if let Some(word) = finalize_word(&current_word_chars, page_height, min_confidence) {
138
+ words.push(word);
139
+ }
158
140
  }
159
141
 
160
142
  Ok(words)
@@ -164,7 +146,6 @@ fn group_chars_into_words(
164
146
  ///
165
147
  /// Returns true if the character is far from the previous character
166
148
  /// (indicating a word boundary) or on a different line.
167
- #[cfg(feature = "ocr")]
168
149
  fn should_start_new_word(current_word_chars: &[CharInfo], new_char: &CharInfo) -> bool {
169
150
  if current_word_chars.is_empty() {
170
151
  return false;
@@ -172,11 +153,13 @@ fn should_start_new_word(current_word_chars: &[CharInfo], new_char: &CharInfo) -
172
153
 
173
154
  let last_char = &current_word_chars[current_word_chars.len() - 1];
174
155
 
156
+ // Check vertical distance (different lines)
175
157
  let vertical_distance = (new_char.y - last_char.y).abs();
176
158
  if vertical_distance > last_char.height * 0.5 {
177
159
  return true;
178
160
  }
179
161
 
162
+ // Check horizontal distance (word spacing)
180
163
  let horizontal_gap = new_char.x - (last_char.x + last_char.width);
181
164
  horizontal_gap > WORD_SPACING_THRESHOLD
182
165
  }
@@ -185,43 +168,51 @@ fn should_start_new_word(current_word_chars: &[CharInfo], new_char: &CharInfo) -
185
168
  ///
186
169
  /// Calculates bounding box and confidence for the word.
187
170
  /// Returns None if the word doesn't meet minimum criteria.
188
- #[cfg(feature = "ocr")]
189
171
  fn finalize_word(chars: &[CharInfo], page_height: i32, min_confidence: f64) -> Option<HocrWord> {
190
172
  if chars.is_empty() {
191
173
  return None;
192
174
  }
193
175
 
176
+ // Build word text
194
177
  let text: String = chars.iter().map(|c| c.text).collect();
195
178
 
196
179
  if text.len() < MIN_WORD_LENGTH {
197
180
  return None;
198
181
  }
199
182
 
200
- let (left, right, bottom, top) = chars.iter().fold(
201
- (f32::INFINITY, f32::NEG_INFINITY, f32::INFINITY, f32::NEG_INFINITY),
202
- |(left, right, bottom, top), c| {
203
- (
204
- left.min(c.x),
205
- right.max(c.x + c.width),
206
- bottom.min(c.y),
207
- top.max(c.y + c.height),
208
- )
209
- },
210
- );
211
-
212
- let (left, right, bottom, top) = if left.is_infinite() {
213
- (0.0, 0.0, 0.0, 0.0)
214
- } else {
215
- (left, right, bottom, top)
216
- };
183
+ // Calculate bounding box (encompassing all characters)
184
+ let left = chars
185
+ .iter()
186
+ .map(|c| c.x)
187
+ .min_by(|a, b| a.partial_cmp(b).unwrap())
188
+ .unwrap_or(0.0);
189
+ let right = chars
190
+ .iter()
191
+ .map(|c| c.x + c.width)
192
+ .max_by(|a, b| a.partial_cmp(b).unwrap())
193
+ .unwrap_or(0.0);
194
+ let bottom = chars
195
+ .iter()
196
+ .map(|c| c.y)
197
+ .min_by(|a, b| a.partial_cmp(b).unwrap())
198
+ .unwrap_or(0.0);
199
+ let top = chars
200
+ .iter()
201
+ .map(|c| c.y + c.height)
202
+ .max_by(|a, b| a.partial_cmp(b).unwrap())
203
+ .unwrap_or(0.0);
217
204
 
218
205
  let width = (right - left).round() as i32;
219
206
  let height = (top - bottom).round() as i32;
220
207
 
208
+ // Convert PDF coordinates (bottom-left origin) to image coordinates (top-left origin)
209
+ // HocrWord expects top-left origin like images/OCR output
221
210
  let top_in_image_coords = (page_height as f32 - top).round() as i32;
222
211
 
212
+ // PDF text has high confidence (no OCR uncertainty)
223
213
  let confidence = 95.0;
224
214
 
215
+ // Apply confidence threshold
225
216
  if confidence < min_confidence {
226
217
  return None;
227
218
  }
@@ -236,7 +227,7 @@ fn finalize_word(chars: &[CharInfo], page_height: i32, min_confidence: f64) -> O
236
227
  })
237
228
  }
238
229
 
239
- #[cfg(all(test, feature = "ocr"))]
230
+ #[cfg(test)]
240
231
  mod tests {
241
232
  use super::*;
242
233
 
@@ -279,18 +270,20 @@ mod tests {
279
270
  height: 12.0,
280
271
  }];
281
272
 
273
+ // Close character - same word
282
274
  let close_char = CharInfo {
283
275
  text: 'B',
284
- x: 111.0,
276
+ x: 111.0, // 1 unit gap
285
277
  y: 50.0,
286
278
  width: 10.0,
287
279
  height: 12.0,
288
280
  };
289
281
  assert!(!should_start_new_word(&chars, &close_char));
290
282
 
283
+ // Far character - new word
291
284
  let far_char = CharInfo {
292
285
  text: 'C',
293
- x: 120.0,
286
+ x: 120.0, // 10 unit gap (> WORD_SPACING_THRESHOLD)
294
287
  y: 50.0,
295
288
  width: 10.0,
296
289
  height: 12.0,
@@ -308,10 +301,11 @@ mod tests {
308
301
  height: 12.0,
309
302
  }];
310
303
 
304
+ // Character on different line
311
305
  let new_line_char = CharInfo {
312
306
  text: 'B',
313
307
  x: 100.0,
314
- y: 70.0,
308
+ y: 70.0, // Different y
315
309
  width: 10.0,
316
310
  height: 12.0,
317
311
  };
@@ -342,7 +336,7 @@ mod tests {
342
336
 
343
337
  assert_eq!(word.text, "Hi");
344
338
  assert_eq!(word.left, 100);
345
- assert_eq!(word.width, 18);
339
+ assert_eq!(word.width, 18); // 110 + 8 - 100
346
340
  assert_eq!(word.height, 12);
347
341
  assert_eq!(word.confidence, 95.0);
348
342
  }
@@ -364,19 +358,22 @@ mod tests {
364
358
  height: 12.0,
365
359
  }];
366
360
 
361
+ // Low threshold - should pass
367
362
  let word = finalize_word(&chars, 800, 90.0);
368
363
  assert!(word.is_some());
369
364
 
365
+ // High threshold - should fail
370
366
  let word = finalize_word(&chars, 800, 96.0);
371
367
  assert!(word.is_none());
372
368
  }
373
369
 
374
370
  #[test]
375
371
  fn test_coordinate_conversion() {
372
+ // Test PDF coordinate (bottom-left origin) to image coordinate (top-left origin)
376
373
  let chars = vec![CharInfo {
377
374
  text: 'A',
378
375
  x: 100.0,
379
- y: 700.0,
376
+ y: 700.0, // PDF coordinates: bottom-left origin
380
377
  width: 10.0,
381
378
  height: 12.0,
382
379
  }];
@@ -384,11 +381,13 @@ mod tests {
384
381
  let page_height = 800;
385
382
  let word = finalize_word(&chars, page_height, 0.0).unwrap();
386
383
 
384
+ // top_in_image_coords = page_height - (y + height) = 800 - (700 + 12) = 88
387
385
  assert_eq!(word.top, 88);
388
386
  }
389
387
 
390
388
  #[test]
391
389
  fn test_word_bounding_box() {
390
+ // Test that bounding box encompasses all characters
392
391
  let chars = vec![
393
392
  CharInfo {
394
393
  text: 'A',
@@ -400,18 +399,22 @@ mod tests {
400
399
  CharInfo {
401
400
  text: 'B',
402
401
  x: 110.0,
403
- y: 51.0,
402
+ y: 51.0, // Slightly different y
404
403
  width: 10.0,
405
- height: 13.0,
404
+ height: 13.0, // Slightly different height
406
405
  },
407
406
  ];
408
407
 
409
408
  let word = finalize_word(&chars, 800, 0.0).unwrap();
410
409
 
410
+ // Left should be minimum x
411
411
  assert_eq!(word.left, 100);
412
412
 
413
- assert_eq!(word.width, 20);
413
+ // Width should span from leftmost to rightmost character
414
+ assert_eq!(word.width, 20); // 120 - 100
414
415
 
416
+ // Height should encompass both characters
417
+ // max(y+height) - min(y) = max(51+13, 50+12) - 50 = 64 - 50 = 14
415
418
  assert_eq!(word.height, 14);
416
419
  }
417
420
  }