kreuzberg 4.0.0.pre.rc.29 → 4.0.0.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (321) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +0 -6
  3. data/.rubocop.yaml +534 -1
  4. data/Gemfile +2 -1
  5. data/Gemfile.lock +28 -116
  6. data/README.md +269 -629
  7. data/Rakefile +0 -9
  8. data/Steepfile +4 -8
  9. data/examples/async_patterns.rb +58 -1
  10. data/ext/kreuzberg_rb/extconf.rb +5 -35
  11. data/ext/kreuzberg_rb/native/Cargo.toml +16 -55
  12. data/ext/kreuzberg_rb/native/build.rs +14 -12
  13. data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
  14. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
  15. data/ext/kreuzberg_rb/native/include/strings.h +2 -2
  16. data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
  17. data/ext/kreuzberg_rb/native/src/lib.rs +34 -897
  18. data/extconf.rb +6 -38
  19. data/kreuzberg.gemspec +20 -114
  20. data/lib/kreuzberg/api_proxy.rb +18 -2
  21. data/lib/kreuzberg/cache_api.rb +0 -22
  22. data/lib/kreuzberg/cli.rb +10 -2
  23. data/lib/kreuzberg/cli_proxy.rb +10 -0
  24. data/lib/kreuzberg/config.rb +22 -274
  25. data/lib/kreuzberg/errors.rb +7 -73
  26. data/lib/kreuzberg/extraction_api.rb +8 -237
  27. data/lib/kreuzberg/mcp_proxy.rb +11 -2
  28. data/lib/kreuzberg/ocr_backend_protocol.rb +73 -0
  29. data/lib/kreuzberg/post_processor_protocol.rb +71 -0
  30. data/lib/kreuzberg/result.rb +33 -151
  31. data/lib/kreuzberg/setup_lib_path.rb +2 -22
  32. data/lib/kreuzberg/validator_protocol.rb +73 -0
  33. data/lib/kreuzberg/version.rb +1 -1
  34. data/lib/kreuzberg.rb +13 -27
  35. data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
  36. data/sig/kreuzberg.rbs +12 -105
  37. data/spec/binding/cache_spec.rb +22 -22
  38. data/spec/binding/cli_proxy_spec.rb +4 -2
  39. data/spec/binding/cli_spec.rb +11 -12
  40. data/spec/binding/config_spec.rb +0 -74
  41. data/spec/binding/config_validation_spec.rb +6 -100
  42. data/spec/binding/error_handling_spec.rb +97 -283
  43. data/spec/binding/plugins/ocr_backend_spec.rb +8 -8
  44. data/spec/binding/plugins/postprocessor_spec.rb +11 -11
  45. data/spec/binding/plugins/validator_spec.rb +13 -12
  46. data/spec/examples.txt +104 -0
  47. data/spec/fixtures/config.toml +1 -0
  48. data/spec/fixtures/config.yaml +1 -0
  49. data/spec/fixtures/invalid_config.toml +1 -0
  50. data/spec/smoke/package_spec.rb +3 -2
  51. data/spec/spec_helper.rb +3 -1
  52. data/vendor/kreuzberg/Cargo.toml +67 -192
  53. data/vendor/kreuzberg/README.md +9 -97
  54. data/vendor/kreuzberg/build.rs +194 -516
  55. data/vendor/kreuzberg/src/api/handlers.rs +9 -130
  56. data/vendor/kreuzberg/src/api/mod.rs +3 -18
  57. data/vendor/kreuzberg/src/api/server.rs +71 -236
  58. data/vendor/kreuzberg/src/api/types.rs +7 -43
  59. data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
  60. data/vendor/kreuzberg/src/cache/mod.rs +3 -27
  61. data/vendor/kreuzberg/src/chunking/mod.rs +79 -1705
  62. data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
  63. data/vendor/kreuzberg/src/core/config.rs +23 -905
  64. data/vendor/kreuzberg/src/core/extractor.rs +106 -403
  65. data/vendor/kreuzberg/src/core/io.rs +2 -4
  66. data/vendor/kreuzberg/src/core/mime.rs +12 -2
  67. data/vendor/kreuzberg/src/core/mod.rs +3 -22
  68. data/vendor/kreuzberg/src/core/pipeline.rs +78 -395
  69. data/vendor/kreuzberg/src/embeddings.rs +21 -169
  70. data/vendor/kreuzberg/src/error.rs +2 -2
  71. data/vendor/kreuzberg/src/extraction/archive.rs +31 -36
  72. data/vendor/kreuzberg/src/extraction/docx.rs +1 -365
  73. data/vendor/kreuzberg/src/extraction/email.rs +11 -12
  74. data/vendor/kreuzberg/src/extraction/excel.rs +129 -138
  75. data/vendor/kreuzberg/src/extraction/html.rs +170 -1447
  76. data/vendor/kreuzberg/src/extraction/image.rs +14 -138
  77. data/vendor/kreuzberg/src/extraction/libreoffice.rs +3 -13
  78. data/vendor/kreuzberg/src/extraction/mod.rs +5 -21
  79. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
  80. data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
  81. data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
  82. data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
  83. data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
  84. data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
  85. data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
  86. data/vendor/kreuzberg/src/extraction/pptx.rs +94 -196
  87. data/vendor/kreuzberg/src/extraction/structured.rs +4 -5
  88. data/vendor/kreuzberg/src/extraction/table.rs +1 -2
  89. data/vendor/kreuzberg/src/extraction/text.rs +10 -18
  90. data/vendor/kreuzberg/src/extractors/archive.rs +0 -22
  91. data/vendor/kreuzberg/src/extractors/docx.rs +148 -69
  92. data/vendor/kreuzberg/src/extractors/email.rs +9 -37
  93. data/vendor/kreuzberg/src/extractors/excel.rs +40 -81
  94. data/vendor/kreuzberg/src/extractors/html.rs +173 -182
  95. data/vendor/kreuzberg/src/extractors/image.rs +8 -32
  96. data/vendor/kreuzberg/src/extractors/mod.rs +10 -171
  97. data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
  98. data/vendor/kreuzberg/src/extractors/pdf.rs +64 -329
  99. data/vendor/kreuzberg/src/extractors/pptx.rs +34 -79
  100. data/vendor/kreuzberg/src/extractors/structured.rs +0 -16
  101. data/vendor/kreuzberg/src/extractors/text.rs +7 -30
  102. data/vendor/kreuzberg/src/extractors/xml.rs +8 -27
  103. data/vendor/kreuzberg/src/keywords/processor.rs +1 -9
  104. data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
  105. data/vendor/kreuzberg/src/language_detection/mod.rs +51 -94
  106. data/vendor/kreuzberg/src/lib.rs +5 -17
  107. data/vendor/kreuzberg/src/mcp/mod.rs +1 -4
  108. data/vendor/kreuzberg/src/mcp/server.rs +21 -145
  109. data/vendor/kreuzberg/src/ocr/mod.rs +0 -2
  110. data/vendor/kreuzberg/src/ocr/processor.rs +8 -19
  111. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +0 -2
  112. data/vendor/kreuzberg/src/pdf/error.rs +1 -93
  113. data/vendor/kreuzberg/src/pdf/metadata.rs +100 -263
  114. data/vendor/kreuzberg/src/pdf/mod.rs +2 -33
  115. data/vendor/kreuzberg/src/pdf/rendering.rs +12 -12
  116. data/vendor/kreuzberg/src/pdf/table.rs +64 -61
  117. data/vendor/kreuzberg/src/pdf/text.rs +24 -416
  118. data/vendor/kreuzberg/src/plugins/extractor.rs +8 -40
  119. data/vendor/kreuzberg/src/plugins/mod.rs +0 -3
  120. data/vendor/kreuzberg/src/plugins/ocr.rs +14 -22
  121. data/vendor/kreuzberg/src/plugins/processor.rs +1 -10
  122. data/vendor/kreuzberg/src/plugins/registry.rs +0 -15
  123. data/vendor/kreuzberg/src/plugins/validator.rs +8 -20
  124. data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
  125. data/vendor/kreuzberg/src/text/mod.rs +0 -8
  126. data/vendor/kreuzberg/src/text/quality.rs +15 -28
  127. data/vendor/kreuzberg/src/text/string_utils.rs +10 -22
  128. data/vendor/kreuzberg/src/text/token_reduction/core.rs +50 -86
  129. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +16 -37
  130. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +1 -2
  131. data/vendor/kreuzberg/src/types.rs +67 -907
  132. data/vendor/kreuzberg/src/utils/mod.rs +0 -14
  133. data/vendor/kreuzberg/src/utils/quality.rs +3 -12
  134. data/vendor/kreuzberg/tests/api_tests.rs +0 -506
  135. data/vendor/kreuzberg/tests/archive_integration.rs +0 -2
  136. data/vendor/kreuzberg/tests/batch_orchestration.rs +12 -57
  137. data/vendor/kreuzberg/tests/batch_processing.rs +8 -32
  138. data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
  139. data/vendor/kreuzberg/tests/concurrency_stress.rs +8 -40
  140. data/vendor/kreuzberg/tests/config_features.rs +1 -33
  141. data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -16
  142. data/vendor/kreuzberg/tests/core_integration.rs +9 -35
  143. data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
  144. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
  145. data/vendor/kreuzberg/tests/email_integration.rs +1 -3
  146. data/vendor/kreuzberg/tests/error_handling.rs +34 -43
  147. data/vendor/kreuzberg/tests/format_integration.rs +1 -7
  148. data/vendor/kreuzberg/tests/helpers/mod.rs +0 -60
  149. data/vendor/kreuzberg/tests/image_integration.rs +0 -2
  150. data/vendor/kreuzberg/tests/mime_detection.rs +16 -17
  151. data/vendor/kreuzberg/tests/ocr_configuration.rs +0 -4
  152. data/vendor/kreuzberg/tests/ocr_errors.rs +0 -22
  153. data/vendor/kreuzberg/tests/ocr_quality.rs +0 -2
  154. data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
  155. data/vendor/kreuzberg/tests/pdf_integration.rs +0 -2
  156. data/vendor/kreuzberg/tests/pipeline_integration.rs +2 -36
  157. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +0 -5
  158. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -17
  159. data/vendor/kreuzberg/tests/plugin_system.rs +0 -6
  160. data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -2
  161. data/vendor/kreuzberg/tests/security_validation.rs +1 -13
  162. data/vendor/kreuzberg/tests/test_fastembed.rs +23 -45
  163. metadata +25 -171
  164. data/.rubocop.yml +0 -543
  165. data/ext/kreuzberg_rb/native/.cargo/config.toml +0 -23
  166. data/ext/kreuzberg_rb/native/Cargo.lock +0 -7619
  167. data/lib/kreuzberg/error_context.rb +0 -136
  168. data/lib/kreuzberg/types.rb +0 -170
  169. data/lib/libpdfium.so +0 -0
  170. data/spec/binding/async_operations_spec.rb +0 -473
  171. data/spec/binding/batch_operations_spec.rb +0 -595
  172. data/spec/binding/batch_spec.rb +0 -359
  173. data/spec/binding/config_result_spec.rb +0 -377
  174. data/spec/binding/embeddings_spec.rb +0 -816
  175. data/spec/binding/error_recovery_spec.rb +0 -488
  176. data/spec/binding/font_config_spec.rb +0 -220
  177. data/spec/binding/images_spec.rb +0 -738
  178. data/spec/binding/keywords_extraction_spec.rb +0 -600
  179. data/spec/binding/metadata_types_spec.rb +0 -1228
  180. data/spec/binding/pages_extraction_spec.rb +0 -471
  181. data/spec/binding/tables_spec.rb +0 -641
  182. data/spec/unit/config/chunking_config_spec.rb +0 -213
  183. data/spec/unit/config/embedding_config_spec.rb +0 -343
  184. data/spec/unit/config/extraction_config_spec.rb +0 -438
  185. data/spec/unit/config/font_config_spec.rb +0 -285
  186. data/spec/unit/config/hierarchy_config_spec.rb +0 -314
  187. data/spec/unit/config/image_extraction_config_spec.rb +0 -209
  188. data/spec/unit/config/image_preprocessing_config_spec.rb +0 -249
  189. data/spec/unit/config/keyword_config_spec.rb +0 -229
  190. data/spec/unit/config/language_detection_config_spec.rb +0 -258
  191. data/spec/unit/config/ocr_config_spec.rb +0 -171
  192. data/spec/unit/config/page_config_spec.rb +0 -221
  193. data/spec/unit/config/pdf_config_spec.rb +0 -267
  194. data/spec/unit/config/postprocessor_config_spec.rb +0 -290
  195. data/spec/unit/config/tesseract_config_spec.rb +0 -181
  196. data/spec/unit/config/token_reduction_config_spec.rb +0 -251
  197. data/test/metadata_types_test.rb +0 -959
  198. data/vendor/Cargo.toml +0 -61
  199. data/vendor/kreuzberg/examples/bench_fixes.rs +0 -71
  200. data/vendor/kreuzberg/examples/test_pdfium_fork.rs +0 -62
  201. data/vendor/kreuzberg/src/chunking/processor.rs +0 -219
  202. data/vendor/kreuzberg/src/core/batch_optimizations.rs +0 -385
  203. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  204. data/vendor/kreuzberg/src/core/formats.rs +0 -235
  205. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  206. data/vendor/kreuzberg/src/extraction/capacity.rs +0 -263
  207. data/vendor/kreuzberg/src/extraction/markdown.rs +0 -216
  208. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -284
  209. data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -470
  210. data/vendor/kreuzberg/src/extractors/docbook.rs +0 -504
  211. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  212. data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -492
  213. data/vendor/kreuzberg/src/extractors/jats.rs +0 -1054
  214. data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -368
  215. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  216. data/vendor/kreuzberg/src/extractors/markdown.rs +0 -701
  217. data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
  218. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  219. data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -529
  220. data/vendor/kreuzberg/src/extractors/rst.rs +0 -577
  221. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  222. data/vendor/kreuzberg/src/extractors/security.rs +0 -484
  223. data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
  224. data/vendor/kreuzberg/src/extractors/typst.rs +0 -651
  225. data/vendor/kreuzberg/src/language_detection/processor.rs +0 -218
  226. data/vendor/kreuzberg/src/ocr/language_registry.rs +0 -520
  227. data/vendor/kreuzberg/src/panic_context.rs +0 -154
  228. data/vendor/kreuzberg/src/pdf/bindings.rs +0 -306
  229. data/vendor/kreuzberg/src/pdf/bundled.rs +0 -408
  230. data/vendor/kreuzberg/src/pdf/fonts.rs +0 -358
  231. data/vendor/kreuzberg/src/pdf/hierarchy.rs +0 -903
  232. data/vendor/kreuzberg/src/text/quality_processor.rs +0 -231
  233. data/vendor/kreuzberg/src/text/utf8_validation.rs +0 -193
  234. data/vendor/kreuzberg/src/utils/pool.rs +0 -503
  235. data/vendor/kreuzberg/src/utils/pool_sizing.rs +0 -364
  236. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -761
  237. data/vendor/kreuzberg/tests/api_embed.rs +0 -360
  238. data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
  239. data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +0 -471
  240. data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +0 -289
  241. data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +0 -154
  242. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
  243. data/vendor/kreuzberg/tests/config_integration_test.rs +0 -753
  244. data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +0 -294
  245. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -500
  246. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
  247. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
  248. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
  249. data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
  250. data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
  251. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
  252. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
  253. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
  254. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
  255. data/vendor/kreuzberg/tests/ocr_language_registry.rs +0 -191
  256. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -674
  257. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
  258. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
  259. data/vendor/kreuzberg/tests/page_markers.rs +0 -297
  260. data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +0 -301
  261. data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +0 -589
  262. data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +0 -301
  263. data/vendor/kreuzberg/tests/pdf_text_merging.rs +0 -475
  264. data/vendor/kreuzberg/tests/pdfium_linking.rs +0 -340
  265. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -694
  266. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -775
  267. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1260
  268. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -648
  269. data/vendor/kreuzberg-ffi/Cargo.toml +0 -67
  270. data/vendor/kreuzberg-ffi/README.md +0 -851
  271. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +0 -227
  272. data/vendor/kreuzberg-ffi/build.rs +0 -168
  273. data/vendor/kreuzberg-ffi/cbindgen.toml +0 -37
  274. data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +0 -12
  275. data/vendor/kreuzberg-ffi/kreuzberg.h +0 -3012
  276. data/vendor/kreuzberg-ffi/src/batch_streaming.rs +0 -588
  277. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
  278. data/vendor/kreuzberg-ffi/src/error.rs +0 -901
  279. data/vendor/kreuzberg-ffi/src/extraction.rs +0 -555
  280. data/vendor/kreuzberg-ffi/src/helpers.rs +0 -879
  281. data/vendor/kreuzberg-ffi/src/lib.rs +0 -977
  282. data/vendor/kreuzberg-ffi/src/memory.rs +0 -493
  283. data/vendor/kreuzberg-ffi/src/mime.rs +0 -329
  284. data/vendor/kreuzberg-ffi/src/panic_shield.rs +0 -265
  285. data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +0 -442
  286. data/vendor/kreuzberg-ffi/src/plugins/mod.rs +0 -14
  287. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +0 -628
  288. data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +0 -438
  289. data/vendor/kreuzberg-ffi/src/plugins/validator.rs +0 -329
  290. data/vendor/kreuzberg-ffi/src/result.rs +0 -510
  291. data/vendor/kreuzberg-ffi/src/result_pool.rs +0 -639
  292. data/vendor/kreuzberg-ffi/src/result_view.rs +0 -773
  293. data/vendor/kreuzberg-ffi/src/string_intern.rs +0 -568
  294. data/vendor/kreuzberg-ffi/src/types.rs +0 -363
  295. data/vendor/kreuzberg-ffi/src/util.rs +0 -210
  296. data/vendor/kreuzberg-ffi/src/validation.rs +0 -848
  297. data/vendor/kreuzberg-ffi/tests.disabled/README.md +0 -48
  298. data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +0 -299
  299. data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +0 -346
  300. data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +0 -232
  301. data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +0 -470
  302. data/vendor/kreuzberg-tesseract/.commitlintrc.json +0 -13
  303. data/vendor/kreuzberg-tesseract/.crate-ignore +0 -2
  304. data/vendor/kreuzberg-tesseract/Cargo.lock +0 -2933
  305. data/vendor/kreuzberg-tesseract/Cargo.toml +0 -57
  306. data/vendor/kreuzberg-tesseract/LICENSE +0 -22
  307. data/vendor/kreuzberg-tesseract/README.md +0 -399
  308. data/vendor/kreuzberg-tesseract/build.rs +0 -1127
  309. data/vendor/kreuzberg-tesseract/patches/README.md +0 -71
  310. data/vendor/kreuzberg-tesseract/patches/tesseract.diff +0 -199
  311. data/vendor/kreuzberg-tesseract/src/api.rs +0 -1371
  312. data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +0 -77
  313. data/vendor/kreuzberg-tesseract/src/enums.rs +0 -297
  314. data/vendor/kreuzberg-tesseract/src/error.rs +0 -81
  315. data/vendor/kreuzberg-tesseract/src/lib.rs +0 -145
  316. data/vendor/kreuzberg-tesseract/src/monitor.rs +0 -57
  317. data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +0 -197
  318. data/vendor/kreuzberg-tesseract/src/page_iterator.rs +0 -253
  319. data/vendor/kreuzberg-tesseract/src/result_iterator.rs +0 -286
  320. data/vendor/kreuzberg-tesseract/src/result_renderer.rs +0 -183
  321. data/vendor/kreuzberg-tesseract/tests/integration_test.rs +0 -211
@@ -1,589 +0,0 @@
1
- //! PDF hierarchy quality assessment tests.
2
- //!
3
- //! This module tests PDF text hierarchy extraction quality by comparing against ground truth annotations.
4
- //! Measures precision, recall, F1 score, and level accuracy to ensure the hierarchy detection
5
- //! algorithm works well on real document structures.
6
- //!
7
- //! Test philosophy:
8
- //! - Define ground truth hierarchies for representative PDF documents
9
- //! - Measure how well extracted hierarchies match ground truth
10
- //! - Assert minimum quality thresholds for precision/recall/F1
11
- //! - Verify correct hierarchy level assignments
12
-
13
- #![cfg(feature = "pdf")]
14
-
15
- use kreuzberg::pdf::hierarchy::{
16
- BoundingBox, HierarchyLevel, KMeansResult, TextBlock, assign_hierarchy_levels,
17
- assign_hierarchy_levels_from_clusters, cluster_font_sizes,
18
- };
19
- use serde::{Deserialize, Serialize};
20
- use std::fs;
21
- use std::path::Path;
22
-
23
- /// A bounding box annotation from ground truth.
24
- #[derive(Debug, Clone, Serialize, Deserialize)]
25
- struct GroundTruthBBox {
26
- left: f32,
27
- top: f32,
28
- right: f32,
29
- bottom: f32,
30
- }
31
-
32
- impl GroundTruthBBox {
33
- /// Convert to kreuzberg BoundingBox
34
- fn to_bbox(&self) -> BoundingBox {
35
- BoundingBox {
36
- left: self.left,
37
- top: self.top,
38
- right: self.right,
39
- bottom: self.bottom,
40
- }
41
- }
42
- }
43
-
44
- /// A ground truth text block with hierarchy level annotation.
45
- #[derive(Debug, Clone, Serialize, Deserialize)]
46
- struct GroundTruthBlock {
47
- text: String,
48
- level: String,
49
- bbox: GroundTruthBBox,
50
- }
51
-
52
- /// A page of ground truth annotations.
53
- #[derive(Debug, Clone, Serialize, Deserialize)]
54
- struct GroundTruthPage {
55
- page_number: u32,
56
- blocks: Vec<GroundTruthBlock>,
57
- }
58
-
59
- /// A document with ground truth hierarchy annotations.
60
- #[derive(Debug, Clone, Serialize, Deserialize)]
61
- struct GroundTruthDocument {
62
- pdf_file: String,
63
- pages: Vec<GroundTruthPage>,
64
- }
65
-
66
- /// Root structure for ground truth JSON file.
67
- #[derive(Debug, Clone, Serialize, Deserialize)]
68
- struct GroundTruthFile {
69
- documents: Vec<GroundTruthDocument>,
70
- }
71
-
72
- /// Quality metrics for hierarchy extraction.
73
- #[derive(Debug, Clone, Serialize, Deserialize)]
74
- pub struct QualityMetrics {
75
- /// Precision: (correctly identified hierarchies) / (total extracted hierarchies)
76
- pub precision: f64,
77
- /// Recall: (correctly identified hierarchies) / (total ground truth hierarchies)
78
- pub recall: f64,
79
- /// F1 Score: harmonic mean of precision and recall
80
- pub f1_score: f64,
81
- /// Level accuracy: percentage of blocks assigned to correct hierarchy level
82
- pub level_accuracy: f64,
83
- /// Number of correctly identified hierarchy blocks
84
- pub true_positives: usize,
85
- /// Number of incorrectly extracted hierarchy blocks
86
- pub false_positives: usize,
87
- /// Number of missed ground truth hierarchy blocks
88
- pub false_negatives: usize,
89
- /// Number of blocks with correct hierarchy level
90
- pub correct_levels: usize,
91
- /// Total number of blocks evaluated
92
- pub total_blocks: usize,
93
- }
94
-
95
- impl QualityMetrics {
96
- /// Create new quality metrics from test results.
97
- fn new(
98
- true_positives: usize,
99
- false_positives: usize,
100
- false_negatives: usize,
101
- correct_levels: usize,
102
- total_blocks: usize,
103
- ) -> Self {
104
- let precision = if true_positives + false_positives > 0 {
105
- true_positives as f64 / (true_positives + false_positives) as f64
106
- } else {
107
- 0.0
108
- };
109
-
110
- let recall = if true_positives + false_negatives > 0 {
111
- true_positives as f64 / (true_positives + false_negatives) as f64
112
- } else {
113
- 0.0
114
- };
115
-
116
- let f1_score = if precision + recall > 0.0 {
117
- 2.0 * precision * recall / (precision + recall)
118
- } else {
119
- 0.0
120
- };
121
-
122
- let level_accuracy = if total_blocks > 0 {
123
- correct_levels as f64 / total_blocks as f64
124
- } else {
125
- 0.0
126
- };
127
-
128
- Self {
129
- precision,
130
- recall,
131
- f1_score,
132
- level_accuracy,
133
- true_positives,
134
- false_positives,
135
- false_negatives,
136
- correct_levels,
137
- total_blocks,
138
- }
139
- }
140
- }
141
-
142
- /// Convert hierarchy level string to HierarchyLevel enum.
143
- fn parse_level(level: &str) -> HierarchyLevel {
144
- match level {
145
- "H1" => HierarchyLevel::H1,
146
- "H2" => HierarchyLevel::H2,
147
- "H3" => HierarchyLevel::H3,
148
- "H4" => HierarchyLevel::H4,
149
- "H5" => HierarchyLevel::H5,
150
- "H6" => HierarchyLevel::H6,
151
- _ => HierarchyLevel::Body,
152
- }
153
- }
154
-
155
- /// Load ground truth annotations from JSON file.
156
- ///
157
- /// Reads the hierarchy_ground_truth.json file and parses document annotations.
158
- ///
159
- /// # Arguments
160
- ///
161
- /// * `path` - Path to the ground truth JSON file
162
- ///
163
- /// # Returns
164
- ///
165
- /// Result containing the parsed GroundTruthFile or error message
166
- fn load_ground_truth<P: AsRef<Path>>(path: P) -> Result<GroundTruthFile, String> {
167
- let content = fs::read_to_string(path).map_err(|e| format!("Failed to read file: {}", e))?;
168
- serde_json::from_str(&content).map_err(|e| format!("Failed to parse JSON: {}", e))
169
- }
170
-
171
- /// Calculate quality metrics by comparing extracted hierarchies to ground truth.
172
- ///
173
- /// Compares extracted text blocks with their hierarchy assignments to ground truth annotations.
174
- /// Measures:
175
- /// - Precision: correctly identified hierarchies / total extracted
176
- /// - Recall: correctly identified hierarchies / total ground truth
177
- /// - F1 Score: harmonic mean of precision and recall
178
- /// - Level Accuracy: percentage of blocks with correct hierarchy level
179
- ///
180
- /// # Arguments
181
- ///
182
- /// * `extracted_blocks` - Vector of extracted HierarchyBlock objects
183
- /// * `ground_truth_blocks` - Vector of ground truth blocks
184
- ///
185
- /// # Returns
186
- ///
187
- /// QualityMetrics struct with calculated precision, recall, F1, and level accuracy
188
- fn calculate_quality_metrics(
189
- extracted_blocks: &[kreuzberg::pdf::hierarchy::HierarchyBlock],
190
- ground_truth_blocks: &[GroundTruthBlock],
191
- ) -> QualityMetrics {
192
- let mut true_positives = 0;
193
- let mut false_positives = 0;
194
- let mut correct_levels = 0;
195
-
196
- // For matching blocks, we use bounding box overlap and text similarity
197
- let mut matched_gt_indices: Vec<bool> = vec![false; ground_truth_blocks.len()];
198
-
199
- for extracted in extracted_blocks {
200
- let mut best_match_idx: Option<usize> = None;
201
- let mut best_overlap = 0.0;
202
-
203
- // Find the best matching ground truth block by bounding box overlap
204
- for (gt_idx, gt_block) in ground_truth_blocks.iter().enumerate() {
205
- if matched_gt_indices[gt_idx] {
206
- continue; // Already matched
207
- }
208
-
209
- let gt_bbox = gt_block.bbox.to_bbox();
210
- let overlap = extracted.bbox.iou(&gt_bbox);
211
-
212
- if overlap > best_overlap && overlap > 0.3 {
213
- best_overlap = overlap;
214
- best_match_idx = Some(gt_idx);
215
- }
216
- }
217
-
218
- if let Some(gt_idx) = best_match_idx {
219
- matched_gt_indices[gt_idx] = true;
220
- true_positives += 1;
221
-
222
- // Check if the hierarchy level matches
223
- let gt_level = parse_level(&ground_truth_blocks[gt_idx].level);
224
- if extracted.hierarchy_level == gt_level {
225
- correct_levels += 1;
226
- }
227
- } else {
228
- false_positives += 1;
229
- }
230
- }
231
-
232
- // Count unmatched ground truth blocks as false negatives
233
- let false_negatives = matched_gt_indices.iter().filter(|&&m| !m).count();
234
-
235
- let total_blocks = extracted_blocks.len().max(ground_truth_blocks.len());
236
-
237
- QualityMetrics::new(
238
- true_positives,
239
- false_positives,
240
- false_negatives,
241
- correct_levels,
242
- total_blocks,
243
- )
244
- }
245
-
246
- /// Create test text blocks from ground truth.
247
- fn create_text_blocks_from_ground_truth(gt_blocks: &[GroundTruthBlock]) -> Vec<TextBlock> {
248
- gt_blocks
249
- .iter()
250
- .enumerate()
251
- .map(|(idx, gt_block)| {
252
- // Estimate font size from bbox height
253
- let bbox = gt_block.bbox.to_bbox();
254
- let font_size = match gt_block.level.as_str() {
255
- "H1" => 28.0,
256
- "H2" => 24.0,
257
- "H3" => 20.0,
258
- "H4" => 16.0,
259
- "H5" => 14.0,
260
- "H6" => 12.0,
261
- _ => 10.0, // Body
262
- };
263
-
264
- TextBlock {
265
- text: if gt_block.text.len() > 50 {
266
- format!("{} (Block {})", gt_block.text.chars().take(50).collect::<String>(), idx)
267
- } else {
268
- gt_block.text.clone()
269
- },
270
- bbox,
271
- font_size,
272
- }
273
- })
274
- .collect()
275
- }
276
-
277
- #[test]
278
- fn test_hierarchy_quality_on_ground_truth() {
279
- // Load ground truth data
280
- let ground_truth_path = "tests/data/hierarchy_ground_truth.json";
281
- let ground_truth_file = load_ground_truth(ground_truth_path).expect("Failed to load ground truth file");
282
-
283
- println!(
284
- "\nLoaded {} documents from ground truth",
285
- ground_truth_file.documents.len()
286
- );
287
-
288
- let mut all_metrics: Vec<QualityMetrics> = Vec::new();
289
-
290
- // Process each document
291
- for doc in &ground_truth_file.documents {
292
- println!("\nProcessing document: {}", doc.pdf_file);
293
-
294
- for page in &doc.pages {
295
- println!(" Page {}: {} blocks", page.page_number, page.blocks.len());
296
-
297
- // Create text blocks from ground truth
298
- let text_blocks = create_text_blocks_from_ground_truth(&page.blocks);
299
-
300
- // Cluster by font size
301
- let k = (text_blocks.len() / 3).clamp(1, 6); // Estimate k clusters
302
- let clusters = cluster_font_sizes(&text_blocks, k).expect("Failed to cluster font sizes");
303
-
304
- println!(
305
- " Created {} clusters from {} blocks",
306
- clusters.len(),
307
- text_blocks.len()
308
- );
309
-
310
- // Assign hierarchy levels from clusters
311
- let hierarchy_assignments = assign_hierarchy_levels_from_clusters(&text_blocks, &clusters);
312
-
313
- // Convert to HierarchyBlock format
314
- let extracted_blocks: Vec<kreuzberg::pdf::hierarchy::HierarchyBlock> = hierarchy_assignments
315
- .iter()
316
- .map(|(block, level)| kreuzberg::pdf::hierarchy::HierarchyBlock {
317
- text: block.text.clone(),
318
- bbox: block.bbox,
319
- font_size: block.font_size,
320
- hierarchy_level: *level,
321
- })
322
- .collect();
323
-
324
- // Calculate quality metrics
325
- let metrics = calculate_quality_metrics(&extracted_blocks, &page.blocks);
326
- all_metrics.push(metrics.clone());
327
-
328
- println!(" Precision: {:.4}", metrics.precision);
329
- println!(" Recall: {:.4}", metrics.recall);
330
- println!(" F1 Score: {:.4}", metrics.f1_score);
331
- println!(" Level Accuracy: {:.4}", metrics.level_accuracy);
332
- }
333
- }
334
-
335
- // Calculate average metrics
336
- if !all_metrics.is_empty() {
337
- let avg_precision = all_metrics.iter().map(|m| m.precision).sum::<f64>() / all_metrics.len() as f64;
338
- let avg_recall = all_metrics.iter().map(|m| m.recall).sum::<f64>() / all_metrics.len() as f64;
339
- let avg_f1 = all_metrics.iter().map(|m| m.f1_score).sum::<f64>() / all_metrics.len() as f64;
340
- let avg_level_acc = all_metrics.iter().map(|m| m.level_accuracy).sum::<f64>() / all_metrics.len() as f64;
341
-
342
- println!("\n=== AVERAGE METRICS ACROSS ALL PAGES ===");
343
- println!("Average Precision: {:.4}", avg_precision);
344
- println!("Average Recall: {:.4}", avg_recall);
345
- println!("Average F1 Score: {:.4}", avg_f1);
346
- println!("Average Level Accuracy: {:.4}", avg_level_acc);
347
-
348
- // Assert minimum F1 threshold
349
- assert!(
350
- avg_f1 > 0.85,
351
- "F1 score ({:.4}) must be greater than 0.85. Metrics: precision={:.4}, recall={:.4}, level_accuracy={:.4}",
352
- avg_f1,
353
- avg_precision,
354
- avg_recall,
355
- avg_level_acc
356
- );
357
- }
358
- }
359
-
360
- #[test]
361
- fn test_hierarchy_clustering_consistency() {
362
- // Arrange: Create a simple document with clear hierarchy
363
- let blocks = vec![
364
- TextBlock {
365
- text: "Title".to_string(),
366
- bbox: BoundingBox {
367
- left: 0.0,
368
- top: 0.0,
369
- right: 100.0,
370
- bottom: 28.0,
371
- },
372
- font_size: 28.0,
373
- },
374
- TextBlock {
375
- text: "Subtitle".to_string(),
376
- bbox: BoundingBox {
377
- left: 0.0,
378
- top: 30.0,
379
- right: 100.0,
380
- bottom: 54.0,
381
- },
382
- font_size: 24.0,
383
- },
384
- TextBlock {
385
- text: "Section".to_string(),
386
- bbox: BoundingBox {
387
- left: 0.0,
388
- top: 60.0,
389
- right: 100.0,
390
- bottom: 80.0,
391
- },
392
- font_size: 20.0,
393
- },
394
- TextBlock {
395
- text: "Body paragraph".to_string(),
396
- bbox: BoundingBox {
397
- left: 0.0,
398
- top: 90.0,
399
- right: 100.0,
400
- bottom: 102.0,
401
- },
402
- font_size: 10.0,
403
- },
404
- ];
405
-
406
- // Act: Cluster and assign hierarchies
407
- let clusters = cluster_font_sizes(&blocks, 4).expect("Clustering failed");
408
- let assignments = assign_hierarchy_levels_from_clusters(&blocks, &clusters);
409
-
410
- // Assert: Verify hierarchy levels are correct
411
- assert_eq!(assignments.len(), 4);
412
- assert_eq!(assignments[0].1, HierarchyLevel::H1, "Largest text should be H1");
413
- assert_eq!(assignments[1].1, HierarchyLevel::H2, "Second largest should be H2");
414
- assert_eq!(assignments[2].1, HierarchyLevel::H3, "Third largest should be H3");
415
- assert_eq!(assignments[3].1, HierarchyLevel::Body, "Smallest text should be Body");
416
-
417
- // Assert: F1 score should be perfect for this simple case
418
- let quality_metrics = calculate_quality_metrics(
419
- &assignments
420
- .iter()
421
- .map(|(b, l)| kreuzberg::pdf::hierarchy::HierarchyBlock {
422
- text: b.text.clone(),
423
- bbox: b.bbox,
424
- font_size: b.font_size,
425
- hierarchy_level: *l,
426
- })
427
- .collect::<Vec<_>>(),
428
- &[
429
- GroundTruthBlock {
430
- text: "Title".to_string(),
431
- level: "H1".to_string(),
432
- bbox: GroundTruthBBox {
433
- left: 0.0,
434
- top: 0.0,
435
- right: 100.0,
436
- bottom: 28.0,
437
- },
438
- },
439
- GroundTruthBlock {
440
- text: "Subtitle".to_string(),
441
- level: "H2".to_string(),
442
- bbox: GroundTruthBBox {
443
- left: 0.0,
444
- top: 30.0,
445
- right: 100.0,
446
- bottom: 54.0,
447
- },
448
- },
449
- GroundTruthBlock {
450
- text: "Section".to_string(),
451
- level: "H3".to_string(),
452
- bbox: GroundTruthBBox {
453
- left: 0.0,
454
- top: 60.0,
455
- right: 100.0,
456
- bottom: 80.0,
457
- },
458
- },
459
- GroundTruthBlock {
460
- text: "Body paragraph".to_string(),
461
- level: "Body".to_string(),
462
- bbox: GroundTruthBBox {
463
- left: 0.0,
464
- top: 90.0,
465
- right: 100.0,
466
- bottom: 102.0,
467
- },
468
- },
469
- ],
470
- );
471
-
472
- println!("Consistency Test - F1 Score: {:.4}", quality_metrics.f1_score);
473
- assert!(
474
- quality_metrics.f1_score >= 0.8,
475
- "F1 score for simple hierarchy should be >= 0.8"
476
- );
477
- }
478
-
479
- #[test]
480
- fn test_hierarchy_level_assignment() {
481
- // Arrange: Create blocks and KMeans result
482
- let blocks = vec![
483
- TextBlock {
484
- text: "Main Title".to_string(),
485
- bbox: BoundingBox {
486
- left: 50.0,
487
- top: 50.0,
488
- right: 150.0,
489
- bottom: 100.0,
490
- },
491
- font_size: 28.0,
492
- },
493
- TextBlock {
494
- text: "Section Title".to_string(),
495
- bbox: BoundingBox {
496
- left: 50.0,
497
- top: 120.0,
498
- right: 150.0,
499
- bottom: 160.0,
500
- },
501
- font_size: 20.0,
502
- },
503
- TextBlock {
504
- text: "Regular body text".to_string(),
505
- bbox: BoundingBox {
506
- left: 50.0,
507
- top: 180.0,
508
- right: 200.0,
509
- bottom: 200.0,
510
- },
511
- font_size: 12.0,
512
- },
513
- ];
514
-
515
- let kmeans_result = KMeansResult { labels: vec![0, 1, 2] };
516
-
517
- // Act: Assign hierarchy levels using KMeans result
518
- let result = assign_hierarchy_levels(&blocks, &kmeans_result);
519
-
520
- // Assert: Verify correct level assignments
521
- assert_eq!(result.len(), 3);
522
- assert_eq!(result[0].hierarchy_level, HierarchyLevel::H1);
523
- assert_eq!(result[1].hierarchy_level, HierarchyLevel::H2);
524
- assert_eq!(result[2].hierarchy_level, HierarchyLevel::H3);
525
- }
526
-
527
- #[test]
528
- fn test_quality_metrics_calculation() {
529
- // Arrange: Create extracted blocks and ground truth
530
- let extracted = vec![
531
- kreuzberg::pdf::hierarchy::HierarchyBlock {
532
- text: "Title".to_string(),
533
- bbox: BoundingBox {
534
- left: 0.0,
535
- top: 0.0,
536
- right: 100.0,
537
- bottom: 20.0,
538
- },
539
- font_size: 28.0,
540
- hierarchy_level: HierarchyLevel::H1,
541
- },
542
- kreuzberg::pdf::hierarchy::HierarchyBlock {
543
- text: "Body".to_string(),
544
- bbox: BoundingBox {
545
- left: 0.0,
546
- top: 30.0,
547
- right: 100.0,
548
- bottom: 50.0,
549
- },
550
- font_size: 12.0,
551
- hierarchy_level: HierarchyLevel::Body,
552
- },
553
- ];
554
-
555
- let ground_truth = vec![
556
- GroundTruthBlock {
557
- text: "Title".to_string(),
558
- level: "H1".to_string(),
559
- bbox: GroundTruthBBox {
560
- left: 0.0,
561
- top: 0.0,
562
- right: 100.0,
563
- bottom: 20.0,
564
- },
565
- },
566
- GroundTruthBlock {
567
- text: "Body".to_string(),
568
- level: "Body".to_string(),
569
- bbox: GroundTruthBBox {
570
- left: 0.0,
571
- top: 30.0,
572
- right: 100.0,
573
- bottom: 50.0,
574
- },
575
- },
576
- ];
577
-
578
- // Act: Calculate metrics
579
- let metrics = calculate_quality_metrics(&extracted, &ground_truth);
580
-
581
- // Assert: Verify metrics
582
- assert_eq!(metrics.true_positives, 2);
583
- assert_eq!(metrics.false_positives, 0);
584
- assert_eq!(metrics.false_negatives, 0);
585
- assert_eq!(metrics.correct_levels, 2);
586
- assert!(metrics.precision > 0.99);
587
- assert!(metrics.recall > 0.99);
588
- assert!(metrics.f1_score > 0.99);
589
- }