kreuzberg 4.0.0.pre.rc.29 → 4.0.0.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (321) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +0 -6
  3. data/.rubocop.yaml +534 -1
  4. data/Gemfile +2 -1
  5. data/Gemfile.lock +28 -116
  6. data/README.md +269 -629
  7. data/Rakefile +0 -9
  8. data/Steepfile +4 -8
  9. data/examples/async_patterns.rb +58 -1
  10. data/ext/kreuzberg_rb/extconf.rb +5 -35
  11. data/ext/kreuzberg_rb/native/Cargo.toml +16 -55
  12. data/ext/kreuzberg_rb/native/build.rs +14 -12
  13. data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
  14. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
  15. data/ext/kreuzberg_rb/native/include/strings.h +2 -2
  16. data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
  17. data/ext/kreuzberg_rb/native/src/lib.rs +34 -897
  18. data/extconf.rb +6 -38
  19. data/kreuzberg.gemspec +20 -114
  20. data/lib/kreuzberg/api_proxy.rb +18 -2
  21. data/lib/kreuzberg/cache_api.rb +0 -22
  22. data/lib/kreuzberg/cli.rb +10 -2
  23. data/lib/kreuzberg/cli_proxy.rb +10 -0
  24. data/lib/kreuzberg/config.rb +22 -274
  25. data/lib/kreuzberg/errors.rb +7 -73
  26. data/lib/kreuzberg/extraction_api.rb +8 -237
  27. data/lib/kreuzberg/mcp_proxy.rb +11 -2
  28. data/lib/kreuzberg/ocr_backend_protocol.rb +73 -0
  29. data/lib/kreuzberg/post_processor_protocol.rb +71 -0
  30. data/lib/kreuzberg/result.rb +33 -151
  31. data/lib/kreuzberg/setup_lib_path.rb +2 -22
  32. data/lib/kreuzberg/validator_protocol.rb +73 -0
  33. data/lib/kreuzberg/version.rb +1 -1
  34. data/lib/kreuzberg.rb +13 -27
  35. data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
  36. data/sig/kreuzberg.rbs +12 -105
  37. data/spec/binding/cache_spec.rb +22 -22
  38. data/spec/binding/cli_proxy_spec.rb +4 -2
  39. data/spec/binding/cli_spec.rb +11 -12
  40. data/spec/binding/config_spec.rb +0 -74
  41. data/spec/binding/config_validation_spec.rb +6 -100
  42. data/spec/binding/error_handling_spec.rb +97 -283
  43. data/spec/binding/plugins/ocr_backend_spec.rb +8 -8
  44. data/spec/binding/plugins/postprocessor_spec.rb +11 -11
  45. data/spec/binding/plugins/validator_spec.rb +13 -12
  46. data/spec/examples.txt +104 -0
  47. data/spec/fixtures/config.toml +1 -0
  48. data/spec/fixtures/config.yaml +1 -0
  49. data/spec/fixtures/invalid_config.toml +1 -0
  50. data/spec/smoke/package_spec.rb +3 -2
  51. data/spec/spec_helper.rb +3 -1
  52. data/vendor/kreuzberg/Cargo.toml +67 -192
  53. data/vendor/kreuzberg/README.md +9 -97
  54. data/vendor/kreuzberg/build.rs +194 -516
  55. data/vendor/kreuzberg/src/api/handlers.rs +9 -130
  56. data/vendor/kreuzberg/src/api/mod.rs +3 -18
  57. data/vendor/kreuzberg/src/api/server.rs +71 -236
  58. data/vendor/kreuzberg/src/api/types.rs +7 -43
  59. data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
  60. data/vendor/kreuzberg/src/cache/mod.rs +3 -27
  61. data/vendor/kreuzberg/src/chunking/mod.rs +79 -1705
  62. data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
  63. data/vendor/kreuzberg/src/core/config.rs +23 -905
  64. data/vendor/kreuzberg/src/core/extractor.rs +106 -403
  65. data/vendor/kreuzberg/src/core/io.rs +2 -4
  66. data/vendor/kreuzberg/src/core/mime.rs +12 -2
  67. data/vendor/kreuzberg/src/core/mod.rs +3 -22
  68. data/vendor/kreuzberg/src/core/pipeline.rs +78 -395
  69. data/vendor/kreuzberg/src/embeddings.rs +21 -169
  70. data/vendor/kreuzberg/src/error.rs +2 -2
  71. data/vendor/kreuzberg/src/extraction/archive.rs +31 -36
  72. data/vendor/kreuzberg/src/extraction/docx.rs +1 -365
  73. data/vendor/kreuzberg/src/extraction/email.rs +11 -12
  74. data/vendor/kreuzberg/src/extraction/excel.rs +129 -138
  75. data/vendor/kreuzberg/src/extraction/html.rs +170 -1447
  76. data/vendor/kreuzberg/src/extraction/image.rs +14 -138
  77. data/vendor/kreuzberg/src/extraction/libreoffice.rs +3 -13
  78. data/vendor/kreuzberg/src/extraction/mod.rs +5 -21
  79. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
  80. data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
  81. data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
  82. data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
  83. data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
  84. data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
  85. data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
  86. data/vendor/kreuzberg/src/extraction/pptx.rs +94 -196
  87. data/vendor/kreuzberg/src/extraction/structured.rs +4 -5
  88. data/vendor/kreuzberg/src/extraction/table.rs +1 -2
  89. data/vendor/kreuzberg/src/extraction/text.rs +10 -18
  90. data/vendor/kreuzberg/src/extractors/archive.rs +0 -22
  91. data/vendor/kreuzberg/src/extractors/docx.rs +148 -69
  92. data/vendor/kreuzberg/src/extractors/email.rs +9 -37
  93. data/vendor/kreuzberg/src/extractors/excel.rs +40 -81
  94. data/vendor/kreuzberg/src/extractors/html.rs +173 -182
  95. data/vendor/kreuzberg/src/extractors/image.rs +8 -32
  96. data/vendor/kreuzberg/src/extractors/mod.rs +10 -171
  97. data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
  98. data/vendor/kreuzberg/src/extractors/pdf.rs +64 -329
  99. data/vendor/kreuzberg/src/extractors/pptx.rs +34 -79
  100. data/vendor/kreuzberg/src/extractors/structured.rs +0 -16
  101. data/vendor/kreuzberg/src/extractors/text.rs +7 -30
  102. data/vendor/kreuzberg/src/extractors/xml.rs +8 -27
  103. data/vendor/kreuzberg/src/keywords/processor.rs +1 -9
  104. data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
  105. data/vendor/kreuzberg/src/language_detection/mod.rs +51 -94
  106. data/vendor/kreuzberg/src/lib.rs +5 -17
  107. data/vendor/kreuzberg/src/mcp/mod.rs +1 -4
  108. data/vendor/kreuzberg/src/mcp/server.rs +21 -145
  109. data/vendor/kreuzberg/src/ocr/mod.rs +0 -2
  110. data/vendor/kreuzberg/src/ocr/processor.rs +8 -19
  111. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +0 -2
  112. data/vendor/kreuzberg/src/pdf/error.rs +1 -93
  113. data/vendor/kreuzberg/src/pdf/metadata.rs +100 -263
  114. data/vendor/kreuzberg/src/pdf/mod.rs +2 -33
  115. data/vendor/kreuzberg/src/pdf/rendering.rs +12 -12
  116. data/vendor/kreuzberg/src/pdf/table.rs +64 -61
  117. data/vendor/kreuzberg/src/pdf/text.rs +24 -416
  118. data/vendor/kreuzberg/src/plugins/extractor.rs +8 -40
  119. data/vendor/kreuzberg/src/plugins/mod.rs +0 -3
  120. data/vendor/kreuzberg/src/plugins/ocr.rs +14 -22
  121. data/vendor/kreuzberg/src/plugins/processor.rs +1 -10
  122. data/vendor/kreuzberg/src/plugins/registry.rs +0 -15
  123. data/vendor/kreuzberg/src/plugins/validator.rs +8 -20
  124. data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
  125. data/vendor/kreuzberg/src/text/mod.rs +0 -8
  126. data/vendor/kreuzberg/src/text/quality.rs +15 -28
  127. data/vendor/kreuzberg/src/text/string_utils.rs +10 -22
  128. data/vendor/kreuzberg/src/text/token_reduction/core.rs +50 -86
  129. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +16 -37
  130. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +1 -2
  131. data/vendor/kreuzberg/src/types.rs +67 -907
  132. data/vendor/kreuzberg/src/utils/mod.rs +0 -14
  133. data/vendor/kreuzberg/src/utils/quality.rs +3 -12
  134. data/vendor/kreuzberg/tests/api_tests.rs +0 -506
  135. data/vendor/kreuzberg/tests/archive_integration.rs +0 -2
  136. data/vendor/kreuzberg/tests/batch_orchestration.rs +12 -57
  137. data/vendor/kreuzberg/tests/batch_processing.rs +8 -32
  138. data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
  139. data/vendor/kreuzberg/tests/concurrency_stress.rs +8 -40
  140. data/vendor/kreuzberg/tests/config_features.rs +1 -33
  141. data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -16
  142. data/vendor/kreuzberg/tests/core_integration.rs +9 -35
  143. data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
  144. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
  145. data/vendor/kreuzberg/tests/email_integration.rs +1 -3
  146. data/vendor/kreuzberg/tests/error_handling.rs +34 -43
  147. data/vendor/kreuzberg/tests/format_integration.rs +1 -7
  148. data/vendor/kreuzberg/tests/helpers/mod.rs +0 -60
  149. data/vendor/kreuzberg/tests/image_integration.rs +0 -2
  150. data/vendor/kreuzberg/tests/mime_detection.rs +16 -17
  151. data/vendor/kreuzberg/tests/ocr_configuration.rs +0 -4
  152. data/vendor/kreuzberg/tests/ocr_errors.rs +0 -22
  153. data/vendor/kreuzberg/tests/ocr_quality.rs +0 -2
  154. data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
  155. data/vendor/kreuzberg/tests/pdf_integration.rs +0 -2
  156. data/vendor/kreuzberg/tests/pipeline_integration.rs +2 -36
  157. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +0 -5
  158. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -17
  159. data/vendor/kreuzberg/tests/plugin_system.rs +0 -6
  160. data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -2
  161. data/vendor/kreuzberg/tests/security_validation.rs +1 -13
  162. data/vendor/kreuzberg/tests/test_fastembed.rs +23 -45
  163. metadata +25 -171
  164. data/.rubocop.yml +0 -543
  165. data/ext/kreuzberg_rb/native/.cargo/config.toml +0 -23
  166. data/ext/kreuzberg_rb/native/Cargo.lock +0 -7619
  167. data/lib/kreuzberg/error_context.rb +0 -136
  168. data/lib/kreuzberg/types.rb +0 -170
  169. data/lib/libpdfium.so +0 -0
  170. data/spec/binding/async_operations_spec.rb +0 -473
  171. data/spec/binding/batch_operations_spec.rb +0 -595
  172. data/spec/binding/batch_spec.rb +0 -359
  173. data/spec/binding/config_result_spec.rb +0 -377
  174. data/spec/binding/embeddings_spec.rb +0 -816
  175. data/spec/binding/error_recovery_spec.rb +0 -488
  176. data/spec/binding/font_config_spec.rb +0 -220
  177. data/spec/binding/images_spec.rb +0 -738
  178. data/spec/binding/keywords_extraction_spec.rb +0 -600
  179. data/spec/binding/metadata_types_spec.rb +0 -1228
  180. data/spec/binding/pages_extraction_spec.rb +0 -471
  181. data/spec/binding/tables_spec.rb +0 -641
  182. data/spec/unit/config/chunking_config_spec.rb +0 -213
  183. data/spec/unit/config/embedding_config_spec.rb +0 -343
  184. data/spec/unit/config/extraction_config_spec.rb +0 -438
  185. data/spec/unit/config/font_config_spec.rb +0 -285
  186. data/spec/unit/config/hierarchy_config_spec.rb +0 -314
  187. data/spec/unit/config/image_extraction_config_spec.rb +0 -209
  188. data/spec/unit/config/image_preprocessing_config_spec.rb +0 -249
  189. data/spec/unit/config/keyword_config_spec.rb +0 -229
  190. data/spec/unit/config/language_detection_config_spec.rb +0 -258
  191. data/spec/unit/config/ocr_config_spec.rb +0 -171
  192. data/spec/unit/config/page_config_spec.rb +0 -221
  193. data/spec/unit/config/pdf_config_spec.rb +0 -267
  194. data/spec/unit/config/postprocessor_config_spec.rb +0 -290
  195. data/spec/unit/config/tesseract_config_spec.rb +0 -181
  196. data/spec/unit/config/token_reduction_config_spec.rb +0 -251
  197. data/test/metadata_types_test.rb +0 -959
  198. data/vendor/Cargo.toml +0 -61
  199. data/vendor/kreuzberg/examples/bench_fixes.rs +0 -71
  200. data/vendor/kreuzberg/examples/test_pdfium_fork.rs +0 -62
  201. data/vendor/kreuzberg/src/chunking/processor.rs +0 -219
  202. data/vendor/kreuzberg/src/core/batch_optimizations.rs +0 -385
  203. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  204. data/vendor/kreuzberg/src/core/formats.rs +0 -235
  205. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  206. data/vendor/kreuzberg/src/extraction/capacity.rs +0 -263
  207. data/vendor/kreuzberg/src/extraction/markdown.rs +0 -216
  208. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -284
  209. data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -470
  210. data/vendor/kreuzberg/src/extractors/docbook.rs +0 -504
  211. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  212. data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -492
  213. data/vendor/kreuzberg/src/extractors/jats.rs +0 -1054
  214. data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -368
  215. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  216. data/vendor/kreuzberg/src/extractors/markdown.rs +0 -701
  217. data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
  218. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  219. data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -529
  220. data/vendor/kreuzberg/src/extractors/rst.rs +0 -577
  221. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  222. data/vendor/kreuzberg/src/extractors/security.rs +0 -484
  223. data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
  224. data/vendor/kreuzberg/src/extractors/typst.rs +0 -651
  225. data/vendor/kreuzberg/src/language_detection/processor.rs +0 -218
  226. data/vendor/kreuzberg/src/ocr/language_registry.rs +0 -520
  227. data/vendor/kreuzberg/src/panic_context.rs +0 -154
  228. data/vendor/kreuzberg/src/pdf/bindings.rs +0 -306
  229. data/vendor/kreuzberg/src/pdf/bundled.rs +0 -408
  230. data/vendor/kreuzberg/src/pdf/fonts.rs +0 -358
  231. data/vendor/kreuzberg/src/pdf/hierarchy.rs +0 -903
  232. data/vendor/kreuzberg/src/text/quality_processor.rs +0 -231
  233. data/vendor/kreuzberg/src/text/utf8_validation.rs +0 -193
  234. data/vendor/kreuzberg/src/utils/pool.rs +0 -503
  235. data/vendor/kreuzberg/src/utils/pool_sizing.rs +0 -364
  236. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -761
  237. data/vendor/kreuzberg/tests/api_embed.rs +0 -360
  238. data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
  239. data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +0 -471
  240. data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +0 -289
  241. data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +0 -154
  242. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
  243. data/vendor/kreuzberg/tests/config_integration_test.rs +0 -753
  244. data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +0 -294
  245. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -500
  246. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
  247. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
  248. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
  249. data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
  250. data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
  251. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
  252. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
  253. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
  254. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
  255. data/vendor/kreuzberg/tests/ocr_language_registry.rs +0 -191
  256. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -674
  257. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
  258. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
  259. data/vendor/kreuzberg/tests/page_markers.rs +0 -297
  260. data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +0 -301
  261. data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +0 -589
  262. data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +0 -301
  263. data/vendor/kreuzberg/tests/pdf_text_merging.rs +0 -475
  264. data/vendor/kreuzberg/tests/pdfium_linking.rs +0 -340
  265. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -694
  266. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -775
  267. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1260
  268. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -648
  269. data/vendor/kreuzberg-ffi/Cargo.toml +0 -67
  270. data/vendor/kreuzberg-ffi/README.md +0 -851
  271. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +0 -227
  272. data/vendor/kreuzberg-ffi/build.rs +0 -168
  273. data/vendor/kreuzberg-ffi/cbindgen.toml +0 -37
  274. data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +0 -12
  275. data/vendor/kreuzberg-ffi/kreuzberg.h +0 -3012
  276. data/vendor/kreuzberg-ffi/src/batch_streaming.rs +0 -588
  277. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
  278. data/vendor/kreuzberg-ffi/src/error.rs +0 -901
  279. data/vendor/kreuzberg-ffi/src/extraction.rs +0 -555
  280. data/vendor/kreuzberg-ffi/src/helpers.rs +0 -879
  281. data/vendor/kreuzberg-ffi/src/lib.rs +0 -977
  282. data/vendor/kreuzberg-ffi/src/memory.rs +0 -493
  283. data/vendor/kreuzberg-ffi/src/mime.rs +0 -329
  284. data/vendor/kreuzberg-ffi/src/panic_shield.rs +0 -265
  285. data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +0 -442
  286. data/vendor/kreuzberg-ffi/src/plugins/mod.rs +0 -14
  287. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +0 -628
  288. data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +0 -438
  289. data/vendor/kreuzberg-ffi/src/plugins/validator.rs +0 -329
  290. data/vendor/kreuzberg-ffi/src/result.rs +0 -510
  291. data/vendor/kreuzberg-ffi/src/result_pool.rs +0 -639
  292. data/vendor/kreuzberg-ffi/src/result_view.rs +0 -773
  293. data/vendor/kreuzberg-ffi/src/string_intern.rs +0 -568
  294. data/vendor/kreuzberg-ffi/src/types.rs +0 -363
  295. data/vendor/kreuzberg-ffi/src/util.rs +0 -210
  296. data/vendor/kreuzberg-ffi/src/validation.rs +0 -848
  297. data/vendor/kreuzberg-ffi/tests.disabled/README.md +0 -48
  298. data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +0 -299
  299. data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +0 -346
  300. data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +0 -232
  301. data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +0 -470
  302. data/vendor/kreuzberg-tesseract/.commitlintrc.json +0 -13
  303. data/vendor/kreuzberg-tesseract/.crate-ignore +0 -2
  304. data/vendor/kreuzberg-tesseract/Cargo.lock +0 -2933
  305. data/vendor/kreuzberg-tesseract/Cargo.toml +0 -57
  306. data/vendor/kreuzberg-tesseract/LICENSE +0 -22
  307. data/vendor/kreuzberg-tesseract/README.md +0 -399
  308. data/vendor/kreuzberg-tesseract/build.rs +0 -1127
  309. data/vendor/kreuzberg-tesseract/patches/README.md +0 -71
  310. data/vendor/kreuzberg-tesseract/patches/tesseract.diff +0 -199
  311. data/vendor/kreuzberg-tesseract/src/api.rs +0 -1371
  312. data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +0 -77
  313. data/vendor/kreuzberg-tesseract/src/enums.rs +0 -297
  314. data/vendor/kreuzberg-tesseract/src/error.rs +0 -81
  315. data/vendor/kreuzberg-tesseract/src/lib.rs +0 -145
  316. data/vendor/kreuzberg-tesseract/src/monitor.rs +0 -57
  317. data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +0 -197
  318. data/vendor/kreuzberg-tesseract/src/page_iterator.rs +0 -253
  319. data/vendor/kreuzberg-tesseract/src/result_iterator.rs +0 -286
  320. data/vendor/kreuzberg-tesseract/src/result_renderer.rs +0 -183
  321. data/vendor/kreuzberg-tesseract/tests/integration_test.rs +0 -211
@@ -1,551 +0,0 @@
1
- //! HTML table parsing tests for `html-to-markdown-rs`.
2
- //!
3
- //! Tests to verify that `html-to-markdown-rs` handles HTML table parsing correctly.
4
- //! These tests help determine if we can safely remove the `scraper` dependency
5
- //! by confirming that `html-to-markdown-rs` already handles table content preservation.
6
-
7
- #[cfg(feature = "html")]
8
- mod html_table_tests {
9
- use kreuzberg::extraction::html::convert_html_to_markdown;
10
-
11
- /// Test basic table HTML to markdown conversion.
12
- ///
13
- /// Verifies that:
14
- /// - Table structure is recognized
15
- /// - Header row (th) content is preserved
16
- /// - Data rows (td) content is preserved
17
- /// - All cell values are retained in output
18
- #[test]
19
- fn test_basic_table_parsing() {
20
- let html = r#"
21
- <table>
22
- <tr>
23
- <th>Name</th>
24
- <th>Age</th>
25
- </tr>
26
- <tr>
27
- <td>Alice</td>
28
- <td>30</td>
29
- </tr>
30
- <tr>
31
- <td>Bob</td>
32
- <td>25</td>
33
- </tr>
34
- </table>
35
- "#;
36
-
37
- let result = convert_html_to_markdown(html, None);
38
- assert!(result.is_ok(), "HTML to markdown conversion should succeed");
39
-
40
- let markdown = result.unwrap();
41
-
42
- println!("=== Basic Table Test ===");
43
- println!("Input HTML:\n{}", html);
44
- println!("\nOutput Markdown:\n{}", markdown);
45
- println!("========================\n");
46
-
47
- assert!(markdown.contains("Name"), "Should contain header 'Name'");
48
- assert!(markdown.contains("Age"), "Should contain header 'Age'");
49
-
50
- assert!(markdown.contains("Alice"), "Should contain cell 'Alice'");
51
- assert!(markdown.contains("Bob"), "Should contain cell 'Bob'");
52
- assert!(markdown.contains("30"), "Should contain cell '30'");
53
- assert!(markdown.contains("25"), "Should contain cell '25'");
54
- }
55
-
56
- /// Test markdown table format output.
57
- ///
58
- /// Verifies that the library outputs proper markdown table syntax
59
- /// with pipe separators and alignment markers.
60
- #[test]
61
- fn test_markdown_table_format() {
62
- let html = r#"
63
- <table>
64
- <thead>
65
- <tr>
66
- <th>Column 1</th>
67
- <th>Column 2</th>
68
- </tr>
69
- </thead>
70
- <tbody>
71
- <tr>
72
- <td>Value 1</td>
73
- <td>Value 2</td>
74
- </tr>
75
- </tbody>
76
- </table>
77
- "#;
78
-
79
- let result = convert_html_to_markdown(html, None);
80
- assert!(result.is_ok(), "Should convert to markdown");
81
-
82
- let markdown = result.unwrap();
83
-
84
- println!("=== Table Format Test ===");
85
- println!("Input HTML:\n{}", html);
86
- println!("\nOutput Markdown:\n{}", markdown);
87
- println!("==========================\n");
88
-
89
- if markdown.contains("|") {
90
- println!("✓ Table uses pipe (|) separators (standard markdown table format)");
91
- assert!(
92
- markdown.contains("Column 1") && markdown.contains("Column 2"),
93
- "Headers should be present in pipe-separated format"
94
- );
95
- } else {
96
- println!("✓ Table content preserved but in alternative format");
97
- assert!(
98
- markdown.contains("Column 1") && markdown.contains("Column 2"),
99
- "Headers should still be present in output"
100
- );
101
- }
102
-
103
- assert!(
104
- markdown.contains("Value 1") && markdown.contains("Value 2"),
105
- "Data should be preserved"
106
- );
107
- }
108
-
109
- /// Test complex table with nested HTML content in cells.
110
- ///
111
- /// Verifies that:
112
- /// - Bold text (strong/b) in cells is handled
113
- /// - Italic text (em/i) in cells is handled
114
- /// - Links in cells are handled
115
- /// - Nested formatting doesn't break table structure
116
- #[test]
117
- fn test_complex_table_with_formatting() {
118
- let html = r#"
119
- <table>
120
- <tr>
121
- <th>Feature</th>
122
- <th>Status</th>
123
- <th>Link</th>
124
- </tr>
125
- <tr>
126
- <td>Headers</td>
127
- <td><strong>Working</strong></td>
128
- <td><a href="https://example.com">docs</a></td>
129
- </tr>
130
- <tr>
131
- <td>Data cells</td>
132
- <td><em>Implemented</em></td>
133
- <td><a href="https://test.com">test</a></td>
134
- </tr>
135
- <tr>
136
- <td><strong>Bold Cell</strong></td>
137
- <td><em>Italic Cell</em></td>
138
- <td><strong><em>Both</em></strong></td>
139
- </tr>
140
- </table>
141
- "#;
142
-
143
- let result = convert_html_to_markdown(html, None);
144
- assert!(result.is_ok(), "Should convert complex table");
145
-
146
- let markdown = result.unwrap();
147
-
148
- println!("=== Complex Table Test ===");
149
- println!("Input HTML:\n{}", html);
150
- println!("\nOutput Markdown:\n{}", markdown);
151
- println!("===========================\n");
152
-
153
- assert!(markdown.contains("Feature"), "Should preserve 'Feature' header");
154
- assert!(markdown.contains("Status"), "Should preserve 'Status' header");
155
- assert!(markdown.contains("Link"), "Should preserve 'Link' header");
156
-
157
- assert!(markdown.contains("Headers"), "Should preserve 'Headers' cell");
158
- assert!(markdown.contains("Data cells"), "Should preserve 'Data cells' cell");
159
-
160
- assert!(
161
- markdown.contains("Working"),
162
- "Should preserve 'Working' (from strong tag)"
163
- );
164
- assert!(
165
- markdown.contains("Implemented"),
166
- "Should preserve 'Implemented' (from em tag)"
167
- );
168
-
169
- assert!(
170
- markdown.contains("docs") || markdown.contains("example.com"),
171
- "Should preserve link content or URL"
172
- );
173
-
174
- println!("✓ All content preserved in complex table");
175
- }
176
-
177
- /// Test table with colspan and rowspan attributes.
178
- ///
179
- /// Verifies how the library handles merged cells.
180
- #[test]
181
- fn test_table_with_merged_cells() {
182
- let html = r#"
183
- <table>
184
- <tr>
185
- <th colspan="2">Merged Header</th>
186
- </tr>
187
- <tr>
188
- <td>Cell 1</td>
189
- <td>Cell 2</td>
190
- </tr>
191
- </table>
192
- "#;
193
-
194
- let result = convert_html_to_markdown(html, None);
195
- assert!(result.is_ok(), "Should handle merged cell table");
196
-
197
- let markdown = result.unwrap();
198
-
199
- println!("=== Merged Cells Test ===");
200
- println!("Input HTML:\n{}", html);
201
- println!("\nOutput Markdown:\n{}", markdown);
202
- println!("==========================\n");
203
-
204
- assert!(
205
- markdown.contains("Merged Header"),
206
- "Should preserve merged header content"
207
- );
208
- assert!(
209
- markdown.contains("Cell 1") && markdown.contains("Cell 2"),
210
- "Should preserve all cell content"
211
- );
212
-
213
- println!("✓ Merged cell content preserved");
214
- }
215
-
216
- /// Test multiple tables in same HTML document.
217
- ///
218
- /// Verifies that the library can handle multiple tables
219
- /// without losing data or mixing them up.
220
- #[test]
221
- fn test_multiple_tables() {
222
- let html = r#"
223
- <h2>First Table</h2>
224
- <table>
225
- <tr>
226
- <th>A</th>
227
- <th>B</th>
228
- </tr>
229
- <tr>
230
- <td>1</td>
231
- <td>2</td>
232
- </tr>
233
- </table>
234
-
235
- <h2>Second Table</h2>
236
- <table>
237
- <tr>
238
- <th>X</th>
239
- <th>Y</th>
240
- </tr>
241
- <tr>
242
- <td>10</td>
243
- <td>20</td>
244
- </tr>
245
- </table>
246
- "#;
247
-
248
- let result = convert_html_to_markdown(html, None);
249
- assert!(result.is_ok(), "Should handle multiple tables");
250
-
251
- let markdown = result.unwrap();
252
-
253
- println!("=== Multiple Tables Test ===");
254
- println!("Input HTML:\n{}", html);
255
- println!("\nOutput Markdown:\n{}", markdown);
256
- println!("==============================\n");
257
-
258
- assert!(markdown.contains("First Table"), "Should preserve first table heading");
259
- assert!(
260
- markdown.contains("Second Table"),
261
- "Should preserve second table heading"
262
- );
263
- assert!(
264
- markdown.contains("A") && markdown.contains("B"),
265
- "Should preserve first table headers"
266
- );
267
- assert!(
268
- markdown.contains("X") && markdown.contains("Y"),
269
- "Should preserve second table headers"
270
- );
271
- assert!(
272
- markdown.contains("1") && markdown.contains("2"),
273
- "Should preserve first table data"
274
- );
275
- assert!(
276
- markdown.contains("10") && markdown.contains("20"),
277
- "Should preserve second table data"
278
- );
279
-
280
- println!("✓ Multiple tables handled correctly");
281
- }
282
-
283
- /// Test table with th in data rows (mixed headers and data).
284
- ///
285
- /// Some HTML tables use th elements in tbody, not just thead.
286
- #[test]
287
- fn test_table_with_mixed_header_cells() {
288
- let html = r#"
289
- <table>
290
- <tr>
291
- <th>Row Header</th>
292
- <td>Data 1</td>
293
- <td>Data 2</td>
294
- </tr>
295
- <tr>
296
- <th>Row Header 2</th>
297
- <td>Data 3</td>
298
- <td>Data 4</td>
299
- </tr>
300
- </table>
301
- "#;
302
-
303
- let result = convert_html_to_markdown(html, None);
304
- assert!(result.is_ok(), "Should handle mixed header cells");
305
-
306
- let markdown = result.unwrap();
307
-
308
- println!("=== Mixed Header Cells Test ===");
309
- println!("Input HTML:\n{}", html);
310
- println!("\nOutput Markdown:\n{}", markdown);
311
- println!("=================================\n");
312
-
313
- assert!(markdown.contains("Row Header"), "Should preserve first row header");
314
- assert!(markdown.contains("Row Header 2"), "Should preserve second row header");
315
- assert!(
316
- markdown.contains("Data 1")
317
- && markdown.contains("Data 2")
318
- && markdown.contains("Data 3")
319
- && markdown.contains("Data 4"),
320
- "Should preserve all data cells"
321
- );
322
-
323
- println!("✓ Mixed header cells preserved");
324
- }
325
-
326
- /// Test table with caption and other structural elements.
327
- ///
328
- /// Verifies that additional table structure elements are handled.
329
- #[test]
330
- fn test_table_with_caption() {
331
- let html = r#"
332
- <table>
333
- <caption>Sales Report 2024</caption>
334
- <tr>
335
- <th>Product</th>
336
- <th>Sales</th>
337
- </tr>
338
- <tr>
339
- <td>Widget A</td>
340
- <td>$1,000</td>
341
- </tr>
342
- <tr>
343
- <td>Widget B</td>
344
- <td>$2,500</td>
345
- </tr>
346
- </table>
347
- "#;
348
-
349
- let result = convert_html_to_markdown(html, None);
350
- assert!(result.is_ok(), "Should handle table with caption");
351
-
352
- let markdown = result.unwrap();
353
-
354
- println!("=== Table with Caption Test ===");
355
- println!("Input HTML:\n{}", html);
356
- println!("\nOutput Markdown:\n{}", markdown);
357
- println!("=================================\n");
358
-
359
- if markdown.contains("Sales Report 2024") {
360
- println!("✓ Caption is preserved in output");
361
- } else {
362
- println!("✓ Caption may be handled separately but content is present");
363
- }
364
-
365
- assert!(
366
- markdown.contains("Product") && markdown.contains("Sales"),
367
- "Should preserve headers"
368
- );
369
- assert!(
370
- markdown.contains("Widget A")
371
- && markdown.contains("Widget B")
372
- && markdown.contains("1,000")
373
- && markdown.contains("2,500"),
374
- "Should preserve all table data"
375
- );
376
- }
377
-
378
- /// Test simple flat table data structure.
379
- ///
380
- /// This is the most common table format and should work reliably.
381
- #[test]
382
- fn test_simple_flat_table() {
383
- let html = r#"<table><tr><td>A</td><td>B</td></tr><tr><td>C</td><td>D</td></tr></table>"#;
384
-
385
- let result = convert_html_to_markdown(html, None);
386
- assert!(result.is_ok(), "Should handle flat table");
387
-
388
- let markdown = result.unwrap();
389
-
390
- println!("=== Simple Flat Table Test ===");
391
- println!("Input HTML:\n{}", html);
392
- println!("\nOutput Markdown:\n{}", markdown);
393
- println!("==============================\n");
394
-
395
- assert!(
396
- markdown.contains("A") && markdown.contains("B") && markdown.contains("C") && markdown.contains("D"),
397
- "Should preserve all cells in flat table"
398
- );
399
-
400
- println!("✓ Flat table structure preserved");
401
- }
402
-
403
- /// Test empty table cells.
404
- ///
405
- /// Verifies handling of tables with empty or whitespace-only cells.
406
- #[test]
407
- fn test_table_with_empty_cells() {
408
- let html = r#"
409
- <table>
410
- <tr>
411
- <td>Data</td>
412
- <td></td>
413
- </tr>
414
- <tr>
415
- <td> </td>
416
- <td>More Data</td>
417
- </tr>
418
- </table>
419
- "#;
420
-
421
- let result = convert_html_to_markdown(html, None);
422
- assert!(result.is_ok(), "Should handle empty cells");
423
-
424
- let markdown = result.unwrap();
425
-
426
- println!("=== Empty Cells Test ===");
427
- println!("Input HTML:\n{}", html);
428
- println!("\nOutput Markdown:\n{}", markdown);
429
- println!("========================\n");
430
-
431
- assert!(markdown.contains("Data"), "Should preserve non-empty cell");
432
- assert!(markdown.contains("More Data"), "Should preserve other non-empty cell");
433
-
434
- println!("✓ Table with empty cells handled");
435
- }
436
-
437
- /// Test table with numeric data.
438
- ///
439
- /// Ensures that numeric content is preserved correctly.
440
- #[test]
441
- fn test_table_with_numeric_data() {
442
- let html = r#"
443
- <table>
444
- <tr>
445
- <th>Value</th>
446
- <th>Amount</th>
447
- </tr>
448
- <tr>
449
- <td>123456</td>
450
- <td>789.45</td>
451
- </tr>
452
- <tr>
453
- <td>999</td>
454
- <td>0.01</td>
455
- </tr>
456
- </table>
457
- "#;
458
-
459
- let result = convert_html_to_markdown(html, None);
460
- assert!(result.is_ok(), "Should handle numeric table");
461
-
462
- let markdown = result.unwrap();
463
-
464
- println!("=== Numeric Data Test ===");
465
- println!("Input HTML:\n{}", html);
466
- println!("\nOutput Markdown:\n{}", markdown);
467
- println!("=========================\n");
468
-
469
- assert!(markdown.contains("123456"), "Should preserve numeric data");
470
- assert!(markdown.contains("789.45"), "Should preserve decimal numbers");
471
- assert!(markdown.contains("0.01"), "Should preserve small decimals");
472
-
473
- println!("✓ Numeric data preserved");
474
- }
475
-
476
- /// Test table with special characters and unicode.
477
- ///
478
- /// Verifies handling of non-ASCII characters in table cells.
479
- #[test]
480
- fn test_table_with_special_characters() {
481
- let html = r#"
482
- <table>
483
- <tr>
484
- <th>Name</th>
485
- <th>Description</th>
486
- </tr>
487
- <tr>
488
- <td>Café</td>
489
- <td>Résumé with accents</td>
490
- </tr>
491
- <tr>
492
- <td>北京</td>
493
- <td>Chinese characters</td>
494
- </tr>
495
- <tr>
496
- <td>Ñoño</td>
497
- <td>Spanish tilde</td>
498
- </tr>
499
- </table>
500
- "#;
501
-
502
- let result = convert_html_to_markdown(html, None);
503
- assert!(result.is_ok(), "Should handle unicode characters");
504
-
505
- let markdown = result.unwrap();
506
-
507
- println!("=== Special Characters Test ===");
508
- println!("Input HTML:\n{}", html);
509
- println!("\nOutput Markdown:\n{}", markdown);
510
- println!("=================================\n");
511
-
512
- assert!(markdown.contains("Café"), "Should preserve accented characters");
513
- assert!(markdown.contains("北京"), "Should preserve Chinese characters");
514
- assert!(markdown.contains("Ñoño"), "Should preserve Spanish tilde");
515
-
516
- println!("✓ Special characters preserved");
517
- }
518
- }
519
-
520
- /// Summary test providing an overall assessment of html-to-markdown-rs capabilities.
521
- ///
522
- /// Run with: cargo test --test html_table_test --features html -- --nocapture --test-threads=1
523
- #[cfg(feature = "html")]
524
- #[test]
525
- fn html_table_support_summary() {
526
- println!("\n");
527
- println!("╔════════════════════════════════════════════════════════════════╗");
528
- println!("║ HTML Table Parsing Support Assessment Summary ║");
529
- println!("╠════════════════════════════════════════════════════════════════╣");
530
- println!("║ Testing html-to-markdown-rs capabilities for table parsing ║");
531
- println!("║ to determine if scraper dependency can be safely removed. ║");
532
- println!("╚════════════════════════════════════════════════════════════════╝");
533
- println!();
534
- println!("Test Results:");
535
- println!(" ✓ Basic table parsing with th/td elements");
536
- println!(" ✓ Markdown table format validation");
537
- println!(" ✓ Complex tables with nested HTML content");
538
- println!(" ✓ Tables with merged cells (colspan/rowspan)");
539
- println!(" ✓ Multiple tables in same document");
540
- println!(" ✓ Mixed header cells within tbody");
541
- println!(" ✓ Tables with caption elements");
542
- println!(" ✓ Simple flat table structures");
543
- println!(" ✓ Empty and whitespace-only cells");
544
- println!(" ✓ Numeric data preservation");
545
- println!(" ✓ Unicode and special characters");
546
- println!();
547
- println!("Assessment:");
548
- println!(" If all tests pass: html-to-markdown-rs is sufficient");
549
- println!(" If content is preserved: scraper dependency may be removable");
550
- println!();
551
- }
@@ -1,139 +0,0 @@
1
- #![cfg(feature = "otel")]
2
-
3
- use std::sync::{Arc, Mutex};
4
- use tracing::Subscriber;
5
- use tracing::span::{Attributes, Id};
6
- use tracing_subscriber::Layer;
7
- use tracing_subscriber::layer::{Context, SubscriberExt};
8
- use tracing_subscriber::registry::LookupSpan;
9
-
10
- /// Simple span name collector for testing.
11
- ///
12
- /// This layer collects span names as they are created to verify
13
- /// that instrumentation is working correctly.
14
- struct SpanCollector {
15
- spans: Arc<Mutex<Vec<String>>>,
16
- }
17
-
18
- impl<S: Subscriber + for<'a> LookupSpan<'a>> Layer<S> for SpanCollector {
19
- fn on_new_span(&self, attrs: &Attributes<'_>, _id: &Id, _ctx: Context<'_, S>) {
20
- self.spans.lock().unwrap().push(attrs.metadata().name().to_string());
21
- }
22
- }
23
-
24
- #[tokio::test]
25
- async fn test_cache_instrumentation() {
26
- use kreuzberg::cache::GenericCache;
27
- use tempfile::tempdir;
28
-
29
- let spans = Arc::new(Mutex::new(Vec::new()));
30
- let collector = SpanCollector { spans: spans.clone() };
31
-
32
- let subscriber = tracing_subscriber::registry().with(collector);
33
- let _guard = tracing::subscriber::set_default(subscriber);
34
-
35
- let temp_dir = tempdir().unwrap();
36
- let cache = GenericCache::new(
37
- "test".to_string(),
38
- Some(temp_dir.path().to_str().unwrap().to_string()),
39
- 30.0,
40
- 500.0,
41
- 1000.0,
42
- )
43
- .unwrap();
44
-
45
- cache.set("test_key", b"test data".to_vec(), None).unwrap();
46
-
47
- let _ = cache.get("test_key", None).unwrap();
48
-
49
- let span_names = spans.lock().unwrap();
50
- assert!(span_names.contains(&"set".to_string()), "Expected 'set' span");
51
- assert!(span_names.contains(&"get".to_string()), "Expected 'get' span");
52
- }
53
-
54
- #[cfg(feature = "ocr")]
55
- #[tokio::test]
56
- async fn test_ocr_instrumentation() {
57
- use kreuzberg::ocr::processor::OcrProcessor;
58
- use kreuzberg::ocr::types::TesseractConfig;
59
- use tempfile::tempdir;
60
-
61
- let spans = Arc::new(Mutex::new(Vec::new()));
62
- let collector = SpanCollector { spans: spans.clone() };
63
-
64
- let subscriber = tracing_subscriber::registry().with(collector);
65
- let _guard = tracing::subscriber::set_default(subscriber);
66
-
67
- let temp_dir = tempdir().unwrap();
68
- let processor = OcrProcessor::new(Some(temp_dir.path().to_path_buf())).unwrap();
69
-
70
- let mut test_image = Vec::new();
71
- let img = image::ImageBuffer::from_fn(1, 1, |_, _| image::Rgb([255u8, 255u8, 255u8]));
72
- img.write_to(&mut std::io::Cursor::new(&mut test_image), image::ImageFormat::Png)
73
- .unwrap();
74
-
75
- let config = TesseractConfig {
76
- output_format: "text".to_string(),
77
- use_cache: false,
78
- ..TesseractConfig::default()
79
- };
80
-
81
- let _ = processor.process_image(&test_image, &config);
82
-
83
- let span_names = spans.lock().unwrap();
84
- assert!(
85
- span_names.contains(&"process_image".to_string()),
86
- "Expected 'process_image' span"
87
- );
88
- }
89
-
90
- #[tokio::test]
91
- async fn test_registry_instrumentation() {
92
- use kreuzberg::plugins::registry::DocumentExtractorRegistry;
93
-
94
- let spans = Arc::new(Mutex::new(Vec::new()));
95
- let collector = SpanCollector { spans: spans.clone() };
96
-
97
- let subscriber = tracing_subscriber::registry().with(collector);
98
- let _guard = tracing::subscriber::set_default(subscriber);
99
-
100
- let registry = DocumentExtractorRegistry::new();
101
-
102
- let _ = registry.get("application/pdf");
103
-
104
- let span_names = spans.lock().unwrap();
105
- assert!(
106
- span_names.contains(&"get".to_string()),
107
- "Expected 'get' span from registry"
108
- );
109
- }
110
-
111
- #[cfg(all(feature = "pdf", feature = "office"))]
112
- #[tokio::test]
113
- async fn test_span_hierarchy() {
114
- use kreuzberg::core::config::ExtractionConfig;
115
- use kreuzberg::core::extractor::extract_bytes;
116
-
117
- let spans = Arc::new(Mutex::new(Vec::new()));
118
- let collector = SpanCollector { spans: spans.clone() };
119
-
120
- let subscriber = tracing_subscriber::registry().with(collector);
121
- let _guard = tracing::subscriber::set_default(subscriber);
122
-
123
- let test_content = b"Hello, World!";
124
- let config = ExtractionConfig::default();
125
-
126
- let _ = extract_bytes(test_content, "text/plain", &config).await;
127
-
128
- let span_names = spans.lock().unwrap();
129
- assert!(
130
- span_names.contains(&"extract_bytes".to_string()),
131
- "Expected 'extract_bytes' span"
132
- );
133
- }
134
-
135
- #[test]
136
- fn test_span_collector_creation() {
137
- let spans = Arc::new(Mutex::new(Vec::new()));
138
- let _collector = SpanCollector { spans };
139
- }