kreuzberg 4.0.0.pre.rc.29 → 4.0.0.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (321) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +0 -6
  3. data/.rubocop.yaml +534 -1
  4. data/Gemfile +2 -1
  5. data/Gemfile.lock +28 -116
  6. data/README.md +269 -629
  7. data/Rakefile +0 -9
  8. data/Steepfile +4 -8
  9. data/examples/async_patterns.rb +58 -1
  10. data/ext/kreuzberg_rb/extconf.rb +5 -35
  11. data/ext/kreuzberg_rb/native/Cargo.toml +16 -55
  12. data/ext/kreuzberg_rb/native/build.rs +14 -12
  13. data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
  14. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
  15. data/ext/kreuzberg_rb/native/include/strings.h +2 -2
  16. data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
  17. data/ext/kreuzberg_rb/native/src/lib.rs +34 -897
  18. data/extconf.rb +6 -38
  19. data/kreuzberg.gemspec +20 -114
  20. data/lib/kreuzberg/api_proxy.rb +18 -2
  21. data/lib/kreuzberg/cache_api.rb +0 -22
  22. data/lib/kreuzberg/cli.rb +10 -2
  23. data/lib/kreuzberg/cli_proxy.rb +10 -0
  24. data/lib/kreuzberg/config.rb +22 -274
  25. data/lib/kreuzberg/errors.rb +7 -73
  26. data/lib/kreuzberg/extraction_api.rb +8 -237
  27. data/lib/kreuzberg/mcp_proxy.rb +11 -2
  28. data/lib/kreuzberg/ocr_backend_protocol.rb +73 -0
  29. data/lib/kreuzberg/post_processor_protocol.rb +71 -0
  30. data/lib/kreuzberg/result.rb +33 -151
  31. data/lib/kreuzberg/setup_lib_path.rb +2 -22
  32. data/lib/kreuzberg/validator_protocol.rb +73 -0
  33. data/lib/kreuzberg/version.rb +1 -1
  34. data/lib/kreuzberg.rb +13 -27
  35. data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
  36. data/sig/kreuzberg.rbs +12 -105
  37. data/spec/binding/cache_spec.rb +22 -22
  38. data/spec/binding/cli_proxy_spec.rb +4 -2
  39. data/spec/binding/cli_spec.rb +11 -12
  40. data/spec/binding/config_spec.rb +0 -74
  41. data/spec/binding/config_validation_spec.rb +6 -100
  42. data/spec/binding/error_handling_spec.rb +97 -283
  43. data/spec/binding/plugins/ocr_backend_spec.rb +8 -8
  44. data/spec/binding/plugins/postprocessor_spec.rb +11 -11
  45. data/spec/binding/plugins/validator_spec.rb +13 -12
  46. data/spec/examples.txt +104 -0
  47. data/spec/fixtures/config.toml +1 -0
  48. data/spec/fixtures/config.yaml +1 -0
  49. data/spec/fixtures/invalid_config.toml +1 -0
  50. data/spec/smoke/package_spec.rb +3 -2
  51. data/spec/spec_helper.rb +3 -1
  52. data/vendor/kreuzberg/Cargo.toml +67 -192
  53. data/vendor/kreuzberg/README.md +9 -97
  54. data/vendor/kreuzberg/build.rs +194 -516
  55. data/vendor/kreuzberg/src/api/handlers.rs +9 -130
  56. data/vendor/kreuzberg/src/api/mod.rs +3 -18
  57. data/vendor/kreuzberg/src/api/server.rs +71 -236
  58. data/vendor/kreuzberg/src/api/types.rs +7 -43
  59. data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
  60. data/vendor/kreuzberg/src/cache/mod.rs +3 -27
  61. data/vendor/kreuzberg/src/chunking/mod.rs +79 -1705
  62. data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
  63. data/vendor/kreuzberg/src/core/config.rs +23 -905
  64. data/vendor/kreuzberg/src/core/extractor.rs +106 -403
  65. data/vendor/kreuzberg/src/core/io.rs +2 -4
  66. data/vendor/kreuzberg/src/core/mime.rs +12 -2
  67. data/vendor/kreuzberg/src/core/mod.rs +3 -22
  68. data/vendor/kreuzberg/src/core/pipeline.rs +78 -395
  69. data/vendor/kreuzberg/src/embeddings.rs +21 -169
  70. data/vendor/kreuzberg/src/error.rs +2 -2
  71. data/vendor/kreuzberg/src/extraction/archive.rs +31 -36
  72. data/vendor/kreuzberg/src/extraction/docx.rs +1 -365
  73. data/vendor/kreuzberg/src/extraction/email.rs +11 -12
  74. data/vendor/kreuzberg/src/extraction/excel.rs +129 -138
  75. data/vendor/kreuzberg/src/extraction/html.rs +170 -1447
  76. data/vendor/kreuzberg/src/extraction/image.rs +14 -138
  77. data/vendor/kreuzberg/src/extraction/libreoffice.rs +3 -13
  78. data/vendor/kreuzberg/src/extraction/mod.rs +5 -21
  79. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
  80. data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
  81. data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
  82. data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
  83. data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
  84. data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
  85. data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
  86. data/vendor/kreuzberg/src/extraction/pptx.rs +94 -196
  87. data/vendor/kreuzberg/src/extraction/structured.rs +4 -5
  88. data/vendor/kreuzberg/src/extraction/table.rs +1 -2
  89. data/vendor/kreuzberg/src/extraction/text.rs +10 -18
  90. data/vendor/kreuzberg/src/extractors/archive.rs +0 -22
  91. data/vendor/kreuzberg/src/extractors/docx.rs +148 -69
  92. data/vendor/kreuzberg/src/extractors/email.rs +9 -37
  93. data/vendor/kreuzberg/src/extractors/excel.rs +40 -81
  94. data/vendor/kreuzberg/src/extractors/html.rs +173 -182
  95. data/vendor/kreuzberg/src/extractors/image.rs +8 -32
  96. data/vendor/kreuzberg/src/extractors/mod.rs +10 -171
  97. data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
  98. data/vendor/kreuzberg/src/extractors/pdf.rs +64 -329
  99. data/vendor/kreuzberg/src/extractors/pptx.rs +34 -79
  100. data/vendor/kreuzberg/src/extractors/structured.rs +0 -16
  101. data/vendor/kreuzberg/src/extractors/text.rs +7 -30
  102. data/vendor/kreuzberg/src/extractors/xml.rs +8 -27
  103. data/vendor/kreuzberg/src/keywords/processor.rs +1 -9
  104. data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
  105. data/vendor/kreuzberg/src/language_detection/mod.rs +51 -94
  106. data/vendor/kreuzberg/src/lib.rs +5 -17
  107. data/vendor/kreuzberg/src/mcp/mod.rs +1 -4
  108. data/vendor/kreuzberg/src/mcp/server.rs +21 -145
  109. data/vendor/kreuzberg/src/ocr/mod.rs +0 -2
  110. data/vendor/kreuzberg/src/ocr/processor.rs +8 -19
  111. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +0 -2
  112. data/vendor/kreuzberg/src/pdf/error.rs +1 -93
  113. data/vendor/kreuzberg/src/pdf/metadata.rs +100 -263
  114. data/vendor/kreuzberg/src/pdf/mod.rs +2 -33
  115. data/vendor/kreuzberg/src/pdf/rendering.rs +12 -12
  116. data/vendor/kreuzberg/src/pdf/table.rs +64 -61
  117. data/vendor/kreuzberg/src/pdf/text.rs +24 -416
  118. data/vendor/kreuzberg/src/plugins/extractor.rs +8 -40
  119. data/vendor/kreuzberg/src/plugins/mod.rs +0 -3
  120. data/vendor/kreuzberg/src/plugins/ocr.rs +14 -22
  121. data/vendor/kreuzberg/src/plugins/processor.rs +1 -10
  122. data/vendor/kreuzberg/src/plugins/registry.rs +0 -15
  123. data/vendor/kreuzberg/src/plugins/validator.rs +8 -20
  124. data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
  125. data/vendor/kreuzberg/src/text/mod.rs +0 -8
  126. data/vendor/kreuzberg/src/text/quality.rs +15 -28
  127. data/vendor/kreuzberg/src/text/string_utils.rs +10 -22
  128. data/vendor/kreuzberg/src/text/token_reduction/core.rs +50 -86
  129. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +16 -37
  130. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +1 -2
  131. data/vendor/kreuzberg/src/types.rs +67 -907
  132. data/vendor/kreuzberg/src/utils/mod.rs +0 -14
  133. data/vendor/kreuzberg/src/utils/quality.rs +3 -12
  134. data/vendor/kreuzberg/tests/api_tests.rs +0 -506
  135. data/vendor/kreuzberg/tests/archive_integration.rs +0 -2
  136. data/vendor/kreuzberg/tests/batch_orchestration.rs +12 -57
  137. data/vendor/kreuzberg/tests/batch_processing.rs +8 -32
  138. data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
  139. data/vendor/kreuzberg/tests/concurrency_stress.rs +8 -40
  140. data/vendor/kreuzberg/tests/config_features.rs +1 -33
  141. data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -16
  142. data/vendor/kreuzberg/tests/core_integration.rs +9 -35
  143. data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
  144. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
  145. data/vendor/kreuzberg/tests/email_integration.rs +1 -3
  146. data/vendor/kreuzberg/tests/error_handling.rs +34 -43
  147. data/vendor/kreuzberg/tests/format_integration.rs +1 -7
  148. data/vendor/kreuzberg/tests/helpers/mod.rs +0 -60
  149. data/vendor/kreuzberg/tests/image_integration.rs +0 -2
  150. data/vendor/kreuzberg/tests/mime_detection.rs +16 -17
  151. data/vendor/kreuzberg/tests/ocr_configuration.rs +0 -4
  152. data/vendor/kreuzberg/tests/ocr_errors.rs +0 -22
  153. data/vendor/kreuzberg/tests/ocr_quality.rs +0 -2
  154. data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
  155. data/vendor/kreuzberg/tests/pdf_integration.rs +0 -2
  156. data/vendor/kreuzberg/tests/pipeline_integration.rs +2 -36
  157. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +0 -5
  158. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -17
  159. data/vendor/kreuzberg/tests/plugin_system.rs +0 -6
  160. data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -2
  161. data/vendor/kreuzberg/tests/security_validation.rs +1 -13
  162. data/vendor/kreuzberg/tests/test_fastembed.rs +23 -45
  163. metadata +25 -171
  164. data/.rubocop.yml +0 -543
  165. data/ext/kreuzberg_rb/native/.cargo/config.toml +0 -23
  166. data/ext/kreuzberg_rb/native/Cargo.lock +0 -7619
  167. data/lib/kreuzberg/error_context.rb +0 -136
  168. data/lib/kreuzberg/types.rb +0 -170
  169. data/lib/libpdfium.so +0 -0
  170. data/spec/binding/async_operations_spec.rb +0 -473
  171. data/spec/binding/batch_operations_spec.rb +0 -595
  172. data/spec/binding/batch_spec.rb +0 -359
  173. data/spec/binding/config_result_spec.rb +0 -377
  174. data/spec/binding/embeddings_spec.rb +0 -816
  175. data/spec/binding/error_recovery_spec.rb +0 -488
  176. data/spec/binding/font_config_spec.rb +0 -220
  177. data/spec/binding/images_spec.rb +0 -738
  178. data/spec/binding/keywords_extraction_spec.rb +0 -600
  179. data/spec/binding/metadata_types_spec.rb +0 -1228
  180. data/spec/binding/pages_extraction_spec.rb +0 -471
  181. data/spec/binding/tables_spec.rb +0 -641
  182. data/spec/unit/config/chunking_config_spec.rb +0 -213
  183. data/spec/unit/config/embedding_config_spec.rb +0 -343
  184. data/spec/unit/config/extraction_config_spec.rb +0 -438
  185. data/spec/unit/config/font_config_spec.rb +0 -285
  186. data/spec/unit/config/hierarchy_config_spec.rb +0 -314
  187. data/spec/unit/config/image_extraction_config_spec.rb +0 -209
  188. data/spec/unit/config/image_preprocessing_config_spec.rb +0 -249
  189. data/spec/unit/config/keyword_config_spec.rb +0 -229
  190. data/spec/unit/config/language_detection_config_spec.rb +0 -258
  191. data/spec/unit/config/ocr_config_spec.rb +0 -171
  192. data/spec/unit/config/page_config_spec.rb +0 -221
  193. data/spec/unit/config/pdf_config_spec.rb +0 -267
  194. data/spec/unit/config/postprocessor_config_spec.rb +0 -290
  195. data/spec/unit/config/tesseract_config_spec.rb +0 -181
  196. data/spec/unit/config/token_reduction_config_spec.rb +0 -251
  197. data/test/metadata_types_test.rb +0 -959
  198. data/vendor/Cargo.toml +0 -61
  199. data/vendor/kreuzberg/examples/bench_fixes.rs +0 -71
  200. data/vendor/kreuzberg/examples/test_pdfium_fork.rs +0 -62
  201. data/vendor/kreuzberg/src/chunking/processor.rs +0 -219
  202. data/vendor/kreuzberg/src/core/batch_optimizations.rs +0 -385
  203. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  204. data/vendor/kreuzberg/src/core/formats.rs +0 -235
  205. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  206. data/vendor/kreuzberg/src/extraction/capacity.rs +0 -263
  207. data/vendor/kreuzberg/src/extraction/markdown.rs +0 -216
  208. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -284
  209. data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -470
  210. data/vendor/kreuzberg/src/extractors/docbook.rs +0 -504
  211. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  212. data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -492
  213. data/vendor/kreuzberg/src/extractors/jats.rs +0 -1054
  214. data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -368
  215. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  216. data/vendor/kreuzberg/src/extractors/markdown.rs +0 -701
  217. data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
  218. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  219. data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -529
  220. data/vendor/kreuzberg/src/extractors/rst.rs +0 -577
  221. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  222. data/vendor/kreuzberg/src/extractors/security.rs +0 -484
  223. data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
  224. data/vendor/kreuzberg/src/extractors/typst.rs +0 -651
  225. data/vendor/kreuzberg/src/language_detection/processor.rs +0 -218
  226. data/vendor/kreuzberg/src/ocr/language_registry.rs +0 -520
  227. data/vendor/kreuzberg/src/panic_context.rs +0 -154
  228. data/vendor/kreuzberg/src/pdf/bindings.rs +0 -306
  229. data/vendor/kreuzberg/src/pdf/bundled.rs +0 -408
  230. data/vendor/kreuzberg/src/pdf/fonts.rs +0 -358
  231. data/vendor/kreuzberg/src/pdf/hierarchy.rs +0 -903
  232. data/vendor/kreuzberg/src/text/quality_processor.rs +0 -231
  233. data/vendor/kreuzberg/src/text/utf8_validation.rs +0 -193
  234. data/vendor/kreuzberg/src/utils/pool.rs +0 -503
  235. data/vendor/kreuzberg/src/utils/pool_sizing.rs +0 -364
  236. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -761
  237. data/vendor/kreuzberg/tests/api_embed.rs +0 -360
  238. data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
  239. data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +0 -471
  240. data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +0 -289
  241. data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +0 -154
  242. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
  243. data/vendor/kreuzberg/tests/config_integration_test.rs +0 -753
  244. data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +0 -294
  245. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -500
  246. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
  247. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
  248. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
  249. data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
  250. data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
  251. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
  252. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
  253. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
  254. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
  255. data/vendor/kreuzberg/tests/ocr_language_registry.rs +0 -191
  256. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -674
  257. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
  258. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
  259. data/vendor/kreuzberg/tests/page_markers.rs +0 -297
  260. data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +0 -301
  261. data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +0 -589
  262. data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +0 -301
  263. data/vendor/kreuzberg/tests/pdf_text_merging.rs +0 -475
  264. data/vendor/kreuzberg/tests/pdfium_linking.rs +0 -340
  265. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -694
  266. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -775
  267. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1260
  268. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -648
  269. data/vendor/kreuzberg-ffi/Cargo.toml +0 -67
  270. data/vendor/kreuzberg-ffi/README.md +0 -851
  271. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +0 -227
  272. data/vendor/kreuzberg-ffi/build.rs +0 -168
  273. data/vendor/kreuzberg-ffi/cbindgen.toml +0 -37
  274. data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +0 -12
  275. data/vendor/kreuzberg-ffi/kreuzberg.h +0 -3012
  276. data/vendor/kreuzberg-ffi/src/batch_streaming.rs +0 -588
  277. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
  278. data/vendor/kreuzberg-ffi/src/error.rs +0 -901
  279. data/vendor/kreuzberg-ffi/src/extraction.rs +0 -555
  280. data/vendor/kreuzberg-ffi/src/helpers.rs +0 -879
  281. data/vendor/kreuzberg-ffi/src/lib.rs +0 -977
  282. data/vendor/kreuzberg-ffi/src/memory.rs +0 -493
  283. data/vendor/kreuzberg-ffi/src/mime.rs +0 -329
  284. data/vendor/kreuzberg-ffi/src/panic_shield.rs +0 -265
  285. data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +0 -442
  286. data/vendor/kreuzberg-ffi/src/plugins/mod.rs +0 -14
  287. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +0 -628
  288. data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +0 -438
  289. data/vendor/kreuzberg-ffi/src/plugins/validator.rs +0 -329
  290. data/vendor/kreuzberg-ffi/src/result.rs +0 -510
  291. data/vendor/kreuzberg-ffi/src/result_pool.rs +0 -639
  292. data/vendor/kreuzberg-ffi/src/result_view.rs +0 -773
  293. data/vendor/kreuzberg-ffi/src/string_intern.rs +0 -568
  294. data/vendor/kreuzberg-ffi/src/types.rs +0 -363
  295. data/vendor/kreuzberg-ffi/src/util.rs +0 -210
  296. data/vendor/kreuzberg-ffi/src/validation.rs +0 -848
  297. data/vendor/kreuzberg-ffi/tests.disabled/README.md +0 -48
  298. data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +0 -299
  299. data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +0 -346
  300. data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +0 -232
  301. data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +0 -470
  302. data/vendor/kreuzberg-tesseract/.commitlintrc.json +0 -13
  303. data/vendor/kreuzberg-tesseract/.crate-ignore +0 -2
  304. data/vendor/kreuzberg-tesseract/Cargo.lock +0 -2933
  305. data/vendor/kreuzberg-tesseract/Cargo.toml +0 -57
  306. data/vendor/kreuzberg-tesseract/LICENSE +0 -22
  307. data/vendor/kreuzberg-tesseract/README.md +0 -399
  308. data/vendor/kreuzberg-tesseract/build.rs +0 -1127
  309. data/vendor/kreuzberg-tesseract/patches/README.md +0 -71
  310. data/vendor/kreuzberg-tesseract/patches/tesseract.diff +0 -199
  311. data/vendor/kreuzberg-tesseract/src/api.rs +0 -1371
  312. data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +0 -77
  313. data/vendor/kreuzberg-tesseract/src/enums.rs +0 -297
  314. data/vendor/kreuzberg-tesseract/src/error.rs +0 -81
  315. data/vendor/kreuzberg-tesseract/src/lib.rs +0 -145
  316. data/vendor/kreuzberg-tesseract/src/monitor.rs +0 -57
  317. data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +0 -197
  318. data/vendor/kreuzberg-tesseract/src/page_iterator.rs +0 -253
  319. data/vendor/kreuzberg-tesseract/src/result_iterator.rs +0 -286
  320. data/vendor/kreuzberg-tesseract/src/result_renderer.rs +0 -183
  321. data/vendor/kreuzberg-tesseract/tests/integration_test.rs +0 -211
@@ -0,0 +1,503 @@
1
+ //! Pandoc integration tests.
2
+ //!
3
+ //! Tests for Pandoc-based document extraction (RST, LaTeX, ODT, RTF, EPUB, Org mode, Typst, CommonMark).
4
+ //! Validates that Pandoc integration works when available and degrades gracefully when missing.
5
+ //!
6
+ //! Note: These tests require the `office` feature to be enabled.
7
+
8
+ #![cfg(feature = "office")]
9
+
10
+ use kreuzberg::core::config::ExtractionConfig;
11
+ use kreuzberg::core::extractor::extract_bytes;
12
+ use kreuzberg::extraction::pandoc::validate_pandoc_version;
13
+
14
+ mod helpers;
15
+
16
+ /// Check if Pandoc is installed and available.
17
+ async fn is_pandoc_available() -> bool {
18
+ validate_pandoc_version().await.is_ok()
19
+ }
20
+
21
+ /// Test reStructuredText (RST) extraction.
22
+ #[tokio::test]
23
+ async fn test_rst_extraction() {
24
+ if !is_pandoc_available().await {
25
+ println!("Skipping test: Pandoc not installed");
26
+ return;
27
+ }
28
+
29
+ let config = ExtractionConfig::default();
30
+
31
+ let rst_content = b"Title
32
+ =====
33
+
34
+ This is a paragraph in reStructuredText.
35
+
36
+ Section Heading
37
+ ---------------
38
+
39
+ - Bullet point 1
40
+ - Bullet point 2
41
+ - Bullet point 3
42
+
43
+ **Bold text** and *italic text*.";
44
+
45
+ let result = extract_bytes(rst_content, "text/x-rst", &config).await;
46
+
47
+ assert!(result.is_ok(), "RST extraction should succeed");
48
+ let extraction = result.unwrap();
49
+
50
+ assert_eq!(extraction.mime_type, "text/x-rst");
51
+
52
+ assert!(!extraction.content.is_empty(), "Content should not be empty");
53
+ assert!(
54
+ extraction.chunks.is_none(),
55
+ "Chunks should be None without chunking config"
56
+ );
57
+ assert!(
58
+ extraction.detected_languages.is_none(),
59
+ "Language detection not enabled"
60
+ );
61
+ assert!(extraction.tables.is_empty(), "RST should not extract tables");
62
+
63
+ assert!(extraction.content.contains("Title"), "Should extract title");
64
+ assert!(
65
+ extraction.content.contains("paragraph"),
66
+ "Should extract paragraph text"
67
+ );
68
+ assert!(
69
+ extraction.content.contains("Section Heading"),
70
+ "Should extract section heading"
71
+ );
72
+
73
+ assert!(
74
+ extraction.content.contains("Bullet point 1") || extraction.content.contains("point 1"),
75
+ "Should extract bullet points"
76
+ );
77
+
78
+ assert!(
79
+ extraction.content.contains("Bold text") || extraction.content.contains("italic text"),
80
+ "Should extract formatted text content"
81
+ );
82
+
83
+ let content_lower = extraction.content.to_lowercase();
84
+ assert!(content_lower.contains("title"), "Should extract title");
85
+ assert!(content_lower.contains("section"), "Should extract section heading");
86
+ assert!(content_lower.contains("bullet"), "Should extract bullet list");
87
+ }
88
+
89
+ /// Test LaTeX extraction.
90
+ #[tokio::test]
91
+ async fn test_latex_extraction() {
92
+ if !is_pandoc_available().await {
93
+ println!("Skipping test: Pandoc not installed");
94
+ return;
95
+ }
96
+
97
+ let config = ExtractionConfig::default();
98
+
99
+ let latex_content = b"\\documentclass{article}
100
+ \\begin{document}
101
+
102
+ \\title{Test Document}
103
+ \\author{Test Author}
104
+ \\maketitle
105
+
106
+ \\section{Introduction}
107
+
108
+ This is a test LaTeX document with \\textbf{bold} and \\textit{italic} text.
109
+
110
+ \\subsection{Subsection}
111
+
112
+ Some content in a subsection.
113
+
114
+ \\end{document}";
115
+
116
+ let result = extract_bytes(latex_content, "application/x-latex", &config).await;
117
+
118
+ assert!(result.is_ok(), "LaTeX extraction should succeed");
119
+ let extraction = result.unwrap();
120
+
121
+ assert_eq!(extraction.mime_type, "application/x-latex");
122
+
123
+ assert!(!extraction.content.is_empty(), "Content should not be empty");
124
+ assert!(
125
+ extraction.chunks.is_none(),
126
+ "Chunks should be None without chunking config"
127
+ );
128
+ assert!(
129
+ extraction.detected_languages.is_none(),
130
+ "Language detection not enabled"
131
+ );
132
+ assert!(
133
+ extraction.tables.is_empty(),
134
+ "LaTeX should not extract tables in this test"
135
+ );
136
+
137
+ assert!(
138
+ extraction.content.contains("Test Document"),
139
+ "Should extract document title"
140
+ );
141
+
142
+ assert!(
143
+ extraction.content.contains("Introduction"),
144
+ "Should extract section heading"
145
+ );
146
+ assert!(
147
+ extraction.content.contains("Subsection"),
148
+ "Should extract subsection heading"
149
+ );
150
+
151
+ assert!(
152
+ extraction.content.contains("test LaTeX document"),
153
+ "Should extract paragraph text"
154
+ );
155
+
156
+ assert!(
157
+ !extraction.content.contains("\\textbf") && !extraction.content.contains("\\section"),
158
+ "LaTeX commands should be stripped, not included in output"
159
+ );
160
+ }
161
+
162
+ /// Test OpenDocument Text (ODT) extraction.
163
+ #[tokio::test]
164
+ async fn test_odt_extraction() {
165
+ if !is_pandoc_available().await {
166
+ println!("Skipping test: Pandoc not installed");
167
+ return;
168
+ }
169
+
170
+ let config = ExtractionConfig::default();
171
+
172
+ let invalid_odt = b"This is not a valid ODT file";
173
+
174
+ let result = extract_bytes(invalid_odt, "application/vnd.oasis.opendocument.text", &config).await;
175
+
176
+ assert!(result.is_err(), "Invalid ODT should fail gracefully");
177
+
178
+ let error = result.unwrap_err();
179
+ match error {
180
+ kreuzberg::KreuzbergError::Parsing { .. } => {}
181
+ kreuzberg::KreuzbergError::Io(_) => {}
182
+ other => panic!("Expected Parsing or Io error, got: {:?}", other),
183
+ }
184
+ }
185
+
186
+ /// Test Rich Text Format (RTF) extraction.
187
+ #[tokio::test]
188
+ async fn test_rtf_extraction() {
189
+ if !is_pandoc_available().await {
190
+ println!("Skipping test: Pandoc not installed");
191
+ return;
192
+ }
193
+
194
+ let config = ExtractionConfig::default();
195
+
196
+ let rtf_content = b"{\\rtf1\\ansi\\deff0
197
+ {\\fonttbl{\\f0 Times New Roman;}}
198
+ \\f0\\fs24 This is a test RTF document.\\par
199
+ \\b Bold text\\b0 and \\i italic text\\i0.\\par
200
+ }";
201
+
202
+ let result = extract_bytes(rtf_content, "application/rtf", &config).await;
203
+
204
+ assert!(result.is_ok(), "RTF extraction should succeed");
205
+ let extraction = result.unwrap();
206
+
207
+ assert_eq!(extraction.mime_type, "application/rtf");
208
+
209
+ assert!(!extraction.content.is_empty(), "Content should not be empty");
210
+ assert!(
211
+ extraction.chunks.is_none(),
212
+ "Chunks should be None without chunking config"
213
+ );
214
+ assert!(
215
+ extraction.detected_languages.is_none(),
216
+ "Language detection not enabled"
217
+ );
218
+ assert!(
219
+ extraction.tables.is_empty(),
220
+ "RTF should not extract tables in this test"
221
+ );
222
+
223
+ assert!(
224
+ extraction.content.contains("test RTF document"),
225
+ "Should extract main paragraph"
226
+ );
227
+ assert!(
228
+ extraction.content.contains("Bold text") || extraction.content.contains("Bold"),
229
+ "Should extract bold text"
230
+ );
231
+ assert!(
232
+ extraction.content.contains("italic text") || extraction.content.contains("italic"),
233
+ "Should extract italic text"
234
+ );
235
+
236
+ assert!(
237
+ !extraction.content.contains("\\rtf") && !extraction.content.contains("\\par"),
238
+ "RTF control codes should be stripped from output"
239
+ );
240
+ }
241
+
242
+ /// Test graceful degradation when Pandoc is not installed.
243
+ #[tokio::test]
244
+ async fn test_pandoc_not_installed() {
245
+ let validation_result = validate_pandoc_version().await;
246
+
247
+ if validation_result.is_ok() {
248
+ println!("Pandoc is installed - skipping 'not installed' test");
249
+ return;
250
+ }
251
+
252
+ assert!(
253
+ validation_result.is_err(),
254
+ "Should return error when Pandoc not installed"
255
+ );
256
+ }
257
+
258
+ /// Test Pandoc conversion error handling.
259
+ #[tokio::test]
260
+ async fn test_pandoc_conversion_error() {
261
+ if !is_pandoc_available().await {
262
+ println!("Skipping test: Pandoc not installed");
263
+ return;
264
+ }
265
+
266
+ let config = ExtractionConfig::default();
267
+
268
+ let malformed_rst = b"===\nThis is malformed\n===\n===";
269
+
270
+ let result = extract_bytes(malformed_rst, "text/x-rst", &config).await;
271
+
272
+ assert!(
273
+ result.is_ok() || result.is_err(),
274
+ "Should handle malformed content gracefully"
275
+ );
276
+ }
277
+
278
+ /// Test EPUB extraction (ebook format).
279
+ #[tokio::test]
280
+ async fn test_epub_extraction() {
281
+ if !is_pandoc_available().await {
282
+ println!("Skipping test: Pandoc not installed");
283
+ return;
284
+ }
285
+
286
+ let config = ExtractionConfig::default();
287
+
288
+ let invalid_epub = b"This is not a valid EPUB file";
289
+
290
+ let result = extract_bytes(invalid_epub, "application/epub+zip", &config).await;
291
+
292
+ assert!(result.is_err(), "Invalid EPUB should fail gracefully");
293
+
294
+ let error = result.unwrap_err();
295
+ match error {
296
+ kreuzberg::KreuzbergError::Parsing { .. } => {}
297
+ kreuzberg::KreuzbergError::Io(_) => {}
298
+ other => panic!("Expected Parsing or Io error for invalid EPUB, got: {:?}", other),
299
+ }
300
+ }
301
+
302
+ /// Test Org mode extraction.
303
+ #[tokio::test]
304
+ async fn test_org_mode_extraction() {
305
+ if !is_pandoc_available().await {
306
+ println!("Skipping test: Pandoc not installed");
307
+ return;
308
+ }
309
+
310
+ let config = ExtractionConfig::default();
311
+
312
+ let org_content = b"* Top Level Heading
313
+
314
+ This is a paragraph in Org mode.
315
+
316
+ ** Second Level Heading
317
+
318
+ - Item 1
319
+ - Item 2
320
+ - Item 3
321
+
322
+ *bold text* and /italic text/";
323
+
324
+ let result = extract_bytes(org_content, "text/x-org", &config).await;
325
+
326
+ assert!(result.is_ok(), "Org mode extraction should succeed");
327
+ let extraction = result.unwrap();
328
+
329
+ assert!(!extraction.content.is_empty(), "Content should not be empty");
330
+ assert!(
331
+ extraction.chunks.is_none(),
332
+ "Chunks should be None without chunking config"
333
+ );
334
+ assert!(
335
+ extraction.detected_languages.is_none(),
336
+ "Language detection not enabled"
337
+ );
338
+ assert!(
339
+ extraction.tables.is_empty(),
340
+ "Org mode should not extract tables in this test"
341
+ );
342
+
343
+ assert!(
344
+ extraction.content.contains("Top Level") || extraction.content.contains("paragraph"),
345
+ "Org mode content should be extracted"
346
+ );
347
+
348
+ assert!(
349
+ extraction.content.contains("paragraph") || extraction.content.contains("Heading"),
350
+ "Text content should be present"
351
+ );
352
+ }
353
+
354
+ /// Test Typst extraction (new document format).
355
+ #[tokio::test]
356
+ async fn test_typst_extraction() {
357
+ if !is_pandoc_available().await {
358
+ println!("Skipping test: Pandoc not installed");
359
+ return;
360
+ }
361
+
362
+ let config = ExtractionConfig::default();
363
+
364
+ let typst_content = b"= Heading
365
+
366
+ This is a paragraph in Typst.
367
+
368
+ == Subheading
369
+
370
+ #strong[Bold text] and #emph[italic text].";
371
+
372
+ let result = extract_bytes(typst_content, "application/x-typst", &config).await;
373
+
374
+ assert!(
375
+ result.is_ok() || result.is_err(),
376
+ "Should handle Typst gracefully (may not be supported in all Pandoc versions)"
377
+ );
378
+ }
379
+
380
+ /// Test CommonMark extraction.
381
+ #[tokio::test]
382
+ async fn test_commonmark_extraction() {
383
+ if !is_pandoc_available().await {
384
+ println!("Skipping test: Pandoc not installed");
385
+ return;
386
+ }
387
+
388
+ let config = ExtractionConfig::default();
389
+
390
+ let commonmark_content = b"# Heading
391
+
392
+ This is a paragraph in CommonMark.
393
+
394
+ ## Subheading
395
+
396
+ - List item 1
397
+ - List item 2
398
+
399
+ **Bold** and *italic* text.";
400
+
401
+ let result = extract_bytes(commonmark_content, "text/x-commonmark", &config).await;
402
+
403
+ assert!(result.is_ok(), "CommonMark extraction should succeed");
404
+ let extraction = result.unwrap();
405
+
406
+ assert!(!extraction.content.is_empty(), "Content should not be empty");
407
+ assert!(
408
+ extraction.chunks.is_none(),
409
+ "Chunks should be None without chunking config"
410
+ );
411
+ assert!(
412
+ extraction.detected_languages.is_none(),
413
+ "Language detection not enabled"
414
+ );
415
+ assert!(
416
+ extraction.tables.is_empty(),
417
+ "CommonMark should not extract tables in this test"
418
+ );
419
+
420
+ assert!(
421
+ extraction.content.contains("Heading") || extraction.content.contains("paragraph"),
422
+ "CommonMark content should be extracted"
423
+ );
424
+
425
+ let content_lower = extraction.content.to_lowercase();
426
+ assert!(
427
+ content_lower.contains("heading") || content_lower.contains("paragraph"),
428
+ "Should extract text"
429
+ );
430
+ assert!(
431
+ content_lower.contains("list") || content_lower.contains("item"),
432
+ "Should extract list items"
433
+ );
434
+ }
435
+
436
+ /// Test empty content.
437
+ #[tokio::test]
438
+ async fn test_pandoc_empty_content() {
439
+ if !is_pandoc_available().await {
440
+ println!("Skipping test: Pandoc not installed");
441
+ return;
442
+ }
443
+
444
+ let config = ExtractionConfig::default();
445
+
446
+ let empty_rst = b"";
447
+
448
+ let result = extract_bytes(empty_rst, "text/x-rst", &config).await;
449
+
450
+ if let Ok(extraction) = result {
451
+ assert!(
452
+ extraction.content.is_empty() || extraction.content.trim().is_empty(),
453
+ "Empty input should produce empty or minimal output"
454
+ );
455
+ }
456
+ }
457
+
458
+ /// Test Unicode content in Pandoc formats.
459
+ #[tokio::test]
460
+ async fn test_pandoc_unicode_content() {
461
+ if !is_pandoc_available().await {
462
+ println!("Skipping test: Pandoc not installed");
463
+ return;
464
+ }
465
+
466
+ let config = ExtractionConfig::default();
467
+
468
+ let unicode_rst = "Title with Unicode
469
+ ==================
470
+
471
+ This document contains Unicode: 你好世界 🌍 café
472
+
473
+ Section
474
+ -------
475
+
476
+ Arabic: مرحبا
477
+ Emoji: 🎉 ✅ 🚀"
478
+ .as_bytes();
479
+
480
+ let result = extract_bytes(unicode_rst, "text/x-rst", &config).await;
481
+
482
+ assert!(result.is_ok(), "Unicode content should be handled");
483
+ let extraction = result.unwrap();
484
+
485
+ assert!(!extraction.content.is_empty(), "Content should be extracted");
486
+ assert!(
487
+ extraction.chunks.is_none(),
488
+ "Chunks should be None without chunking config"
489
+ );
490
+ assert!(
491
+ extraction.detected_languages.is_none(),
492
+ "Language detection not enabled"
493
+ );
494
+ assert!(
495
+ extraction.tables.is_empty(),
496
+ "RST should not extract tables in this test"
497
+ );
498
+
499
+ assert!(
500
+ extraction.content.len() > 20,
501
+ "Should have substantial extracted content"
502
+ );
503
+ }
@@ -4,8 +4,6 @@
4
4
  //! multi-language E2E generator. This module keeps only the cases that
5
5
  //! exercise Rust-specific failure handling or error propagation.
6
6
 
7
- #![cfg(feature = "pdf")]
8
-
9
7
  mod helpers;
10
8
 
11
9
  use helpers::*;