kreuzberg 4.0.0.pre.rc.29 → 4.0.0.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (321) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +0 -6
  3. data/.rubocop.yaml +534 -1
  4. data/Gemfile +2 -1
  5. data/Gemfile.lock +28 -116
  6. data/README.md +269 -629
  7. data/Rakefile +0 -9
  8. data/Steepfile +4 -8
  9. data/examples/async_patterns.rb +58 -1
  10. data/ext/kreuzberg_rb/extconf.rb +5 -35
  11. data/ext/kreuzberg_rb/native/Cargo.toml +16 -55
  12. data/ext/kreuzberg_rb/native/build.rs +14 -12
  13. data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
  14. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
  15. data/ext/kreuzberg_rb/native/include/strings.h +2 -2
  16. data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
  17. data/ext/kreuzberg_rb/native/src/lib.rs +34 -897
  18. data/extconf.rb +6 -38
  19. data/kreuzberg.gemspec +20 -114
  20. data/lib/kreuzberg/api_proxy.rb +18 -2
  21. data/lib/kreuzberg/cache_api.rb +0 -22
  22. data/lib/kreuzberg/cli.rb +10 -2
  23. data/lib/kreuzberg/cli_proxy.rb +10 -0
  24. data/lib/kreuzberg/config.rb +22 -274
  25. data/lib/kreuzberg/errors.rb +7 -73
  26. data/lib/kreuzberg/extraction_api.rb +8 -237
  27. data/lib/kreuzberg/mcp_proxy.rb +11 -2
  28. data/lib/kreuzberg/ocr_backend_protocol.rb +73 -0
  29. data/lib/kreuzberg/post_processor_protocol.rb +71 -0
  30. data/lib/kreuzberg/result.rb +33 -151
  31. data/lib/kreuzberg/setup_lib_path.rb +2 -22
  32. data/lib/kreuzberg/validator_protocol.rb +73 -0
  33. data/lib/kreuzberg/version.rb +1 -1
  34. data/lib/kreuzberg.rb +13 -27
  35. data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
  36. data/sig/kreuzberg.rbs +12 -105
  37. data/spec/binding/cache_spec.rb +22 -22
  38. data/spec/binding/cli_proxy_spec.rb +4 -2
  39. data/spec/binding/cli_spec.rb +11 -12
  40. data/spec/binding/config_spec.rb +0 -74
  41. data/spec/binding/config_validation_spec.rb +6 -100
  42. data/spec/binding/error_handling_spec.rb +97 -283
  43. data/spec/binding/plugins/ocr_backend_spec.rb +8 -8
  44. data/spec/binding/plugins/postprocessor_spec.rb +11 -11
  45. data/spec/binding/plugins/validator_spec.rb +13 -12
  46. data/spec/examples.txt +104 -0
  47. data/spec/fixtures/config.toml +1 -0
  48. data/spec/fixtures/config.yaml +1 -0
  49. data/spec/fixtures/invalid_config.toml +1 -0
  50. data/spec/smoke/package_spec.rb +3 -2
  51. data/spec/spec_helper.rb +3 -1
  52. data/vendor/kreuzberg/Cargo.toml +67 -192
  53. data/vendor/kreuzberg/README.md +9 -97
  54. data/vendor/kreuzberg/build.rs +194 -516
  55. data/vendor/kreuzberg/src/api/handlers.rs +9 -130
  56. data/vendor/kreuzberg/src/api/mod.rs +3 -18
  57. data/vendor/kreuzberg/src/api/server.rs +71 -236
  58. data/vendor/kreuzberg/src/api/types.rs +7 -43
  59. data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
  60. data/vendor/kreuzberg/src/cache/mod.rs +3 -27
  61. data/vendor/kreuzberg/src/chunking/mod.rs +79 -1705
  62. data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
  63. data/vendor/kreuzberg/src/core/config.rs +23 -905
  64. data/vendor/kreuzberg/src/core/extractor.rs +106 -403
  65. data/vendor/kreuzberg/src/core/io.rs +2 -4
  66. data/vendor/kreuzberg/src/core/mime.rs +12 -2
  67. data/vendor/kreuzberg/src/core/mod.rs +3 -22
  68. data/vendor/kreuzberg/src/core/pipeline.rs +78 -395
  69. data/vendor/kreuzberg/src/embeddings.rs +21 -169
  70. data/vendor/kreuzberg/src/error.rs +2 -2
  71. data/vendor/kreuzberg/src/extraction/archive.rs +31 -36
  72. data/vendor/kreuzberg/src/extraction/docx.rs +1 -365
  73. data/vendor/kreuzberg/src/extraction/email.rs +11 -12
  74. data/vendor/kreuzberg/src/extraction/excel.rs +129 -138
  75. data/vendor/kreuzberg/src/extraction/html.rs +170 -1447
  76. data/vendor/kreuzberg/src/extraction/image.rs +14 -138
  77. data/vendor/kreuzberg/src/extraction/libreoffice.rs +3 -13
  78. data/vendor/kreuzberg/src/extraction/mod.rs +5 -21
  79. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
  80. data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
  81. data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
  82. data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
  83. data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
  84. data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
  85. data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
  86. data/vendor/kreuzberg/src/extraction/pptx.rs +94 -196
  87. data/vendor/kreuzberg/src/extraction/structured.rs +4 -5
  88. data/vendor/kreuzberg/src/extraction/table.rs +1 -2
  89. data/vendor/kreuzberg/src/extraction/text.rs +10 -18
  90. data/vendor/kreuzberg/src/extractors/archive.rs +0 -22
  91. data/vendor/kreuzberg/src/extractors/docx.rs +148 -69
  92. data/vendor/kreuzberg/src/extractors/email.rs +9 -37
  93. data/vendor/kreuzberg/src/extractors/excel.rs +40 -81
  94. data/vendor/kreuzberg/src/extractors/html.rs +173 -182
  95. data/vendor/kreuzberg/src/extractors/image.rs +8 -32
  96. data/vendor/kreuzberg/src/extractors/mod.rs +10 -171
  97. data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
  98. data/vendor/kreuzberg/src/extractors/pdf.rs +64 -329
  99. data/vendor/kreuzberg/src/extractors/pptx.rs +34 -79
  100. data/vendor/kreuzberg/src/extractors/structured.rs +0 -16
  101. data/vendor/kreuzberg/src/extractors/text.rs +7 -30
  102. data/vendor/kreuzberg/src/extractors/xml.rs +8 -27
  103. data/vendor/kreuzberg/src/keywords/processor.rs +1 -9
  104. data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
  105. data/vendor/kreuzberg/src/language_detection/mod.rs +51 -94
  106. data/vendor/kreuzberg/src/lib.rs +5 -17
  107. data/vendor/kreuzberg/src/mcp/mod.rs +1 -4
  108. data/vendor/kreuzberg/src/mcp/server.rs +21 -145
  109. data/vendor/kreuzberg/src/ocr/mod.rs +0 -2
  110. data/vendor/kreuzberg/src/ocr/processor.rs +8 -19
  111. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +0 -2
  112. data/vendor/kreuzberg/src/pdf/error.rs +1 -93
  113. data/vendor/kreuzberg/src/pdf/metadata.rs +100 -263
  114. data/vendor/kreuzberg/src/pdf/mod.rs +2 -33
  115. data/vendor/kreuzberg/src/pdf/rendering.rs +12 -12
  116. data/vendor/kreuzberg/src/pdf/table.rs +64 -61
  117. data/vendor/kreuzberg/src/pdf/text.rs +24 -416
  118. data/vendor/kreuzberg/src/plugins/extractor.rs +8 -40
  119. data/vendor/kreuzberg/src/plugins/mod.rs +0 -3
  120. data/vendor/kreuzberg/src/plugins/ocr.rs +14 -22
  121. data/vendor/kreuzberg/src/plugins/processor.rs +1 -10
  122. data/vendor/kreuzberg/src/plugins/registry.rs +0 -15
  123. data/vendor/kreuzberg/src/plugins/validator.rs +8 -20
  124. data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
  125. data/vendor/kreuzberg/src/text/mod.rs +0 -8
  126. data/vendor/kreuzberg/src/text/quality.rs +15 -28
  127. data/vendor/kreuzberg/src/text/string_utils.rs +10 -22
  128. data/vendor/kreuzberg/src/text/token_reduction/core.rs +50 -86
  129. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +16 -37
  130. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +1 -2
  131. data/vendor/kreuzberg/src/types.rs +67 -907
  132. data/vendor/kreuzberg/src/utils/mod.rs +0 -14
  133. data/vendor/kreuzberg/src/utils/quality.rs +3 -12
  134. data/vendor/kreuzberg/tests/api_tests.rs +0 -506
  135. data/vendor/kreuzberg/tests/archive_integration.rs +0 -2
  136. data/vendor/kreuzberg/tests/batch_orchestration.rs +12 -57
  137. data/vendor/kreuzberg/tests/batch_processing.rs +8 -32
  138. data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
  139. data/vendor/kreuzberg/tests/concurrency_stress.rs +8 -40
  140. data/vendor/kreuzberg/tests/config_features.rs +1 -33
  141. data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -16
  142. data/vendor/kreuzberg/tests/core_integration.rs +9 -35
  143. data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
  144. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
  145. data/vendor/kreuzberg/tests/email_integration.rs +1 -3
  146. data/vendor/kreuzberg/tests/error_handling.rs +34 -43
  147. data/vendor/kreuzberg/tests/format_integration.rs +1 -7
  148. data/vendor/kreuzberg/tests/helpers/mod.rs +0 -60
  149. data/vendor/kreuzberg/tests/image_integration.rs +0 -2
  150. data/vendor/kreuzberg/tests/mime_detection.rs +16 -17
  151. data/vendor/kreuzberg/tests/ocr_configuration.rs +0 -4
  152. data/vendor/kreuzberg/tests/ocr_errors.rs +0 -22
  153. data/vendor/kreuzberg/tests/ocr_quality.rs +0 -2
  154. data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
  155. data/vendor/kreuzberg/tests/pdf_integration.rs +0 -2
  156. data/vendor/kreuzberg/tests/pipeline_integration.rs +2 -36
  157. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +0 -5
  158. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -17
  159. data/vendor/kreuzberg/tests/plugin_system.rs +0 -6
  160. data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -2
  161. data/vendor/kreuzberg/tests/security_validation.rs +1 -13
  162. data/vendor/kreuzberg/tests/test_fastembed.rs +23 -45
  163. metadata +25 -171
  164. data/.rubocop.yml +0 -543
  165. data/ext/kreuzberg_rb/native/.cargo/config.toml +0 -23
  166. data/ext/kreuzberg_rb/native/Cargo.lock +0 -7619
  167. data/lib/kreuzberg/error_context.rb +0 -136
  168. data/lib/kreuzberg/types.rb +0 -170
  169. data/lib/libpdfium.so +0 -0
  170. data/spec/binding/async_operations_spec.rb +0 -473
  171. data/spec/binding/batch_operations_spec.rb +0 -595
  172. data/spec/binding/batch_spec.rb +0 -359
  173. data/spec/binding/config_result_spec.rb +0 -377
  174. data/spec/binding/embeddings_spec.rb +0 -816
  175. data/spec/binding/error_recovery_spec.rb +0 -488
  176. data/spec/binding/font_config_spec.rb +0 -220
  177. data/spec/binding/images_spec.rb +0 -738
  178. data/spec/binding/keywords_extraction_spec.rb +0 -600
  179. data/spec/binding/metadata_types_spec.rb +0 -1228
  180. data/spec/binding/pages_extraction_spec.rb +0 -471
  181. data/spec/binding/tables_spec.rb +0 -641
  182. data/spec/unit/config/chunking_config_spec.rb +0 -213
  183. data/spec/unit/config/embedding_config_spec.rb +0 -343
  184. data/spec/unit/config/extraction_config_spec.rb +0 -438
  185. data/spec/unit/config/font_config_spec.rb +0 -285
  186. data/spec/unit/config/hierarchy_config_spec.rb +0 -314
  187. data/spec/unit/config/image_extraction_config_spec.rb +0 -209
  188. data/spec/unit/config/image_preprocessing_config_spec.rb +0 -249
  189. data/spec/unit/config/keyword_config_spec.rb +0 -229
  190. data/spec/unit/config/language_detection_config_spec.rb +0 -258
  191. data/spec/unit/config/ocr_config_spec.rb +0 -171
  192. data/spec/unit/config/page_config_spec.rb +0 -221
  193. data/spec/unit/config/pdf_config_spec.rb +0 -267
  194. data/spec/unit/config/postprocessor_config_spec.rb +0 -290
  195. data/spec/unit/config/tesseract_config_spec.rb +0 -181
  196. data/spec/unit/config/token_reduction_config_spec.rb +0 -251
  197. data/test/metadata_types_test.rb +0 -959
  198. data/vendor/Cargo.toml +0 -61
  199. data/vendor/kreuzberg/examples/bench_fixes.rs +0 -71
  200. data/vendor/kreuzberg/examples/test_pdfium_fork.rs +0 -62
  201. data/vendor/kreuzberg/src/chunking/processor.rs +0 -219
  202. data/vendor/kreuzberg/src/core/batch_optimizations.rs +0 -385
  203. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  204. data/vendor/kreuzberg/src/core/formats.rs +0 -235
  205. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  206. data/vendor/kreuzberg/src/extraction/capacity.rs +0 -263
  207. data/vendor/kreuzberg/src/extraction/markdown.rs +0 -216
  208. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -284
  209. data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -470
  210. data/vendor/kreuzberg/src/extractors/docbook.rs +0 -504
  211. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  212. data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -492
  213. data/vendor/kreuzberg/src/extractors/jats.rs +0 -1054
  214. data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -368
  215. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  216. data/vendor/kreuzberg/src/extractors/markdown.rs +0 -701
  217. data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
  218. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  219. data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -529
  220. data/vendor/kreuzberg/src/extractors/rst.rs +0 -577
  221. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  222. data/vendor/kreuzberg/src/extractors/security.rs +0 -484
  223. data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
  224. data/vendor/kreuzberg/src/extractors/typst.rs +0 -651
  225. data/vendor/kreuzberg/src/language_detection/processor.rs +0 -218
  226. data/vendor/kreuzberg/src/ocr/language_registry.rs +0 -520
  227. data/vendor/kreuzberg/src/panic_context.rs +0 -154
  228. data/vendor/kreuzberg/src/pdf/bindings.rs +0 -306
  229. data/vendor/kreuzberg/src/pdf/bundled.rs +0 -408
  230. data/vendor/kreuzberg/src/pdf/fonts.rs +0 -358
  231. data/vendor/kreuzberg/src/pdf/hierarchy.rs +0 -903
  232. data/vendor/kreuzberg/src/text/quality_processor.rs +0 -231
  233. data/vendor/kreuzberg/src/text/utf8_validation.rs +0 -193
  234. data/vendor/kreuzberg/src/utils/pool.rs +0 -503
  235. data/vendor/kreuzberg/src/utils/pool_sizing.rs +0 -364
  236. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -761
  237. data/vendor/kreuzberg/tests/api_embed.rs +0 -360
  238. data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
  239. data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +0 -471
  240. data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +0 -289
  241. data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +0 -154
  242. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
  243. data/vendor/kreuzberg/tests/config_integration_test.rs +0 -753
  244. data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +0 -294
  245. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -500
  246. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
  247. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
  248. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
  249. data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
  250. data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
  251. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
  252. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
  253. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
  254. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
  255. data/vendor/kreuzberg/tests/ocr_language_registry.rs +0 -191
  256. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -674
  257. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
  258. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
  259. data/vendor/kreuzberg/tests/page_markers.rs +0 -297
  260. data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +0 -301
  261. data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +0 -589
  262. data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +0 -301
  263. data/vendor/kreuzberg/tests/pdf_text_merging.rs +0 -475
  264. data/vendor/kreuzberg/tests/pdfium_linking.rs +0 -340
  265. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -694
  266. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -775
  267. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1260
  268. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -648
  269. data/vendor/kreuzberg-ffi/Cargo.toml +0 -67
  270. data/vendor/kreuzberg-ffi/README.md +0 -851
  271. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +0 -227
  272. data/vendor/kreuzberg-ffi/build.rs +0 -168
  273. data/vendor/kreuzberg-ffi/cbindgen.toml +0 -37
  274. data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +0 -12
  275. data/vendor/kreuzberg-ffi/kreuzberg.h +0 -3012
  276. data/vendor/kreuzberg-ffi/src/batch_streaming.rs +0 -588
  277. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
  278. data/vendor/kreuzberg-ffi/src/error.rs +0 -901
  279. data/vendor/kreuzberg-ffi/src/extraction.rs +0 -555
  280. data/vendor/kreuzberg-ffi/src/helpers.rs +0 -879
  281. data/vendor/kreuzberg-ffi/src/lib.rs +0 -977
  282. data/vendor/kreuzberg-ffi/src/memory.rs +0 -493
  283. data/vendor/kreuzberg-ffi/src/mime.rs +0 -329
  284. data/vendor/kreuzberg-ffi/src/panic_shield.rs +0 -265
  285. data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +0 -442
  286. data/vendor/kreuzberg-ffi/src/plugins/mod.rs +0 -14
  287. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +0 -628
  288. data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +0 -438
  289. data/vendor/kreuzberg-ffi/src/plugins/validator.rs +0 -329
  290. data/vendor/kreuzberg-ffi/src/result.rs +0 -510
  291. data/vendor/kreuzberg-ffi/src/result_pool.rs +0 -639
  292. data/vendor/kreuzberg-ffi/src/result_view.rs +0 -773
  293. data/vendor/kreuzberg-ffi/src/string_intern.rs +0 -568
  294. data/vendor/kreuzberg-ffi/src/types.rs +0 -363
  295. data/vendor/kreuzberg-ffi/src/util.rs +0 -210
  296. data/vendor/kreuzberg-ffi/src/validation.rs +0 -848
  297. data/vendor/kreuzberg-ffi/tests.disabled/README.md +0 -48
  298. data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +0 -299
  299. data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +0 -346
  300. data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +0 -232
  301. data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +0 -470
  302. data/vendor/kreuzberg-tesseract/.commitlintrc.json +0 -13
  303. data/vendor/kreuzberg-tesseract/.crate-ignore +0 -2
  304. data/vendor/kreuzberg-tesseract/Cargo.lock +0 -2933
  305. data/vendor/kreuzberg-tesseract/Cargo.toml +0 -57
  306. data/vendor/kreuzberg-tesseract/LICENSE +0 -22
  307. data/vendor/kreuzberg-tesseract/README.md +0 -399
  308. data/vendor/kreuzberg-tesseract/build.rs +0 -1127
  309. data/vendor/kreuzberg-tesseract/patches/README.md +0 -71
  310. data/vendor/kreuzberg-tesseract/patches/tesseract.diff +0 -199
  311. data/vendor/kreuzberg-tesseract/src/api.rs +0 -1371
  312. data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +0 -77
  313. data/vendor/kreuzberg-tesseract/src/enums.rs +0 -297
  314. data/vendor/kreuzberg-tesseract/src/error.rs +0 -81
  315. data/vendor/kreuzberg-tesseract/src/lib.rs +0 -145
  316. data/vendor/kreuzberg-tesseract/src/monitor.rs +0 -57
  317. data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +0 -197
  318. data/vendor/kreuzberg-tesseract/src/page_iterator.rs +0 -253
  319. data/vendor/kreuzberg-tesseract/src/result_iterator.rs +0 -286
  320. data/vendor/kreuzberg-tesseract/src/result_renderer.rs +0 -183
  321. data/vendor/kreuzberg-tesseract/tests/integration_test.rs +0 -211
@@ -1,628 +0,0 @@
1
- #![cfg(all(feature = "tokio-runtime", feature = "office"))]
2
-
3
- //! ODT (OpenDocument Text) extractor using native Rust parsing.
4
- //!
5
- //! Supports: OpenDocument Text (.odt)
6
-
7
- use crate::Result;
8
- use crate::core::config::ExtractionConfig;
9
- use crate::extraction::{cells_to_markdown, office_metadata};
10
- use crate::plugins::{DocumentExtractor, Plugin};
11
- use crate::types::{ExtractionResult, Metadata, Table};
12
- use async_trait::async_trait;
13
- use roxmltree::Document;
14
- use std::io::Cursor;
15
-
16
- /// High-performance ODT extractor using native Rust XML parsing.
17
- ///
18
- /// This extractor provides:
19
- /// - Fast text extraction via roxmltree XML parsing
20
- /// - Comprehensive metadata extraction from meta.xml
21
- /// - Table extraction with row and cell support
22
- /// - Formatting preservation (bold, italic, strikeout)
23
- /// - Support for headings, paragraphs, and special elements
24
- pub struct OdtExtractor;
25
-
26
- impl OdtExtractor {
27
- /// Create a new ODT extractor.
28
- pub fn new() -> Self {
29
- Self
30
- }
31
- }
32
-
33
- impl Default for OdtExtractor {
34
- fn default() -> Self {
35
- Self::new()
36
- }
37
- }
38
-
39
- impl Plugin for OdtExtractor {
40
- fn name(&self) -> &str {
41
- "odt-extractor"
42
- }
43
-
44
- fn version(&self) -> String {
45
- env!("CARGO_PKG_VERSION").to_string()
46
- }
47
-
48
- fn initialize(&self) -> Result<()> {
49
- Ok(())
50
- }
51
-
52
- fn shutdown(&self) -> Result<()> {
53
- Ok(())
54
- }
55
-
56
- fn description(&self) -> &str {
57
- "Native Rust ODT (OpenDocument Text) extractor with metadata and table support"
58
- }
59
-
60
- fn author(&self) -> &str {
61
- "Kreuzberg Team"
62
- }
63
- }
64
-
65
- /// Extract text from MathML formula element
66
- ///
67
- /// # Arguments
68
- /// * `math_node` - The math XML node
69
- ///
70
- /// # Returns
71
- /// * `Option<String>` - The extracted formula text
72
- fn extract_mathml_text(math_node: roxmltree::Node) -> Option<String> {
73
- for node in math_node.descendants() {
74
- if node.tag_name().name() == "annotation"
75
- && let Some(encoding) = node.attribute("encoding")
76
- && encoding.contains("StarMath")
77
- && let Some(text) = node.text()
78
- {
79
- return Some(text.to_string());
80
- }
81
- }
82
-
83
- let mut formula_parts = Vec::new();
84
- for node in math_node.descendants() {
85
- match node.tag_name().name() {
86
- "mi" | "mo" | "mn" | "ms" | "mtext" => {
87
- if let Some(text) = node.text() {
88
- formula_parts.push(text.to_string());
89
- }
90
- }
91
- _ => {}
92
- }
93
- }
94
-
95
- if !formula_parts.is_empty() {
96
- Some(formula_parts.join(" "))
97
- } else {
98
- None
99
- }
100
- }
101
-
102
- /// Extract text from embedded formula objects
103
- ///
104
- /// # Arguments
105
- /// * `archive` - ZIP archive containing the ODT document
106
- ///
107
- /// # Returns
108
- /// * `String` - Extracted formula content from embedded objects
109
- fn extract_embedded_formulas(archive: &mut zip::ZipArchive<Cursor<Vec<u8>>>) -> crate::error::Result<String> {
110
- use std::io::Read;
111
- let mut formula_parts = Vec::new();
112
-
113
- let file_names: Vec<String> = archive.file_names().map(|s| s.to_string()).collect();
114
-
115
- for file_name in file_names {
116
- if file_name.contains("Object")
117
- && file_name.ends_with("content.xml")
118
- && let Ok(mut file) = archive.by_name(&file_name)
119
- {
120
- let mut xml_content = String::new();
121
- if file.read_to_string(&mut xml_content).is_ok()
122
- && let Ok(doc) = Document::parse(&xml_content)
123
- {
124
- let root = doc.root_element();
125
-
126
- if root.tag_name().name() == "math" {
127
- if let Some(formula_text) = extract_mathml_text(root) {
128
- formula_parts.push(formula_text);
129
- }
130
- } else {
131
- for node in root.descendants() {
132
- if node.tag_name().name() == "math"
133
- && let Some(formula_text) = extract_mathml_text(node)
134
- {
135
- formula_parts.push(formula_text);
136
- }
137
- }
138
- }
139
- }
140
- }
141
- }
142
-
143
- Ok(formula_parts.join("\n"))
144
- }
145
-
146
- /// Extract text content from ODT content.xml
147
- ///
148
- /// # Arguments
149
- /// * `archive` - ZIP archive containing the ODT document
150
- ///
151
- /// # Returns
152
- /// * `String` - Extracted text content
153
- fn extract_content_text(archive: &mut zip::ZipArchive<Cursor<Vec<u8>>>) -> crate::error::Result<String> {
154
- let mut xml_content = String::new();
155
-
156
- match archive.by_name("content.xml") {
157
- Ok(mut file) => {
158
- use std::io::Read;
159
- file.read_to_string(&mut xml_content)
160
- .map_err(|e| crate::error::KreuzbergError::parsing(format!("Failed to read content.xml: {}", e)))?;
161
- }
162
- Err(_) => {
163
- return Ok(String::new());
164
- }
165
- }
166
-
167
- let doc = Document::parse(&xml_content)
168
- .map_err(|e| crate::error::KreuzbergError::parsing(format!("Failed to parse content.xml: {}", e)))?;
169
-
170
- let root = doc.root_element();
171
-
172
- let mut text_parts: Vec<String> = Vec::new();
173
-
174
- for body_child in root.children() {
175
- if body_child.tag_name().name() == "body" {
176
- for text_elem in body_child.children() {
177
- if text_elem.tag_name().name() == "text" {
178
- process_document_elements(text_elem, &mut text_parts);
179
- }
180
- }
181
- }
182
- }
183
-
184
- Ok(text_parts.join("\n").trim().to_string())
185
- }
186
-
187
- /// Helper function to process document elements (paragraphs, headings, tables)
188
- /// Only processes direct children, avoiding nested content like table cells
189
- fn process_document_elements(parent: roxmltree::Node, text_parts: &mut Vec<String>) {
190
- for node in parent.children() {
191
- match node.tag_name().name() {
192
- "h" => {
193
- if let Some(text) = extract_node_text(node)
194
- && !text.trim().is_empty()
195
- {
196
- text_parts.push(format!("# {}", text.trim()));
197
- text_parts.push(String::new());
198
- }
199
- }
200
- "p" => {
201
- if let Some(text) = extract_node_text(node)
202
- && !text.trim().is_empty()
203
- {
204
- text_parts.push(text.trim().to_string());
205
- text_parts.push(String::new());
206
- }
207
- }
208
- "table" => {
209
- if let Some(table_text) = extract_table_text(node) {
210
- text_parts.push(table_text);
211
- text_parts.push(String::new());
212
- }
213
- }
214
- _ => {}
215
- }
216
- }
217
- }
218
-
219
- /// Extract text from a single XML node, handling spans and formatting
220
- ///
221
- /// # Arguments
222
- /// * `node` - The XML node to extract text from
223
- ///
224
- /// # Returns
225
- /// * `Option<String>` - The extracted text with formatting preserved
226
- fn extract_node_text(node: roxmltree::Node) -> Option<String> {
227
- let mut text_parts = Vec::new();
228
-
229
- for child in node.children() {
230
- match child.tag_name().name() {
231
- "span" => {
232
- if let Some(text) = child.text() {
233
- text_parts.push(text.to_string());
234
- }
235
- }
236
- "tab" => {
237
- text_parts.push("\t".to_string());
238
- }
239
- "line-break" => {
240
- text_parts.push("\n".to_string());
241
- }
242
- _ => {
243
- if let Some(text) = child.text() {
244
- text_parts.push(text.to_string());
245
- }
246
- }
247
- }
248
- }
249
-
250
- if text_parts.is_empty() {
251
- node.text().map(|s| s.to_string())
252
- } else {
253
- Some(text_parts.join(""))
254
- }
255
- }
256
-
257
- /// Extract table content as text with markdown formatting
258
- ///
259
- /// # Arguments
260
- /// * `table_node` - The table XML node
261
- ///
262
- /// # Returns
263
- /// * `Option<String>` - Markdown formatted table
264
- fn extract_table_text(table_node: roxmltree::Node) -> Option<String> {
265
- let mut rows = Vec::new();
266
- let mut max_cols = 0;
267
-
268
- for row_node in table_node.children() {
269
- if row_node.tag_name().name() == "table-row" {
270
- let mut row_cells = Vec::new();
271
-
272
- for cell_node in row_node.children() {
273
- if cell_node.tag_name().name() == "table-cell" {
274
- let cell_text = extract_node_text(cell_node).unwrap_or_default();
275
- row_cells.push(cell_text.trim().to_string());
276
- }
277
- }
278
-
279
- if !row_cells.is_empty() {
280
- max_cols = max_cols.max(row_cells.len());
281
- rows.push(row_cells);
282
- }
283
- }
284
- }
285
-
286
- if rows.is_empty() {
287
- return None;
288
- }
289
-
290
- for row in &mut rows {
291
- while row.len() < max_cols {
292
- row.push(String::new());
293
- }
294
- }
295
-
296
- let mut markdown = String::new();
297
-
298
- if !rows.is_empty() {
299
- markdown.push('|');
300
- for cell in &rows[0] {
301
- markdown.push(' ');
302
- markdown.push_str(cell);
303
- markdown.push_str(" |");
304
- }
305
- markdown.push('\n');
306
-
307
- markdown.push('|');
308
- for _ in 0..rows[0].len() {
309
- markdown.push_str(" --- |");
310
- }
311
- markdown.push('\n');
312
-
313
- for row in rows.iter().skip(1) {
314
- markdown.push('|');
315
- for cell in row {
316
- markdown.push(' ');
317
- markdown.push_str(cell);
318
- markdown.push_str(" |");
319
- }
320
- markdown.push('\n');
321
- }
322
- }
323
-
324
- Some(markdown)
325
- }
326
-
327
- /// Extract tables from ODT content.xml
328
- ///
329
- /// # Arguments
330
- /// * `archive` - ZIP archive containing the ODT document
331
- ///
332
- /// # Returns
333
- /// * `Result<Vec<Table>>` - Extracted tables
334
- fn extract_tables(archive: &mut zip::ZipArchive<Cursor<Vec<u8>>>) -> crate::error::Result<Vec<Table>> {
335
- let mut xml_content = String::new();
336
-
337
- match archive.by_name("content.xml") {
338
- Ok(mut file) => {
339
- use std::io::Read;
340
- file.read_to_string(&mut xml_content)
341
- .map_err(|e| crate::error::KreuzbergError::parsing(format!("Failed to read content.xml: {}", e)))?;
342
- }
343
- Err(_) => {
344
- return Ok(Vec::new());
345
- }
346
- }
347
-
348
- let doc = Document::parse(&xml_content)
349
- .map_err(|e| crate::error::KreuzbergError::parsing(format!("Failed to parse content.xml: {}", e)))?;
350
-
351
- let root = doc.root_element();
352
- let mut tables = Vec::new();
353
- let mut table_index = 0;
354
-
355
- for node in root.descendants() {
356
- if node.tag_name().name() == "table"
357
- && let Some(table) = parse_odt_table(node, table_index)
358
- {
359
- tables.push(table);
360
- table_index += 1;
361
- }
362
- }
363
-
364
- Ok(tables)
365
- }
366
-
367
- /// Parse a single ODT table element into a Table struct
368
- ///
369
- /// # Arguments
370
- /// * `table_node` - The table XML node
371
- /// * `table_index` - Index of the table in the document
372
- ///
373
- /// # Returns
374
- /// * `Option<Table>` - Parsed table
375
- fn parse_odt_table(table_node: roxmltree::Node, table_index: usize) -> Option<Table> {
376
- let mut cells: Vec<Vec<String>> = Vec::new();
377
-
378
- for row_node in table_node.children() {
379
- if row_node.tag_name().name() == "table-row" {
380
- let mut row_cells = Vec::new();
381
-
382
- for cell_node in row_node.children() {
383
- if cell_node.tag_name().name() == "table-cell" {
384
- let cell_text = extract_node_text(cell_node).unwrap_or_default();
385
- row_cells.push(cell_text.trim().to_string());
386
- }
387
- }
388
-
389
- if !row_cells.is_empty() {
390
- cells.push(row_cells);
391
- }
392
- }
393
- }
394
-
395
- if cells.is_empty() {
396
- return None;
397
- }
398
-
399
- let markdown = cells_to_markdown(&cells);
400
-
401
- Some(Table {
402
- cells,
403
- markdown,
404
- page_number: table_index + 1,
405
- })
406
- }
407
-
408
- #[async_trait]
409
- impl DocumentExtractor for OdtExtractor {
410
- #[cfg_attr(
411
- feature = "otel",
412
- tracing::instrument(
413
- skip(self, content, _config),
414
- fields(
415
- extractor.name = self.name(),
416
- content.size_bytes = content.len(),
417
- )
418
- )
419
- )]
420
- async fn extract_bytes(
421
- &self,
422
- content: &[u8],
423
- mime_type: &str,
424
- _config: &ExtractionConfig,
425
- ) -> Result<ExtractionResult> {
426
- let content_owned = content.to_vec();
427
-
428
- let (text, tables) = if crate::core::batch_mode::is_batch_mode() {
429
- let content_for_task = content_owned.clone();
430
- let span = tracing::Span::current();
431
- tokio::task::spawn_blocking(move || -> crate::error::Result<(String, Vec<Table>)> {
432
- let _guard = span.entered();
433
-
434
- let cursor = Cursor::new(content_for_task);
435
- let mut archive = zip::ZipArchive::new(cursor)
436
- .map_err(|e| crate::error::KreuzbergError::parsing(format!("Failed to open ZIP archive: {}", e)))?;
437
-
438
- let text = extract_content_text(&mut archive)?;
439
- let tables = extract_tables(&mut archive)?;
440
- let embedded_formulas = extract_embedded_formulas(&mut archive)?;
441
-
442
- let combined_text = if !embedded_formulas.is_empty() {
443
- if !text.is_empty() {
444
- format!("{}\n{}", text, embedded_formulas)
445
- } else {
446
- embedded_formulas
447
- }
448
- } else {
449
- text
450
- };
451
-
452
- Ok((combined_text, tables))
453
- })
454
- .await
455
- .map_err(|e| crate::error::KreuzbergError::parsing(format!("ODT extraction task failed: {}", e)))??
456
- } else {
457
- let cursor = Cursor::new(content_owned.clone());
458
- let mut archive = zip::ZipArchive::new(cursor)
459
- .map_err(|e| crate::error::KreuzbergError::parsing(format!("Failed to open ZIP archive: {}", e)))?;
460
-
461
- let text = extract_content_text(&mut archive)?;
462
- let tables = extract_tables(&mut archive)?;
463
- let embedded_formulas = extract_embedded_formulas(&mut archive)?;
464
-
465
- let combined_text = if !embedded_formulas.is_empty() {
466
- if !text.is_empty() {
467
- format!("{}\n{}", text, embedded_formulas)
468
- } else {
469
- embedded_formulas
470
- }
471
- } else {
472
- text
473
- };
474
-
475
- (combined_text, tables)
476
- };
477
-
478
- let mut metadata_map = std::collections::HashMap::new();
479
-
480
- let cursor = Cursor::new(content_owned.clone());
481
- let mut archive = zip::ZipArchive::new(cursor).map_err(|e| {
482
- crate::error::KreuzbergError::parsing(format!("Failed to open ZIP archive for metadata: {}", e))
483
- })?;
484
-
485
- if let Ok(odt_props) = office_metadata::extract_odt_properties(&mut archive) {
486
- if let Some(title) = odt_props.title {
487
- metadata_map.insert("title".to_string(), serde_json::Value::String(title));
488
- }
489
- if let Some(creator) = odt_props.creator {
490
- metadata_map.insert(
491
- "authors".to_string(),
492
- serde_json::Value::Array(vec![serde_json::Value::String(creator.clone())]),
493
- );
494
- metadata_map.insert("created_by".to_string(), serde_json::Value::String(creator));
495
- }
496
- if let Some(initial_creator) = odt_props.initial_creator {
497
- metadata_map.insert(
498
- "initial_creator".to_string(),
499
- serde_json::Value::String(initial_creator),
500
- );
501
- }
502
- if let Some(subject) = odt_props.subject {
503
- metadata_map.insert("subject".to_string(), serde_json::Value::String(subject));
504
- }
505
- if let Some(keywords) = odt_props.keywords {
506
- metadata_map.insert("keywords".to_string(), serde_json::Value::String(keywords));
507
- }
508
- if let Some(description) = odt_props.description {
509
- metadata_map.insert("description".to_string(), serde_json::Value::String(description));
510
- }
511
- if let Some(creation_date) = odt_props.creation_date {
512
- metadata_map.insert("created_at".to_string(), serde_json::Value::String(creation_date));
513
- }
514
- if let Some(date) = odt_props.date {
515
- metadata_map.insert("modified_at".to_string(), serde_json::Value::String(date));
516
- }
517
- if let Some(language) = odt_props.language {
518
- metadata_map.insert("language".to_string(), serde_json::Value::String(language));
519
- }
520
- if let Some(generator) = odt_props.generator {
521
- metadata_map.insert("generator".to_string(), serde_json::Value::String(generator));
522
- }
523
- if let Some(editing_duration) = odt_props.editing_duration {
524
- metadata_map.insert(
525
- "editing_duration".to_string(),
526
- serde_json::Value::String(editing_duration),
527
- );
528
- }
529
- if let Some(editing_cycles) = odt_props.editing_cycles {
530
- metadata_map.insert("editing_cycles".to_string(), serde_json::Value::String(editing_cycles));
531
- }
532
- if let Some(page_count) = odt_props.page_count {
533
- metadata_map.insert("page_count".to_string(), serde_json::Value::Number(page_count.into()));
534
- }
535
- if let Some(word_count) = odt_props.word_count {
536
- metadata_map.insert("word_count".to_string(), serde_json::Value::Number(word_count.into()));
537
- }
538
- if let Some(character_count) = odt_props.character_count {
539
- metadata_map.insert(
540
- "character_count".to_string(),
541
- serde_json::Value::Number(character_count.into()),
542
- );
543
- }
544
- if let Some(paragraph_count) = odt_props.paragraph_count {
545
- metadata_map.insert(
546
- "paragraph_count".to_string(),
547
- serde_json::Value::Number(paragraph_count.into()),
548
- );
549
- }
550
- if let Some(table_count) = odt_props.table_count {
551
- metadata_map.insert("table_count".to_string(), serde_json::Value::Number(table_count.into()));
552
- }
553
- if let Some(image_count) = odt_props.image_count {
554
- metadata_map.insert("image_count".to_string(), serde_json::Value::Number(image_count.into()));
555
- }
556
- }
557
-
558
- Ok(ExtractionResult {
559
- content: text,
560
- mime_type: mime_type.to_string(),
561
- metadata: Metadata {
562
- additional: metadata_map,
563
- ..Default::default()
564
- },
565
- pages: None,
566
- tables,
567
- detected_languages: None,
568
- chunks: None,
569
- images: None,
570
- })
571
- }
572
-
573
- fn supported_mime_types(&self) -> &[&str] {
574
- &["application/vnd.oasis.opendocument.text"]
575
- }
576
-
577
- fn priority(&self) -> i32 {
578
- 60
579
- }
580
- }
581
-
582
- #[cfg(test)]
583
- mod tests {
584
- use super::*;
585
-
586
- #[tokio::test]
587
- async fn test_odt_extractor_plugin_interface() {
588
- let extractor = OdtExtractor::new();
589
- assert_eq!(extractor.name(), "odt-extractor");
590
- assert_eq!(extractor.version(), env!("CARGO_PKG_VERSION"));
591
- assert_eq!(extractor.priority(), 60);
592
- assert_eq!(extractor.supported_mime_types().len(), 1);
593
- }
594
-
595
- #[tokio::test]
596
- async fn test_odt_extractor_supports_odt() {
597
- let extractor = OdtExtractor::new();
598
- assert!(
599
- extractor
600
- .supported_mime_types()
601
- .contains(&"application/vnd.oasis.opendocument.text")
602
- );
603
- }
604
-
605
- #[tokio::test]
606
- async fn test_odt_extractor_default() {
607
- let extractor = OdtExtractor;
608
- assert_eq!(extractor.name(), "odt-extractor");
609
- }
610
-
611
- #[tokio::test]
612
- async fn test_odt_extractor_initialize_shutdown() {
613
- let extractor = OdtExtractor::new();
614
- assert!(extractor.initialize().is_ok());
615
- assert!(extractor.shutdown().is_ok());
616
- }
617
-
618
- #[test]
619
- fn test_extract_node_text_simple() {
620
- let xml = r#"<p xmlns:text="urn:oasis:names:tc:opendocument:xmlns:text:1.0">Hello world</p>"#;
621
- let doc = roxmltree::Document::parse(xml).unwrap();
622
- let node = doc.root_element();
623
-
624
- let result = extract_node_text(node);
625
- assert!(result.is_some());
626
- assert!(!result.unwrap().is_empty());
627
- }
628
- }