kreuzberg 4.0.0.pre.rc.29 → 4.0.0.rc1

This diff shows the differences between the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
Files changed (321)
  1. checksums.yaml +4 -4
  2. data/.gitignore +0 -6
  3. data/.rubocop.yaml +534 -1
  4. data/Gemfile +2 -1
  5. data/Gemfile.lock +28 -116
  6. data/README.md +269 -629
  7. data/Rakefile +0 -9
  8. data/Steepfile +4 -8
  9. data/examples/async_patterns.rb +58 -1
  10. data/ext/kreuzberg_rb/extconf.rb +5 -35
  11. data/ext/kreuzberg_rb/native/Cargo.toml +16 -55
  12. data/ext/kreuzberg_rb/native/build.rs +14 -12
  13. data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
  14. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
  15. data/ext/kreuzberg_rb/native/include/strings.h +2 -2
  16. data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
  17. data/ext/kreuzberg_rb/native/src/lib.rs +34 -897
  18. data/extconf.rb +6 -38
  19. data/kreuzberg.gemspec +20 -114
  20. data/lib/kreuzberg/api_proxy.rb +18 -2
  21. data/lib/kreuzberg/cache_api.rb +0 -22
  22. data/lib/kreuzberg/cli.rb +10 -2
  23. data/lib/kreuzberg/cli_proxy.rb +10 -0
  24. data/lib/kreuzberg/config.rb +22 -274
  25. data/lib/kreuzberg/errors.rb +7 -73
  26. data/lib/kreuzberg/extraction_api.rb +8 -237
  27. data/lib/kreuzberg/mcp_proxy.rb +11 -2
  28. data/lib/kreuzberg/ocr_backend_protocol.rb +73 -0
  29. data/lib/kreuzberg/post_processor_protocol.rb +71 -0
  30. data/lib/kreuzberg/result.rb +33 -151
  31. data/lib/kreuzberg/setup_lib_path.rb +2 -22
  32. data/lib/kreuzberg/validator_protocol.rb +73 -0
  33. data/lib/kreuzberg/version.rb +1 -1
  34. data/lib/kreuzberg.rb +13 -27
  35. data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
  36. data/sig/kreuzberg.rbs +12 -105
  37. data/spec/binding/cache_spec.rb +22 -22
  38. data/spec/binding/cli_proxy_spec.rb +4 -2
  39. data/spec/binding/cli_spec.rb +11 -12
  40. data/spec/binding/config_spec.rb +0 -74
  41. data/spec/binding/config_validation_spec.rb +6 -100
  42. data/spec/binding/error_handling_spec.rb +97 -283
  43. data/spec/binding/plugins/ocr_backend_spec.rb +8 -8
  44. data/spec/binding/plugins/postprocessor_spec.rb +11 -11
  45. data/spec/binding/plugins/validator_spec.rb +13 -12
  46. data/spec/examples.txt +104 -0
  47. data/spec/fixtures/config.toml +1 -0
  48. data/spec/fixtures/config.yaml +1 -0
  49. data/spec/fixtures/invalid_config.toml +1 -0
  50. data/spec/smoke/package_spec.rb +3 -2
  51. data/spec/spec_helper.rb +3 -1
  52. data/vendor/kreuzberg/Cargo.toml +67 -192
  53. data/vendor/kreuzberg/README.md +9 -97
  54. data/vendor/kreuzberg/build.rs +194 -516
  55. data/vendor/kreuzberg/src/api/handlers.rs +9 -130
  56. data/vendor/kreuzberg/src/api/mod.rs +3 -18
  57. data/vendor/kreuzberg/src/api/server.rs +71 -236
  58. data/vendor/kreuzberg/src/api/types.rs +7 -43
  59. data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
  60. data/vendor/kreuzberg/src/cache/mod.rs +3 -27
  61. data/vendor/kreuzberg/src/chunking/mod.rs +79 -1705
  62. data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
  63. data/vendor/kreuzberg/src/core/config.rs +23 -905
  64. data/vendor/kreuzberg/src/core/extractor.rs +106 -403
  65. data/vendor/kreuzberg/src/core/io.rs +2 -4
  66. data/vendor/kreuzberg/src/core/mime.rs +12 -2
  67. data/vendor/kreuzberg/src/core/mod.rs +3 -22
  68. data/vendor/kreuzberg/src/core/pipeline.rs +78 -395
  69. data/vendor/kreuzberg/src/embeddings.rs +21 -169
  70. data/vendor/kreuzberg/src/error.rs +2 -2
  71. data/vendor/kreuzberg/src/extraction/archive.rs +31 -36
  72. data/vendor/kreuzberg/src/extraction/docx.rs +1 -365
  73. data/vendor/kreuzberg/src/extraction/email.rs +11 -12
  74. data/vendor/kreuzberg/src/extraction/excel.rs +129 -138
  75. data/vendor/kreuzberg/src/extraction/html.rs +170 -1447
  76. data/vendor/kreuzberg/src/extraction/image.rs +14 -138
  77. data/vendor/kreuzberg/src/extraction/libreoffice.rs +3 -13
  78. data/vendor/kreuzberg/src/extraction/mod.rs +5 -21
  79. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
  80. data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
  81. data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
  82. data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
  83. data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
  84. data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
  85. data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
  86. data/vendor/kreuzberg/src/extraction/pptx.rs +94 -196
  87. data/vendor/kreuzberg/src/extraction/structured.rs +4 -5
  88. data/vendor/kreuzberg/src/extraction/table.rs +1 -2
  89. data/vendor/kreuzberg/src/extraction/text.rs +10 -18
  90. data/vendor/kreuzberg/src/extractors/archive.rs +0 -22
  91. data/vendor/kreuzberg/src/extractors/docx.rs +148 -69
  92. data/vendor/kreuzberg/src/extractors/email.rs +9 -37
  93. data/vendor/kreuzberg/src/extractors/excel.rs +40 -81
  94. data/vendor/kreuzberg/src/extractors/html.rs +173 -182
  95. data/vendor/kreuzberg/src/extractors/image.rs +8 -32
  96. data/vendor/kreuzberg/src/extractors/mod.rs +10 -171
  97. data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
  98. data/vendor/kreuzberg/src/extractors/pdf.rs +64 -329
  99. data/vendor/kreuzberg/src/extractors/pptx.rs +34 -79
  100. data/vendor/kreuzberg/src/extractors/structured.rs +0 -16
  101. data/vendor/kreuzberg/src/extractors/text.rs +7 -30
  102. data/vendor/kreuzberg/src/extractors/xml.rs +8 -27
  103. data/vendor/kreuzberg/src/keywords/processor.rs +1 -9
  104. data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
  105. data/vendor/kreuzberg/src/language_detection/mod.rs +51 -94
  106. data/vendor/kreuzberg/src/lib.rs +5 -17
  107. data/vendor/kreuzberg/src/mcp/mod.rs +1 -4
  108. data/vendor/kreuzberg/src/mcp/server.rs +21 -145
  109. data/vendor/kreuzberg/src/ocr/mod.rs +0 -2
  110. data/vendor/kreuzberg/src/ocr/processor.rs +8 -19
  111. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +0 -2
  112. data/vendor/kreuzberg/src/pdf/error.rs +1 -93
  113. data/vendor/kreuzberg/src/pdf/metadata.rs +100 -263
  114. data/vendor/kreuzberg/src/pdf/mod.rs +2 -33
  115. data/vendor/kreuzberg/src/pdf/rendering.rs +12 -12
  116. data/vendor/kreuzberg/src/pdf/table.rs +64 -61
  117. data/vendor/kreuzberg/src/pdf/text.rs +24 -416
  118. data/vendor/kreuzberg/src/plugins/extractor.rs +8 -40
  119. data/vendor/kreuzberg/src/plugins/mod.rs +0 -3
  120. data/vendor/kreuzberg/src/plugins/ocr.rs +14 -22
  121. data/vendor/kreuzberg/src/plugins/processor.rs +1 -10
  122. data/vendor/kreuzberg/src/plugins/registry.rs +0 -15
  123. data/vendor/kreuzberg/src/plugins/validator.rs +8 -20
  124. data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
  125. data/vendor/kreuzberg/src/text/mod.rs +0 -8
  126. data/vendor/kreuzberg/src/text/quality.rs +15 -28
  127. data/vendor/kreuzberg/src/text/string_utils.rs +10 -22
  128. data/vendor/kreuzberg/src/text/token_reduction/core.rs +50 -86
  129. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +16 -37
  130. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +1 -2
  131. data/vendor/kreuzberg/src/types.rs +67 -907
  132. data/vendor/kreuzberg/src/utils/mod.rs +0 -14
  133. data/vendor/kreuzberg/src/utils/quality.rs +3 -12
  134. data/vendor/kreuzberg/tests/api_tests.rs +0 -506
  135. data/vendor/kreuzberg/tests/archive_integration.rs +0 -2
  136. data/vendor/kreuzberg/tests/batch_orchestration.rs +12 -57
  137. data/vendor/kreuzberg/tests/batch_processing.rs +8 -32
  138. data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
  139. data/vendor/kreuzberg/tests/concurrency_stress.rs +8 -40
  140. data/vendor/kreuzberg/tests/config_features.rs +1 -33
  141. data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -16
  142. data/vendor/kreuzberg/tests/core_integration.rs +9 -35
  143. data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
  144. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
  145. data/vendor/kreuzberg/tests/email_integration.rs +1 -3
  146. data/vendor/kreuzberg/tests/error_handling.rs +34 -43
  147. data/vendor/kreuzberg/tests/format_integration.rs +1 -7
  148. data/vendor/kreuzberg/tests/helpers/mod.rs +0 -60
  149. data/vendor/kreuzberg/tests/image_integration.rs +0 -2
  150. data/vendor/kreuzberg/tests/mime_detection.rs +16 -17
  151. data/vendor/kreuzberg/tests/ocr_configuration.rs +0 -4
  152. data/vendor/kreuzberg/tests/ocr_errors.rs +0 -22
  153. data/vendor/kreuzberg/tests/ocr_quality.rs +0 -2
  154. data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
  155. data/vendor/kreuzberg/tests/pdf_integration.rs +0 -2
  156. data/vendor/kreuzberg/tests/pipeline_integration.rs +2 -36
  157. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +0 -5
  158. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -17
  159. data/vendor/kreuzberg/tests/plugin_system.rs +0 -6
  160. data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -2
  161. data/vendor/kreuzberg/tests/security_validation.rs +1 -13
  162. data/vendor/kreuzberg/tests/test_fastembed.rs +23 -45
  163. metadata +25 -171
  164. data/.rubocop.yml +0 -543
  165. data/ext/kreuzberg_rb/native/.cargo/config.toml +0 -23
  166. data/ext/kreuzberg_rb/native/Cargo.lock +0 -7619
  167. data/lib/kreuzberg/error_context.rb +0 -136
  168. data/lib/kreuzberg/types.rb +0 -170
  169. data/lib/libpdfium.so +0 -0
  170. data/spec/binding/async_operations_spec.rb +0 -473
  171. data/spec/binding/batch_operations_spec.rb +0 -595
  172. data/spec/binding/batch_spec.rb +0 -359
  173. data/spec/binding/config_result_spec.rb +0 -377
  174. data/spec/binding/embeddings_spec.rb +0 -816
  175. data/spec/binding/error_recovery_spec.rb +0 -488
  176. data/spec/binding/font_config_spec.rb +0 -220
  177. data/spec/binding/images_spec.rb +0 -738
  178. data/spec/binding/keywords_extraction_spec.rb +0 -600
  179. data/spec/binding/metadata_types_spec.rb +0 -1228
  180. data/spec/binding/pages_extraction_spec.rb +0 -471
  181. data/spec/binding/tables_spec.rb +0 -641
  182. data/spec/unit/config/chunking_config_spec.rb +0 -213
  183. data/spec/unit/config/embedding_config_spec.rb +0 -343
  184. data/spec/unit/config/extraction_config_spec.rb +0 -438
  185. data/spec/unit/config/font_config_spec.rb +0 -285
  186. data/spec/unit/config/hierarchy_config_spec.rb +0 -314
  187. data/spec/unit/config/image_extraction_config_spec.rb +0 -209
  188. data/spec/unit/config/image_preprocessing_config_spec.rb +0 -249
  189. data/spec/unit/config/keyword_config_spec.rb +0 -229
  190. data/spec/unit/config/language_detection_config_spec.rb +0 -258
  191. data/spec/unit/config/ocr_config_spec.rb +0 -171
  192. data/spec/unit/config/page_config_spec.rb +0 -221
  193. data/spec/unit/config/pdf_config_spec.rb +0 -267
  194. data/spec/unit/config/postprocessor_config_spec.rb +0 -290
  195. data/spec/unit/config/tesseract_config_spec.rb +0 -181
  196. data/spec/unit/config/token_reduction_config_spec.rb +0 -251
  197. data/test/metadata_types_test.rb +0 -959
  198. data/vendor/Cargo.toml +0 -61
  199. data/vendor/kreuzberg/examples/bench_fixes.rs +0 -71
  200. data/vendor/kreuzberg/examples/test_pdfium_fork.rs +0 -62
  201. data/vendor/kreuzberg/src/chunking/processor.rs +0 -219
  202. data/vendor/kreuzberg/src/core/batch_optimizations.rs +0 -385
  203. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  204. data/vendor/kreuzberg/src/core/formats.rs +0 -235
  205. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  206. data/vendor/kreuzberg/src/extraction/capacity.rs +0 -263
  207. data/vendor/kreuzberg/src/extraction/markdown.rs +0 -216
  208. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -284
  209. data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -470
  210. data/vendor/kreuzberg/src/extractors/docbook.rs +0 -504
  211. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  212. data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -492
  213. data/vendor/kreuzberg/src/extractors/jats.rs +0 -1054
  214. data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -368
  215. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  216. data/vendor/kreuzberg/src/extractors/markdown.rs +0 -701
  217. data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
  218. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  219. data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -529
  220. data/vendor/kreuzberg/src/extractors/rst.rs +0 -577
  221. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  222. data/vendor/kreuzberg/src/extractors/security.rs +0 -484
  223. data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
  224. data/vendor/kreuzberg/src/extractors/typst.rs +0 -651
  225. data/vendor/kreuzberg/src/language_detection/processor.rs +0 -218
  226. data/vendor/kreuzberg/src/ocr/language_registry.rs +0 -520
  227. data/vendor/kreuzberg/src/panic_context.rs +0 -154
  228. data/vendor/kreuzberg/src/pdf/bindings.rs +0 -306
  229. data/vendor/kreuzberg/src/pdf/bundled.rs +0 -408
  230. data/vendor/kreuzberg/src/pdf/fonts.rs +0 -358
  231. data/vendor/kreuzberg/src/pdf/hierarchy.rs +0 -903
  232. data/vendor/kreuzberg/src/text/quality_processor.rs +0 -231
  233. data/vendor/kreuzberg/src/text/utf8_validation.rs +0 -193
  234. data/vendor/kreuzberg/src/utils/pool.rs +0 -503
  235. data/vendor/kreuzberg/src/utils/pool_sizing.rs +0 -364
  236. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -761
  237. data/vendor/kreuzberg/tests/api_embed.rs +0 -360
  238. data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
  239. data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +0 -471
  240. data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +0 -289
  241. data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +0 -154
  242. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
  243. data/vendor/kreuzberg/tests/config_integration_test.rs +0 -753
  244. data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +0 -294
  245. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -500
  246. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
  247. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
  248. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
  249. data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
  250. data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
  251. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
  252. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
  253. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
  254. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
  255. data/vendor/kreuzberg/tests/ocr_language_registry.rs +0 -191
  256. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -674
  257. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
  258. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
  259. data/vendor/kreuzberg/tests/page_markers.rs +0 -297
  260. data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +0 -301
  261. data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +0 -589
  262. data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +0 -301
  263. data/vendor/kreuzberg/tests/pdf_text_merging.rs +0 -475
  264. data/vendor/kreuzberg/tests/pdfium_linking.rs +0 -340
  265. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -694
  266. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -775
  267. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1260
  268. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -648
  269. data/vendor/kreuzberg-ffi/Cargo.toml +0 -67
  270. data/vendor/kreuzberg-ffi/README.md +0 -851
  271. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +0 -227
  272. data/vendor/kreuzberg-ffi/build.rs +0 -168
  273. data/vendor/kreuzberg-ffi/cbindgen.toml +0 -37
  274. data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +0 -12
  275. data/vendor/kreuzberg-ffi/kreuzberg.h +0 -3012
  276. data/vendor/kreuzberg-ffi/src/batch_streaming.rs +0 -588
  277. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
  278. data/vendor/kreuzberg-ffi/src/error.rs +0 -901
  279. data/vendor/kreuzberg-ffi/src/extraction.rs +0 -555
  280. data/vendor/kreuzberg-ffi/src/helpers.rs +0 -879
  281. data/vendor/kreuzberg-ffi/src/lib.rs +0 -977
  282. data/vendor/kreuzberg-ffi/src/memory.rs +0 -493
  283. data/vendor/kreuzberg-ffi/src/mime.rs +0 -329
  284. data/vendor/kreuzberg-ffi/src/panic_shield.rs +0 -265
  285. data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +0 -442
  286. data/vendor/kreuzberg-ffi/src/plugins/mod.rs +0 -14
  287. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +0 -628
  288. data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +0 -438
  289. data/vendor/kreuzberg-ffi/src/plugins/validator.rs +0 -329
  290. data/vendor/kreuzberg-ffi/src/result.rs +0 -510
  291. data/vendor/kreuzberg-ffi/src/result_pool.rs +0 -639
  292. data/vendor/kreuzberg-ffi/src/result_view.rs +0 -773
  293. data/vendor/kreuzberg-ffi/src/string_intern.rs +0 -568
  294. data/vendor/kreuzberg-ffi/src/types.rs +0 -363
  295. data/vendor/kreuzberg-ffi/src/util.rs +0 -210
  296. data/vendor/kreuzberg-ffi/src/validation.rs +0 -848
  297. data/vendor/kreuzberg-ffi/tests.disabled/README.md +0 -48
  298. data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +0 -299
  299. data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +0 -346
  300. data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +0 -232
  301. data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +0 -470
  302. data/vendor/kreuzberg-tesseract/.commitlintrc.json +0 -13
  303. data/vendor/kreuzberg-tesseract/.crate-ignore +0 -2
  304. data/vendor/kreuzberg-tesseract/Cargo.lock +0 -2933
  305. data/vendor/kreuzberg-tesseract/Cargo.toml +0 -57
  306. data/vendor/kreuzberg-tesseract/LICENSE +0 -22
  307. data/vendor/kreuzberg-tesseract/README.md +0 -399
  308. data/vendor/kreuzberg-tesseract/build.rs +0 -1127
  309. data/vendor/kreuzberg-tesseract/patches/README.md +0 -71
  310. data/vendor/kreuzberg-tesseract/patches/tesseract.diff +0 -199
  311. data/vendor/kreuzberg-tesseract/src/api.rs +0 -1371
  312. data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +0 -77
  313. data/vendor/kreuzberg-tesseract/src/enums.rs +0 -297
  314. data/vendor/kreuzberg-tesseract/src/error.rs +0 -81
  315. data/vendor/kreuzberg-tesseract/src/lib.rs +0 -145
  316. data/vendor/kreuzberg-tesseract/src/monitor.rs +0 -57
  317. data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +0 -197
  318. data/vendor/kreuzberg-tesseract/src/page_iterator.rs +0 -253
  319. data/vendor/kreuzberg-tesseract/src/result_iterator.rs +0 -286
  320. data/vendor/kreuzberg-tesseract/src/result_renderer.rs +0 -183
  321. data/vendor/kreuzberg-tesseract/tests/integration_test.rs +0 -211
@@ -19,74 +19,13 @@ use crate::plugins::DocumentExtractor;
19
19
  use crate::types::ExtractionResult;
20
20
  #[cfg(feature = "office")]
21
21
  use crate::types::LibreOfficeConversionResult;
22
- use crate::utils::{PoolSizeHint, estimate_pool_size, intern_mime_type};
23
22
  use crate::{KreuzbergError, Result};
24
- #[cfg(feature = "tokio-runtime")]
25
23
  use once_cell::sync::Lazy;
26
24
  #[cfg(feature = "office")]
27
25
  use serde_json::json;
28
26
  use std::path::Path;
29
27
  use std::sync::Arc;
30
28
 
31
- /// Record error information in the current OpenTelemetry span.
32
- ///
33
- /// This function records error details in the current span when the `otel` feature is enabled.
34
- /// It marks the span with `otel.status_code=ERROR` and adds error type and message fields.
35
- ///
36
- /// # Arguments
37
- ///
38
- /// * `error` - The error to record in the span
39
- ///
40
- /// # Example
41
- ///
42
- /// ```rust,ignore
43
- /// let result = extract_file("doc.pdf", None, &config).await;
44
- /// #[cfg(feature = "otel")]
45
- /// if let Err(ref e) = result {
46
- /// record_error(e);
47
- /// }
48
- /// result
49
- /// ```
50
- #[cfg(feature = "otel")]
51
- fn record_error(error: &KreuzbergError) {
52
- let span = tracing::Span::current();
53
- span.record("otel.status_code", "ERROR");
54
- span.record("error.type", format!("{:?}", error));
55
- span.record("error.message", error.to_string());
56
- }
57
-
58
- /// Sanitize a file path to return only the filename.
59
- ///
60
- /// This function extracts the filename from a path to avoid recording
61
- /// potentially sensitive full file paths in telemetry data.
62
- ///
63
- /// # Arguments
64
- ///
65
- /// * `path` - The path to sanitize
66
- ///
67
- /// # Returns
68
- ///
69
- /// The filename as a string, or "unknown" if extraction fails
70
- ///
71
- /// # Security
72
- ///
73
- /// This prevents PII (personally identifiable information) from appearing in
74
- /// traces by only recording filenames instead of full paths.
75
- ///
76
- /// # Example
77
- ///
78
- /// ```rust,ignore
79
- /// let path = Path::new("/home/user/documents/secret.pdf");
80
- /// assert_eq!(sanitize_path(path), "secret.pdf");
81
- /// ```
82
- #[cfg(feature = "otel")]
83
- fn sanitize_path(path: &Path) -> String {
84
- path.file_name()
85
- .and_then(|n| n.to_str())
86
- .unwrap_or("unknown")
87
- .to_string()
88
- }
89
-
90
29
  /// Global Tokio runtime for synchronous operations.
91
30
  ///
92
31
  /// This runtime is lazily initialized on first use and shared across all sync wrappers.
@@ -99,12 +38,6 @@ fn sanitize_path(path: &Path) -> String {
99
38
  /// 2. If runtime creation fails, the process is already in a critical state
100
39
  /// 3. This is a one-time initialization - if it fails, nothing will work
101
40
  /// 4. Better to fail fast than return errors from every sync operation
102
- ///
103
- /// # Availability
104
- ///
105
- /// This static is only available when the `tokio-runtime` feature is enabled.
106
- /// For WASM targets, use the truly synchronous extraction functions instead.
107
- #[cfg(feature = "tokio-runtime")]
108
41
  static GLOBAL_RUNTIME: Lazy<tokio::runtime::Runtime> = Lazy::new(|| {
109
42
  tokio::runtime::Builder::new_multi_thread()
110
43
  .enable_all()
@@ -129,34 +62,6 @@ fn get_extractor(mime_type: &str) -> Result<Arc<dyn DocumentExtractor>> {
129
62
  registry_read.get(mime_type)
130
63
  }
131
64
 
132
- /// Get optimal pool sizing hint for a document.
133
- ///
134
- /// This function calculates recommended pool sizes based on the document's
135
- /// file size and MIME type. The hint can be used to create appropriately
136
- /// sized thread pools for extraction, reducing memory waste from over-allocation.
137
- ///
138
- /// # Arguments
139
- ///
140
- /// * `file_size` - The size of the file in bytes
141
- /// * `mime_type` - The MIME type of the document
142
- ///
143
- /// # Returns
144
- ///
145
- /// A `PoolSizeHint` with recommended pool configurations
146
- ///
147
- /// # Example
148
- ///
149
- /// ```rust,ignore
150
- /// use kreuzberg::core::extractor::get_pool_sizing_hint;
151
- ///
152
- /// let hint = get_pool_sizing_hint(5_000_000, "application/pdf");
153
- /// println!("Recommended string buffers: {}", hint.string_buffer_count);
154
- /// ```
155
- #[inline]
156
- pub fn get_pool_sizing_hint(file_size: u64, mime_type: &str) -> PoolSizeHint {
157
- estimate_pool_size(file_size, mime_type)
158
- }
159
-
160
65
  /// Extract content from a file.
161
66
  ///
162
67
  /// This is the main entry point for file-based extraction. It performs the following steps:
@@ -196,12 +101,6 @@ pub fn get_pool_sizing_hint(file_size: u64, mime_type: &str) -> PoolSizeHint {
196
101
  /// # Ok(())
197
102
  /// # }
198
103
  /// ```
199
- #[cfg_attr(feature = "otel", tracing::instrument(
200
- skip(config, path),
201
- fields(
202
- extraction.filename = tracing::field::Empty,
203
- )
204
- ))]
205
104
  pub async fn extract_file(
206
105
  path: impl AsRef<Path>,
207
106
  mime_type: Option<&str>,
@@ -211,119 +110,86 @@ pub async fn extract_file(
211
110
 
212
111
  let path = path.as_ref();
213
112
 
214
- #[cfg(feature = "otel")]
215
- {
216
- let span = tracing::Span::current();
217
- span.record("extraction.filename", sanitize_path(path));
218
- }
219
-
220
- let result = async {
221
- io::validate_file_exists(path)?;
113
+ io::validate_file_exists(path)?;
222
114
 
223
- let detected_mime = mime::detect_or_validate(Some(path), mime_type)?;
115
+ let detected_mime = mime::detect_or_validate(Some(path), mime_type)?;
224
116
 
225
- match detected_mime.as_str() {
226
- #[cfg(feature = "office")]
227
- LEGACY_WORD_MIME_TYPE => {
228
- let original_bytes = tokio::fs::read(path).await?;
229
- let conversion = convert_doc_to_docx(&original_bytes).await?;
230
- let mut result =
231
- extract_bytes_with_extractor(&conversion.converted_bytes, &conversion.target_mime, config).await?;
232
- apply_libreoffice_metadata(&mut result, LEGACY_WORD_MIME_TYPE, &conversion);
233
- return Ok(result);
234
- }
235
- #[cfg(not(feature = "office"))]
236
- LEGACY_WORD_MIME_TYPE => {
237
- return Err(KreuzbergError::UnsupportedFormat(
238
- "Legacy Word conversion requires the `office` feature or LibreOffice support".to_string(),
239
- ));
240
- }
241
- #[cfg(feature = "office")]
242
- LEGACY_POWERPOINT_MIME_TYPE => {
243
- let original_bytes = tokio::fs::read(path).await?;
244
- let conversion = convert_ppt_to_pptx(&original_bytes).await?;
245
- let mut result =
246
- extract_bytes_with_extractor(&conversion.converted_bytes, &conversion.target_mime, config).await?;
247
- apply_libreoffice_metadata(&mut result, LEGACY_POWERPOINT_MIME_TYPE, &conversion);
248
- return Ok(result);
249
- }
250
- #[cfg(not(feature = "office"))]
251
- LEGACY_POWERPOINT_MIME_TYPE => {
252
- return Err(KreuzbergError::UnsupportedFormat(
253
- "Legacy PowerPoint conversion requires the `office` feature or LibreOffice support".to_string(),
254
- ));
255
- }
256
- _ => {}
117
+ match detected_mime.as_str() {
118
+ #[cfg(feature = "office")]
119
+ LEGACY_WORD_MIME_TYPE => {
120
+ let original_bytes = tokio::fs::read(path).await?;
121
+ let conversion = convert_doc_to_docx(&original_bytes).await?;
122
+ let mut result =
123
+ extract_bytes_with_extractor(&conversion.converted_bytes, &conversion.target_mime, config).await?;
124
+ apply_libreoffice_metadata(&mut result, LEGACY_WORD_MIME_TYPE, &conversion);
125
+ return Ok(result);
257
126
  }
258
-
259
- extract_file_with_extractor(path, &detected_mime, config).await
260
- }
261
- .await;
262
-
263
- #[cfg(feature = "otel")]
264
- if let Err(ref e) = result {
265
- record_error(e);
127
+ #[cfg(not(feature = "office"))]
128
+ LEGACY_WORD_MIME_TYPE => {
129
+ return Err(KreuzbergError::UnsupportedFormat(
130
+ "Legacy Word conversion requires the `office` feature or LibreOffice support".to_string(),
131
+ ));
132
+ }
133
+ #[cfg(feature = "office")]
134
+ LEGACY_POWERPOINT_MIME_TYPE => {
135
+ let original_bytes = tokio::fs::read(path).await?;
136
+ let conversion = convert_ppt_to_pptx(&original_bytes).await?;
137
+ let mut result =
138
+ extract_bytes_with_extractor(&conversion.converted_bytes, &conversion.target_mime, config).await?;
139
+ apply_libreoffice_metadata(&mut result, LEGACY_POWERPOINT_MIME_TYPE, &conversion);
140
+ return Ok(result);
141
+ }
142
+ #[cfg(not(feature = "office"))]
143
+ LEGACY_POWERPOINT_MIME_TYPE => {
144
+ return Err(KreuzbergError::UnsupportedFormat(
145
+ "Legacy PowerPoint conversion requires the `office` feature or LibreOffice support".to_string(),
146
+ ));
147
+ }
148
+ _ => {}
266
149
  }
267
150
 
268
- result
151
+ extract_file_with_extractor(path, &detected_mime, config).await
269
152
  }
270
153
 
271
154
  /// Extract content from a byte array.
272
- #[cfg_attr(feature = "otel", tracing::instrument(
273
- skip(config, content),
274
- fields(
275
- extraction.mime_type = mime_type,
276
- extraction.size_bytes = content.len(),
277
- )
278
- ))]
279
155
  pub async fn extract_bytes(content: &[u8], mime_type: &str, config: &ExtractionConfig) -> Result<ExtractionResult> {
280
156
  use crate::core::mime;
281
157
 
282
- let result = async {
283
- let validated_mime = mime::validate_mime_type(mime_type)?;
284
-
285
- match validated_mime.as_str() {
286
- #[cfg(feature = "office")]
287
- LEGACY_WORD_MIME_TYPE => {
288
- let conversion = convert_doc_to_docx(content).await?;
289
- let mut result =
290
- extract_bytes_with_extractor(&conversion.converted_bytes, &conversion.target_mime, config).await?;
291
- apply_libreoffice_metadata(&mut result, LEGACY_WORD_MIME_TYPE, &conversion);
292
- return Ok(result);
293
- }
294
- #[cfg(not(feature = "office"))]
295
- LEGACY_WORD_MIME_TYPE => {
296
- return Err(KreuzbergError::UnsupportedFormat(
297
- "Legacy Word conversion requires the `office` feature or LibreOffice support".to_string(),
298
- ));
299
- }
300
- #[cfg(feature = "office")]
301
- LEGACY_POWERPOINT_MIME_TYPE => {
302
- let conversion = convert_ppt_to_pptx(content).await?;
303
- let mut result =
304
- extract_bytes_with_extractor(&conversion.converted_bytes, &conversion.target_mime, config).await?;
305
- apply_libreoffice_metadata(&mut result, LEGACY_POWERPOINT_MIME_TYPE, &conversion);
306
- return Ok(result);
307
- }
308
- #[cfg(not(feature = "office"))]
309
- LEGACY_POWERPOINT_MIME_TYPE => {
310
- return Err(KreuzbergError::UnsupportedFormat(
311
- "Legacy PowerPoint conversion requires the `office` feature or LibreOffice support".to_string(),
312
- ));
313
- }
314
- _ => {}
315
- }
158
+ let validated_mime = mime::validate_mime_type(mime_type)?;
316
159
 
317
- extract_bytes_with_extractor(content, &validated_mime, config).await
318
- }
319
- .await;
320
-
321
- #[cfg(feature = "otel")]
322
- if let Err(ref e) = result {
323
- record_error(e);
160
+ match validated_mime.as_str() {
161
+ #[cfg(feature = "office")]
162
+ LEGACY_WORD_MIME_TYPE => {
163
+ let conversion = convert_doc_to_docx(content).await?;
164
+ let mut result =
165
+ extract_bytes_with_extractor(&conversion.converted_bytes, &conversion.target_mime, config).await?;
166
+ apply_libreoffice_metadata(&mut result, LEGACY_WORD_MIME_TYPE, &conversion);
167
+ return Ok(result);
168
+ }
169
+ #[cfg(not(feature = "office"))]
170
+ LEGACY_WORD_MIME_TYPE => {
171
+ return Err(KreuzbergError::UnsupportedFormat(
172
+ "Legacy Word conversion requires the `office` feature or LibreOffice support".to_string(),
173
+ ));
174
+ }
175
+ #[cfg(feature = "office")]
176
+ LEGACY_POWERPOINT_MIME_TYPE => {
177
+ let conversion = convert_ppt_to_pptx(content).await?;
178
+ let mut result =
179
+ extract_bytes_with_extractor(&conversion.converted_bytes, &conversion.target_mime, config).await?;
180
+ apply_libreoffice_metadata(&mut result, LEGACY_POWERPOINT_MIME_TYPE, &conversion);
181
+ return Ok(result);
182
+ }
183
+ #[cfg(not(feature = "office"))]
184
+ LEGACY_POWERPOINT_MIME_TYPE => {
185
+ return Err(KreuzbergError::UnsupportedFormat(
186
+ "Legacy PowerPoint conversion requires the `office` feature or LibreOffice support".to_string(),
187
+ ));
188
+ }
189
+ _ => {}
324
190
  }
325
191
 
326
- result
192
+ extract_bytes_with_extractor(content, &validated_mime, config).await
327
193
  }
328
194
 
329
195
  /// Extract content from multiple files concurrently.
@@ -346,13 +212,6 @@ pub async fn extract_bytes(content: &[u8], mime_type: &str, config: &ExtractionC
346
212
  ///
347
213
  /// Individual file errors are captured in the result metadata. System errors
348
214
  /// (IO, RuntimeError equivalents) will bubble up and fail the entire batch.
349
- #[cfg(feature = "tokio-runtime")]
350
- #[cfg_attr(feature = "otel", tracing::instrument(
351
- skip(config, paths),
352
- fields(
353
- extraction.batch_size = paths.len(),
354
- )
355
- ))]
356
215
  pub async fn batch_extract_file(
357
216
  paths: Vec<impl AsRef<Path>>,
358
217
  config: &ExtractionConfig,
@@ -367,9 +226,7 @@ pub async fn batch_extract_file(
367
226
 
368
227
  let config = Arc::new(config.clone());
369
228
 
370
- let max_concurrent = config
371
- .max_concurrent_extractions
372
- .unwrap_or_else(|| (num_cpus::get() as f64 * 1.5).ceil() as usize);
229
+ let max_concurrent = config.max_concurrent_extractions.unwrap_or_else(|| num_cpus::get() * 2);
373
230
  let semaphore = Arc::new(Semaphore::new(max_concurrent));
374
231
 
375
232
  let mut tasks = JoinSet::new();
@@ -396,8 +253,11 @@ pub async fn batch_extract_file(
396
253
  results[index] = Some(result);
397
254
  }
398
255
  Ok((index, Err(e))) => {
399
- // All errors (including Io) should create error results
400
- // instead of causing early return that abandons running tasks
256
+ // OSError/RuntimeError must bubble up - system errors need user reports ~keep
257
+ if matches!(e, KreuzbergError::Io(_)) {
258
+ return Err(e);
259
+ }
260
+
401
261
  use crate::types::{ErrorMetadata, Metadata};
402
262
  let metadata = Metadata {
403
263
  error: Some(ErrorMetadata {
@@ -415,7 +275,6 @@ pub async fn batch_extract_file(
415
275
  detected_languages: None,
416
276
  chunks: None,
417
277
  images: None,
418
- pages: None,
419
278
  });
420
279
  }
421
280
  Err(join_err) => {
@@ -443,15 +302,8 @@ pub async fn batch_extract_file(
443
302
  /// # Returns
444
303
  ///
445
304
  /// A vector of `ExtractionResult` in the same order as the input.
446
- #[cfg(feature = "tokio-runtime")]
447
- #[cfg_attr(feature = "otel", tracing::instrument(
448
- skip(config, contents),
449
- fields(
450
- extraction.batch_size = contents.len(),
451
- )
452
- ))]
453
305
  pub async fn batch_extract_bytes(
454
- contents: Vec<(Vec<u8>, String)>,
306
+ contents: Vec<(&[u8], &str)>,
455
307
  config: &ExtractionConfig,
456
308
  ) -> Result<Vec<ExtractionResult>> {
457
309
  use std::sync::Arc;
@@ -465,14 +317,17 @@ pub async fn batch_extract_bytes(
465
317
  let batch_config = config.clone();
466
318
  let config = Arc::new(batch_config);
467
319
 
468
- let max_concurrent = config
469
- .max_concurrent_extractions
470
- .unwrap_or_else(|| (num_cpus::get() as f64 * 1.5).ceil() as usize);
320
+ let max_concurrent = config.max_concurrent_extractions.unwrap_or_else(|| num_cpus::get() * 2);
471
321
  let semaphore = Arc::new(Semaphore::new(max_concurrent));
472
322
 
323
+ let owned_contents: Vec<(Vec<u8>, String)> = contents
324
+ .into_iter()
325
+ .map(|(bytes, mime)| (bytes.to_vec(), mime.to_string()))
326
+ .collect();
327
+
473
328
  let mut tasks = JoinSet::new();
474
329
 
475
- for (index, (bytes, mime_type)) in contents.into_iter().enumerate() {
330
+ for (index, (bytes, mime_type)) in owned_contents.into_iter().enumerate() {
476
331
  let config_clone = Arc::clone(&config);
477
332
  let semaphore_clone = Arc::clone(&semaphore);
478
333
 
@@ -494,8 +349,11 @@ pub async fn batch_extract_bytes(
494
349
  results[index] = Some(result);
495
350
  }
496
351
  Ok((index, Err(e))) => {
497
- // All errors (including Io) should create error results
498
- // instead of causing early return that abandons running tasks
352
+ // OSError/RuntimeError must bubble up - system errors need user reports ~keep
353
+ if matches!(e, KreuzbergError::Io(_)) {
354
+ return Err(e);
355
+ }
356
+
499
357
  use crate::types::{ErrorMetadata, Metadata};
500
358
  let metadata = Metadata {
501
359
  error: Some(ErrorMetadata {
@@ -513,7 +371,6 @@ pub async fn batch_extract_bytes(
513
371
  detected_languages: None,
514
372
  chunks: None,
515
373
  images: None,
516
- pages: None,
517
374
  });
518
375
  }
519
376
  Err(join_err) => {
@@ -533,10 +390,6 @@ pub async fn batch_extract_bytes(
533
390
  ///
534
391
  /// Uses the global Tokio runtime for 100x+ performance improvement over creating
535
392
  /// a new runtime per call. Always uses the global runtime to avoid nested runtime issues.
536
- ///
537
- /// This function is only available with the `tokio-runtime` feature. For WASM targets,
538
- /// use a truly synchronous extraction approach instead.
539
- #[cfg(feature = "tokio-runtime")]
540
393
  pub fn extract_file_sync(
541
394
  path: impl AsRef<Path>,
542
395
  mime_type: Option<&str>,
@@ -549,31 +402,14 @@ pub fn extract_file_sync(
549
402
  ///
550
403
  /// Uses the global Tokio runtime for 100x+ performance improvement over creating
551
404
  /// a new runtime per call.
552
- ///
553
- /// With the `tokio-runtime` feature, this blocks the current thread using the global
554
- /// Tokio runtime. Without it (WASM), this calls a truly synchronous implementation.
555
- #[cfg(feature = "tokio-runtime")]
556
405
  pub fn extract_bytes_sync(content: &[u8], mime_type: &str, config: &ExtractionConfig) -> Result<ExtractionResult> {
557
406
  GLOBAL_RUNTIME.block_on(extract_bytes(content, mime_type, config))
558
407
  }
559
408
 
560
- /// Synchronous wrapper for `extract_bytes` (WASM-compatible version).
561
- ///
562
- /// This is a truly synchronous implementation without tokio runtime dependency.
563
- /// It calls `extract_bytes_sync_impl()` to perform the extraction.
564
- #[cfg(not(feature = "tokio-runtime"))]
565
- pub fn extract_bytes_sync(content: &[u8], mime_type: &str, config: &ExtractionConfig) -> Result<ExtractionResult> {
566
- extract_bytes_sync_impl(content.to_vec(), Some(mime_type.to_string()), Some(config.clone()))
567
- }
568
-
569
409
  /// Synchronous wrapper for `batch_extract_file`.
570
410
  ///
571
411
  /// Uses the global Tokio runtime for 100x+ performance improvement over creating
572
412
  /// a new runtime per call.
573
- ///
574
- /// This function is only available with the `tokio-runtime` feature. For WASM targets,
575
- /// use a truly synchronous extraction approach instead.
576
- #[cfg(feature = "tokio-runtime")]
577
413
  pub fn batch_extract_file_sync(
578
414
  paths: Vec<impl AsRef<Path>>,
579
415
  config: &ExtractionConfig,
@@ -585,109 +421,13 @@ pub fn batch_extract_file_sync(
585
421
  ///
586
422
  /// Uses the global Tokio runtime for 100x+ performance improvement over creating
587
423
  /// a new runtime per call.
588
- ///
589
- /// With the `tokio-runtime` feature, this blocks the current thread using the global
590
- /// Tokio runtime. Without it (WASM), this calls a truly synchronous implementation
591
- /// that iterates through items and calls `extract_bytes_sync()`.
592
- #[cfg(feature = "tokio-runtime")]
593
424
  pub fn batch_extract_bytes_sync(
594
- contents: Vec<(Vec<u8>, String)>,
425
+ contents: Vec<(&[u8], &str)>,
595
426
  config: &ExtractionConfig,
596
427
  ) -> Result<Vec<ExtractionResult>> {
597
428
  GLOBAL_RUNTIME.block_on(batch_extract_bytes(contents, config))
598
429
  }
599
430
 
600
- /// Synchronous wrapper for `batch_extract_bytes` (WASM-compatible version).
601
- ///
602
- /// This is a truly synchronous implementation that iterates through items
603
- /// and calls `extract_bytes_sync()` for each.
604
- #[cfg(not(feature = "tokio-runtime"))]
605
- pub fn batch_extract_bytes_sync(
606
- contents: Vec<(Vec<u8>, String)>,
607
- config: &ExtractionConfig,
608
- ) -> Result<Vec<ExtractionResult>> {
609
- let mut results = Vec::with_capacity(contents.len());
610
- for (content, mime_type) in contents {
611
- let result = extract_bytes_sync(&content, &mime_type, config);
612
- results.push(result.unwrap_or_else(|e| {
613
- use crate::types::{ErrorMetadata, Metadata};
614
- ExtractionResult {
615
- content: format!("Error: {}", e),
616
- mime_type: pool_mime_type("text/plain"),
617
- metadata: Metadata {
618
- error: Some(ErrorMetadata {
619
- error_type: format!("{:?}", e),
620
- message: e.to_string(),
621
- }),
622
- ..Default::default()
623
- },
624
- tables: vec![],
625
- detected_languages: None,
626
- chunks: None,
627
- images: None,
628
- pages: None,
629
- }
630
- }));
631
- }
632
- Ok(results)
633
- }
634
-
635
- /// Synchronous extraction implementation for WASM compatibility.
636
- ///
637
- /// This function performs extraction without requiring a tokio runtime.
638
- /// It calls the sync extractor methods directly.
639
- ///
640
- /// # Arguments
641
- ///
642
- /// * `content` - The byte content to extract
643
- /// * `mime_type` - Optional MIME type to validate/use
644
- /// * `config` - Optional extraction configuration
645
- ///
646
- /// # Returns
647
- ///
648
- /// An `ExtractionResult` or a `KreuzbergError`
649
- ///
650
- /// # Implementation Notes
651
- ///
652
- /// This is called when the `tokio-runtime` feature is disabled.
653
- /// It replicates the logic of `extract_bytes` but uses synchronous extractor methods.
654
- #[cfg(not(feature = "tokio-runtime"))]
655
- fn extract_bytes_sync_impl(
656
- content: Vec<u8>,
657
- mime_type: Option<String>,
658
- config: Option<ExtractionConfig>,
659
- ) -> Result<ExtractionResult> {
660
- use crate::core::mime;
661
-
662
- let config = config.unwrap_or_default();
663
-
664
- let validated_mime = if let Some(mime) = mime_type {
665
- mime::validate_mime_type(&mime)?
666
- } else {
667
- return Err(KreuzbergError::Validation {
668
- message: "MIME type is required for synchronous extraction".to_string(),
669
- source: None,
670
- });
671
- };
672
-
673
- crate::extractors::ensure_initialized()?;
674
-
675
- let extractor = get_extractor(&validated_mime)?;
676
-
677
- let sync_extractor = extractor.as_sync_extractor().ok_or_else(|| {
678
- KreuzbergError::UnsupportedFormat(format!(
679
- "Extractor for '{}' does not support synchronous extraction",
680
- validated_mime
681
- ))
682
- })?;
683
-
684
- let mut result = sync_extractor.extract_sync(&content, &validated_mime, &config)?;
685
-
686
- result = crate::core::pipeline::run_pipeline_sync(result, &config)?;
687
-
688
- Ok(result)
689
- }
690
-
691
431
  async fn extract_file_with_extractor(
692
432
  path: &Path,
693
433
  mime_type: &str,
@@ -714,29 +454,13 @@ async fn extract_bytes_with_extractor(
714
454
  Ok(result)
715
455
  }
716
456
 
717
- /// Convert a MIME type string to a pooled String for efficient deduplication.
718
- ///
719
- /// This function uses the string interning pool to reduce memory allocations
720
- /// for repeatedly used MIME types (e.g., "application/pdf" appears thousands of times
721
- /// in batch processing). The interned string is converted to an owned String to satisfy
722
- /// the ExtractionResult::mime_type field type.
723
- ///
724
- /// # Performance
725
- ///
726
- /// For pre-interned MIME types (all common types), this is O(1) pointer dereference.
727
- /// For unknown MIME types, this allocates once per unique type and caches the result.
728
- #[allow(dead_code)]
729
- fn pool_mime_type(mime_type: &str) -> String {
730
- intern_mime_type(mime_type).to_string()
731
- }
732
-
733
457
  #[cfg(feature = "office")]
734
458
  fn apply_libreoffice_metadata(
735
459
  result: &mut ExtractionResult,
736
460
  legacy_mime: &str,
737
461
  conversion: &LibreOfficeConversionResult,
738
462
  ) {
739
- result.mime_type = pool_mime_type(legacy_mime);
463
+ result.mime_type = legacy_mime.to_string();
740
464
  result.metadata.additional.insert(
741
465
  "libreoffice_conversion".to_string(),
742
466
  json!({
@@ -756,10 +480,6 @@ mod tests {
756
480
  use std::io::Write;
757
481
  use tempfile::tempdir;
758
482
 
759
- fn assert_text_content(actual: &str, expected: &str) {
760
- assert_eq!(actual.trim_end_matches('\n'), expected);
761
- }
762
-
763
483
  #[tokio::test]
764
484
  async fn test_extract_file_basic() {
765
485
  let dir = tempdir().unwrap();
@@ -772,7 +492,7 @@ mod tests {
772
492
 
773
493
  assert!(result.is_ok());
774
494
  let result = result.unwrap();
775
- assert_text_content(&result.content, "Hello, world!");
495
+ assert_eq!(result.content, "Hello, world!");
776
496
  assert_eq!(result.mime_type, "text/plain");
777
497
  }
778
498
 
@@ -805,7 +525,7 @@ mod tests {
805
525
 
806
526
  assert!(result.is_ok());
807
527
  let result = result.unwrap();
808
- assert_text_content(&result.content, "test content");
528
+ assert_eq!(result.content, "test content");
809
529
  assert_eq!(result.mime_type, "text/plain");
810
530
  }
811
531
 
@@ -833,8 +553,8 @@ mod tests {
833
553
  assert!(results.is_ok());
834
554
  let results = results.unwrap();
835
555
  assert_eq!(results.len(), 2);
836
- assert_text_content(&results[0].content, "content 1");
837
- assert_text_content(&results[1].content, "content 2");
556
+ assert_eq!(results[0].content, "content 1");
557
+ assert_eq!(results[1].content, "content 2");
838
558
  }
839
559
 
840
560
  #[tokio::test]
@@ -854,17 +574,13 @@ mod tests {
854
574
  (b"content 1".as_slice(), "text/plain"),
855
575
  (b"content 2".as_slice(), "text/plain"),
856
576
  ];
857
- let owned_contents: Vec<(Vec<u8>, String)> = contents
858
- .into_iter()
859
- .map(|(bytes, mime)| (bytes.to_vec(), mime.to_string()))
860
- .collect();
861
- let results = batch_extract_bytes(owned_contents, &config).await;
577
+ let results = batch_extract_bytes(contents, &config).await;
862
578
 
863
579
  assert!(results.is_ok());
864
580
  let results = results.unwrap();
865
581
  assert_eq!(results.len(), 2);
866
- assert_text_content(&results[0].content, "content 1");
867
- assert_text_content(&results[1].content, "content 2");
582
+ assert_eq!(results[0].content, "content 1");
583
+ assert_eq!(results[1].content, "content 2");
868
584
  }
869
585
 
870
586
  #[test]
@@ -877,8 +593,7 @@ mod tests {
877
593
 
878
594
  let result = extract_file_sync(&file_path, None, &config);
879
595
  assert!(result.is_ok());
880
- let result = result.unwrap();
881
- assert_text_content(&result.content, "sync test");
596
+ assert_eq!(result.unwrap().content, "sync test");
882
597
 
883
598
  let result = extract_bytes_sync(b"test", "text/plain", &config);
884
599
  assert!(result.is_ok());
@@ -890,14 +605,12 @@ mod tests {
890
605
 
891
606
  let result1 = extract_bytes(b"test 1", "text/plain", &config).await;
892
607
  assert!(result1.is_ok());
893
- let result1 = result1.unwrap();
894
608
 
895
609
  let result2 = extract_bytes(b"test 2", "text/plain", &config).await;
896
610
  assert!(result2.is_ok());
897
- let result2 = result2.unwrap();
898
611
 
899
- assert_text_content(&result1.content, "test 1");
900
- assert_text_content(&result2.content, "test 2");
612
+ assert_eq!(result1.unwrap().content, "test 1");
613
+ assert_eq!(result2.unwrap().content, "test 2");
901
614
 
902
615
  let result3 = extract_bytes(b"# test 3", "text/markdown", &config).await;
903
616
  assert!(result3.is_ok());
@@ -963,8 +676,7 @@ mod tests {
963
676
  let result = extract_file(&file_path, None, &config).await;
964
677
 
965
678
  assert!(result.is_ok());
966
- let result = result.unwrap();
967
- assert_text_content(&result.content, "content");
679
+ assert_eq!(result.unwrap().content, "content");
968
680
  }
969
681
 
970
682
  #[tokio::test]
@@ -1004,7 +716,7 @@ mod tests {
1004
716
  assert!(results.is_ok());
1005
717
  let results = results.unwrap();
1006
718
  assert_eq!(results.len(), 2);
1007
- assert_text_content(&results[0].content, "valid content");
719
+ assert_eq!(results[0].content, "valid content");
1008
720
  assert!(results[1].metadata.error.is_some());
1009
721
  }
1010
722
 
@@ -1016,18 +728,14 @@ mod tests {
1016
728
  (b"invalid".as_slice(), "invalid/mime"),
1017
729
  (b"valid 2".as_slice(), "text/plain"),
1018
730
  ];
1019
- let owned_contents: Vec<(Vec<u8>, String)> = contents
1020
- .into_iter()
1021
- .map(|(bytes, mime)| (bytes.to_vec(), mime.to_string()))
1022
- .collect();
1023
- let results = batch_extract_bytes(owned_contents, &config).await;
731
+ let results = batch_extract_bytes(contents, &config).await;
1024
732
 
1025
733
  assert!(results.is_ok());
1026
734
  let results = results.unwrap();
1027
735
  assert_eq!(results.len(), 3);
1028
- assert_text_content(&results[0].content, "valid 1");
736
+ assert_eq!(results[0].content, "valid 1");
1029
737
  assert!(results[1].metadata.error.is_some());
1030
- assert_text_content(&results[2].content, "valid 2");
738
+ assert_eq!(results[2].content, "valid 2");
1031
739
  }
1032
740
 
1033
741
  #[tokio::test]
@@ -1037,11 +745,7 @@ mod tests {
1037
745
  (b"test 1".as_slice(), "invalid/mime1"),
1038
746
  (b"test 2".as_slice(), "invalid/mime2"),
1039
747
  ];
1040
- let owned_contents: Vec<(Vec<u8>, String)> = contents
1041
- .into_iter()
1042
- .map(|(bytes, mime)| (bytes.to_vec(), mime.to_string()))
1043
- .collect();
1044
- let results = batch_extract_bytes(owned_contents, &config).await;
748
+ let results = batch_extract_bytes(contents, &config).await;
1045
749
 
1046
750
  assert!(results.is_ok());
1047
751
  let results = results.unwrap();
@@ -1058,8 +762,7 @@ mod tests {
1058
762
 
1059
763
  assert!(result.is_ok());
1060
764
  let result = result.unwrap();
1061
- let trimmed_len = result.content.trim_end_matches('\n').len();
1062
- assert_eq!(trimmed_len, 10_000_000);
765
+ assert_eq!(result.content.len(), 10_000_000);
1063
766
  }
1064
767
 
1065
768
  #[tokio::test]
@@ -1084,7 +787,7 @@ mod tests {
1084
787
  assert_eq!(results.len(), 100);
1085
788
 
1086
789
  for (i, result) in results.iter().enumerate() {
1087
- assert_text_content(&result.content, &format!("content {}", i));
790
+ assert_eq!(result.content, format!("content {}", i));
1088
791
  }
1089
792
  }
1090
793
 
@@ -1137,7 +840,7 @@ mod tests {
1137
840
  #[test]
1138
841
  fn test_sync_wrapper_batch_bytes_empty() {
1139
842
  let config = ExtractionConfig::default();
1140
- let contents: Vec<(Vec<u8>, String)> = vec![];
843
+ let contents: Vec<(&[u8], &str)> = vec![];
1141
844
  let results = batch_extract_bytes_sync(contents, &config);
1142
845
 
1143
846
  assert!(results.is_ok());