kreuzberg 4.0.0.pre.rc.13 → 4.0.0.pre.rc.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (369) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +14 -14
  3. data/.rspec +3 -3
  4. data/.rubocop.yaml +1 -1
  5. data/.rubocop.yml +538 -538
  6. data/Gemfile +8 -8
  7. data/Gemfile.lock +105 -2
  8. data/README.md +454 -454
  9. data/Rakefile +33 -25
  10. data/Steepfile +47 -47
  11. data/examples/async_patterns.rb +341 -341
  12. data/ext/kreuzberg_rb/extconf.rb +45 -45
  13. data/ext/kreuzberg_rb/native/.cargo/config.toml +2 -2
  14. data/ext/kreuzberg_rb/native/Cargo.lock +6940 -6941
  15. data/ext/kreuzberg_rb/native/Cargo.toml +54 -54
  16. data/ext/kreuzberg_rb/native/README.md +425 -425
  17. data/ext/kreuzberg_rb/native/build.rs +15 -15
  18. data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -11
  19. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -14
  20. data/ext/kreuzberg_rb/native/include/strings.h +20 -20
  21. data/ext/kreuzberg_rb/native/include/unistd.h +47 -47
  22. data/ext/kreuzberg_rb/native/src/lib.rs +3158 -3158
  23. data/extconf.rb +28 -28
  24. data/kreuzberg.gemspec +214 -214
  25. data/lib/kreuzberg/api_proxy.rb +142 -142
  26. data/lib/kreuzberg/cache_api.rb +81 -81
  27. data/lib/kreuzberg/cli.rb +55 -55
  28. data/lib/kreuzberg/cli_proxy.rb +127 -127
  29. data/lib/kreuzberg/config.rb +724 -724
  30. data/lib/kreuzberg/error_context.rb +80 -80
  31. data/lib/kreuzberg/errors.rb +118 -118
  32. data/lib/kreuzberg/extraction_api.rb +340 -340
  33. data/lib/kreuzberg/mcp_proxy.rb +186 -186
  34. data/lib/kreuzberg/ocr_backend_protocol.rb +113 -113
  35. data/lib/kreuzberg/post_processor_protocol.rb +86 -86
  36. data/lib/kreuzberg/result.rb +279 -279
  37. data/lib/kreuzberg/setup_lib_path.rb +80 -80
  38. data/lib/kreuzberg/validator_protocol.rb +89 -89
  39. data/lib/kreuzberg/version.rb +5 -5
  40. data/lib/kreuzberg.rb +109 -109
  41. data/lib/{pdfium.dll → libpdfium.dylib} +0 -0
  42. data/sig/kreuzberg/internal.rbs +184 -184
  43. data/sig/kreuzberg.rbs +546 -546
  44. data/spec/binding/cache_spec.rb +227 -227
  45. data/spec/binding/cli_proxy_spec.rb +85 -85
  46. data/spec/binding/cli_spec.rb +55 -55
  47. data/spec/binding/config_spec.rb +345 -345
  48. data/spec/binding/config_validation_spec.rb +283 -283
  49. data/spec/binding/error_handling_spec.rb +213 -213
  50. data/spec/binding/errors_spec.rb +66 -66
  51. data/spec/binding/plugins/ocr_backend_spec.rb +307 -307
  52. data/spec/binding/plugins/postprocessor_spec.rb +269 -269
  53. data/spec/binding/plugins/validator_spec.rb +274 -274
  54. data/spec/fixtures/config.toml +39 -39
  55. data/spec/fixtures/config.yaml +41 -41
  56. data/spec/fixtures/invalid_config.toml +4 -4
  57. data/spec/smoke/package_spec.rb +178 -178
  58. data/spec/spec_helper.rb +42 -42
  59. data/vendor/Cargo.toml +1 -1
  60. data/vendor/kreuzberg/Cargo.toml +5 -5
  61. data/vendor/kreuzberg/README.md +230 -230
  62. data/vendor/kreuzberg/benches/otel_overhead.rs +48 -48
  63. data/vendor/kreuzberg/build.rs +843 -843
  64. data/vendor/kreuzberg/src/api/error.rs +81 -81
  65. data/vendor/kreuzberg/src/api/handlers.rs +199 -199
  66. data/vendor/kreuzberg/src/api/mod.rs +79 -79
  67. data/vendor/kreuzberg/src/api/server.rs +353 -353
  68. data/vendor/kreuzberg/src/api/types.rs +170 -170
  69. data/vendor/kreuzberg/src/cache/mod.rs +1167 -1167
  70. data/vendor/kreuzberg/src/chunking/mod.rs +1877 -1877
  71. data/vendor/kreuzberg/src/chunking/processor.rs +220 -220
  72. data/vendor/kreuzberg/src/core/batch_mode.rs +95 -95
  73. data/vendor/kreuzberg/src/core/config.rs +1080 -1080
  74. data/vendor/kreuzberg/src/core/extractor.rs +1156 -1156
  75. data/vendor/kreuzberg/src/core/io.rs +329 -329
  76. data/vendor/kreuzberg/src/core/mime.rs +605 -605
  77. data/vendor/kreuzberg/src/core/mod.rs +47 -47
  78. data/vendor/kreuzberg/src/core/pipeline.rs +1184 -1184
  79. data/vendor/kreuzberg/src/embeddings.rs +500 -500
  80. data/vendor/kreuzberg/src/error.rs +431 -431
  81. data/vendor/kreuzberg/src/extraction/archive.rs +954 -954
  82. data/vendor/kreuzberg/src/extraction/docx.rs +398 -398
  83. data/vendor/kreuzberg/src/extraction/email.rs +854 -854
  84. data/vendor/kreuzberg/src/extraction/excel.rs +688 -688
  85. data/vendor/kreuzberg/src/extraction/html.rs +601 -601
  86. data/vendor/kreuzberg/src/extraction/image.rs +491 -491
  87. data/vendor/kreuzberg/src/extraction/libreoffice.rs +574 -574
  88. data/vendor/kreuzberg/src/extraction/markdown.rs +213 -213
  89. data/vendor/kreuzberg/src/extraction/mod.rs +81 -81
  90. data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -398
  91. data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -247
  92. data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -240
  93. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -130
  94. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +284 -284
  95. data/vendor/kreuzberg/src/extraction/pptx.rs +3100 -3100
  96. data/vendor/kreuzberg/src/extraction/structured.rs +490 -490
  97. data/vendor/kreuzberg/src/extraction/table.rs +328 -328
  98. data/vendor/kreuzberg/src/extraction/text.rs +269 -269
  99. data/vendor/kreuzberg/src/extraction/xml.rs +333 -333
  100. data/vendor/kreuzberg/src/extractors/archive.rs +447 -447
  101. data/vendor/kreuzberg/src/extractors/bibtex.rs +470 -470
  102. data/vendor/kreuzberg/src/extractors/docbook.rs +504 -504
  103. data/vendor/kreuzberg/src/extractors/docx.rs +400 -400
  104. data/vendor/kreuzberg/src/extractors/email.rs +157 -157
  105. data/vendor/kreuzberg/src/extractors/epub.rs +708 -708
  106. data/vendor/kreuzberg/src/extractors/excel.rs +345 -345
  107. data/vendor/kreuzberg/src/extractors/fictionbook.rs +492 -492
  108. data/vendor/kreuzberg/src/extractors/html.rs +407 -407
  109. data/vendor/kreuzberg/src/extractors/image.rs +219 -219
  110. data/vendor/kreuzberg/src/extractors/jats.rs +1054 -1054
  111. data/vendor/kreuzberg/src/extractors/jupyter.rs +368 -368
  112. data/vendor/kreuzberg/src/extractors/latex.rs +653 -653
  113. data/vendor/kreuzberg/src/extractors/markdown.rs +701 -701
  114. data/vendor/kreuzberg/src/extractors/mod.rs +429 -429
  115. data/vendor/kreuzberg/src/extractors/odt.rs +628 -628
  116. data/vendor/kreuzberg/src/extractors/opml.rs +635 -635
  117. data/vendor/kreuzberg/src/extractors/orgmode.rs +529 -529
  118. data/vendor/kreuzberg/src/extractors/pdf.rs +749 -749
  119. data/vendor/kreuzberg/src/extractors/pptx.rs +267 -267
  120. data/vendor/kreuzberg/src/extractors/rst.rs +577 -577
  121. data/vendor/kreuzberg/src/extractors/rtf.rs +809 -809
  122. data/vendor/kreuzberg/src/extractors/security.rs +484 -484
  123. data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -367
  124. data/vendor/kreuzberg/src/extractors/structured.rs +142 -142
  125. data/vendor/kreuzberg/src/extractors/text.rs +265 -265
  126. data/vendor/kreuzberg/src/extractors/typst.rs +651 -651
  127. data/vendor/kreuzberg/src/extractors/xml.rs +147 -147
  128. data/vendor/kreuzberg/src/image/dpi.rs +164 -164
  129. data/vendor/kreuzberg/src/image/mod.rs +6 -6
  130. data/vendor/kreuzberg/src/image/preprocessing.rs +417 -417
  131. data/vendor/kreuzberg/src/image/resize.rs +89 -89
  132. data/vendor/kreuzberg/src/keywords/config.rs +154 -154
  133. data/vendor/kreuzberg/src/keywords/mod.rs +237 -237
  134. data/vendor/kreuzberg/src/keywords/processor.rs +275 -275
  135. data/vendor/kreuzberg/src/keywords/rake.rs +293 -293
  136. data/vendor/kreuzberg/src/keywords/types.rs +68 -68
  137. data/vendor/kreuzberg/src/keywords/yake.rs +163 -163
  138. data/vendor/kreuzberg/src/language_detection/mod.rs +985 -985
  139. data/vendor/kreuzberg/src/language_detection/processor.rs +219 -219
  140. data/vendor/kreuzberg/src/lib.rs +113 -113
  141. data/vendor/kreuzberg/src/mcp/mod.rs +35 -35
  142. data/vendor/kreuzberg/src/mcp/server.rs +2076 -2076
  143. data/vendor/kreuzberg/src/ocr/cache.rs +469 -469
  144. data/vendor/kreuzberg/src/ocr/error.rs +37 -37
  145. data/vendor/kreuzberg/src/ocr/hocr.rs +216 -216
  146. data/vendor/kreuzberg/src/ocr/mod.rs +58 -58
  147. data/vendor/kreuzberg/src/ocr/processor.rs +863 -863
  148. data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -4
  149. data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -144
  150. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +452 -452
  151. data/vendor/kreuzberg/src/ocr/types.rs +393 -393
  152. data/vendor/kreuzberg/src/ocr/utils.rs +47 -47
  153. data/vendor/kreuzberg/src/ocr/validation.rs +206 -206
  154. data/vendor/kreuzberg/src/panic_context.rs +154 -154
  155. data/vendor/kreuzberg/src/pdf/bindings.rs +44 -44
  156. data/vendor/kreuzberg/src/pdf/bundled.rs +346 -346
  157. data/vendor/kreuzberg/src/pdf/error.rs +130 -130
  158. data/vendor/kreuzberg/src/pdf/images.rs +139 -139
  159. data/vendor/kreuzberg/src/pdf/metadata.rs +489 -489
  160. data/vendor/kreuzberg/src/pdf/mod.rs +68 -68
  161. data/vendor/kreuzberg/src/pdf/rendering.rs +368 -368
  162. data/vendor/kreuzberg/src/pdf/table.rs +420 -420
  163. data/vendor/kreuzberg/src/pdf/text.rs +240 -240
  164. data/vendor/kreuzberg/src/plugins/extractor.rs +1044 -1044
  165. data/vendor/kreuzberg/src/plugins/mod.rs +212 -212
  166. data/vendor/kreuzberg/src/plugins/ocr.rs +639 -639
  167. data/vendor/kreuzberg/src/plugins/processor.rs +650 -650
  168. data/vendor/kreuzberg/src/plugins/registry.rs +1339 -1339
  169. data/vendor/kreuzberg/src/plugins/traits.rs +258 -258
  170. data/vendor/kreuzberg/src/plugins/validator.rs +967 -967
  171. data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -1470
  172. data/vendor/kreuzberg/src/text/mod.rs +25 -25
  173. data/vendor/kreuzberg/src/text/quality.rs +697 -697
  174. data/vendor/kreuzberg/src/text/quality_processor.rs +219 -219
  175. data/vendor/kreuzberg/src/text/string_utils.rs +217 -217
  176. data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -164
  177. data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -100
  178. data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -796
  179. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -902
  180. data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -160
  181. data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -619
  182. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -147
  183. data/vendor/kreuzberg/src/types.rs +1055 -1055
  184. data/vendor/kreuzberg/src/utils/mod.rs +17 -17
  185. data/vendor/kreuzberg/src/utils/quality.rs +959 -959
  186. data/vendor/kreuzberg/src/utils/string_utils.rs +381 -381
  187. data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -53
  188. data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -482
  189. data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -261
  190. data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -400
  191. data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -1205
  192. data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -280
  193. data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -425
  194. data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -172
  195. data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -622
  196. data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -849
  197. data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -1300
  198. data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -175
  199. data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -734
  200. data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -37
  201. data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -100
  202. data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -801
  203. data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -849
  204. data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -693
  205. data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -111
  206. data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -162
  207. data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -226
  208. data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -41
  209. data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -196
  210. data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -227
  211. data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -181
  212. data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -791
  213. data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -47
  214. data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -760
  215. data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -634
  216. data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -136
  217. data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -84
  218. data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -681
  219. data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -64
  220. data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -51
  221. data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -476
  222. data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -163
  223. data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -1
  224. data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -101
  225. data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -477
  226. data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -490
  227. data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -415
  228. data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -223
  229. data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -331
  230. data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -562
  231. data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -436
  232. data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -561
  233. data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -193
  234. data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -420
  235. data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -448
  236. data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -32
  237. data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -33
  238. data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -420
  239. data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -76
  240. data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -129
  241. data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -54
  242. data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -118
  243. data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -149
  244. data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -506
  245. data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -75
  246. data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -519
  247. data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -647
  248. data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -62
  249. data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -796
  250. data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -31
  251. data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -52
  252. data/vendor/kreuzberg/tests/api_tests.rs +966 -966
  253. data/vendor/kreuzberg/tests/archive_integration.rs +545 -545
  254. data/vendor/kreuzberg/tests/batch_orchestration.rs +556 -556
  255. data/vendor/kreuzberg/tests/batch_processing.rs +318 -318
  256. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -421
  257. data/vendor/kreuzberg/tests/concurrency_stress.rs +533 -533
  258. data/vendor/kreuzberg/tests/config_features.rs +612 -612
  259. data/vendor/kreuzberg/tests/config_loading_tests.rs +416 -416
  260. data/vendor/kreuzberg/tests/core_integration.rs +510 -510
  261. data/vendor/kreuzberg/tests/csv_integration.rs +414 -414
  262. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +500 -500
  263. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -122
  264. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -370
  265. data/vendor/kreuzberg/tests/email_integration.rs +327 -327
  266. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -275
  267. data/vendor/kreuzberg/tests/error_handling.rs +402 -402
  268. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -228
  269. data/vendor/kreuzberg/tests/format_integration.rs +164 -164
  270. data/vendor/kreuzberg/tests/helpers/mod.rs +142 -142
  271. data/vendor/kreuzberg/tests/html_table_test.rs +551 -551
  272. data/vendor/kreuzberg/tests/image_integration.rs +255 -255
  273. data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -139
  274. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -639
  275. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -704
  276. data/vendor/kreuzberg/tests/keywords_integration.rs +479 -479
  277. data/vendor/kreuzberg/tests/keywords_quality.rs +509 -509
  278. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -496
  279. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -490
  280. data/vendor/kreuzberg/tests/mime_detection.rs +429 -429
  281. data/vendor/kreuzberg/tests/ocr_configuration.rs +514 -514
  282. data/vendor/kreuzberg/tests/ocr_errors.rs +698 -698
  283. data/vendor/kreuzberg/tests/ocr_quality.rs +629 -629
  284. data/vendor/kreuzberg/tests/ocr_stress.rs +469 -469
  285. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +674 -674
  286. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -616
  287. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -822
  288. data/vendor/kreuzberg/tests/pdf_integration.rs +45 -45
  289. data/vendor/kreuzberg/tests/pdfium_linking.rs +374 -374
  290. data/vendor/kreuzberg/tests/pipeline_integration.rs +1436 -1436
  291. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +776 -776
  292. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +560 -560
  293. data/vendor/kreuzberg/tests/plugin_system.rs +927 -927
  294. data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -783
  295. data/vendor/kreuzberg/tests/registry_integration_tests.rs +587 -587
  296. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +694 -694
  297. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +775 -775
  298. data/vendor/kreuzberg/tests/security_validation.rs +416 -416
  299. data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -888
  300. data/vendor/kreuzberg/tests/test_fastembed.rs +631 -631
  301. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1260 -1260
  302. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +648 -648
  303. data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -87
  304. data/vendor/kreuzberg-ffi/Cargo.toml +1 -1
  305. data/vendor/kreuzberg-ffi/README.md +851 -851
  306. data/vendor/kreuzberg-ffi/build.rs +176 -176
  307. data/vendor/kreuzberg-ffi/cbindgen.toml +27 -27
  308. data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +12 -12
  309. data/vendor/kreuzberg-ffi/kreuzberg.h +1087 -1087
  310. data/vendor/kreuzberg-ffi/src/lib.rs +3616 -3616
  311. data/vendor/kreuzberg-ffi/src/panic_shield.rs +247 -247
  312. data/vendor/kreuzberg-ffi/tests.disabled/README.md +48 -48
  313. data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +299 -299
  314. data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +346 -346
  315. data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +232 -232
  316. data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +470 -470
  317. data/vendor/kreuzberg-tesseract/.commitlintrc.json +13 -13
  318. data/vendor/kreuzberg-tesseract/.crate-ignore +2 -2
  319. data/vendor/kreuzberg-tesseract/Cargo.lock +2933 -2933
  320. data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
  321. data/vendor/kreuzberg-tesseract/LICENSE +22 -22
  322. data/vendor/kreuzberg-tesseract/README.md +399 -399
  323. data/vendor/kreuzberg-tesseract/build.rs +1354 -1354
  324. data/vendor/kreuzberg-tesseract/patches/README.md +71 -71
  325. data/vendor/kreuzberg-tesseract/patches/tesseract.diff +199 -199
  326. data/vendor/kreuzberg-tesseract/src/api.rs +1371 -1371
  327. data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +77 -77
  328. data/vendor/kreuzberg-tesseract/src/enums.rs +297 -297
  329. data/vendor/kreuzberg-tesseract/src/error.rs +81 -81
  330. data/vendor/kreuzberg-tesseract/src/lib.rs +145 -145
  331. data/vendor/kreuzberg-tesseract/src/monitor.rs +57 -57
  332. data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +197 -197
  333. data/vendor/kreuzberg-tesseract/src/page_iterator.rs +253 -253
  334. data/vendor/kreuzberg-tesseract/src/result_iterator.rs +286 -286
  335. data/vendor/kreuzberg-tesseract/src/result_renderer.rs +183 -183
  336. data/vendor/kreuzberg-tesseract/tests/integration_test.rs +211 -211
  337. data/vendor/rb-sys/.cargo_vcs_info.json +5 -5
  338. data/vendor/rb-sys/Cargo.lock +393 -393
  339. data/vendor/rb-sys/Cargo.toml +70 -70
  340. data/vendor/rb-sys/Cargo.toml.orig +57 -57
  341. data/vendor/rb-sys/LICENSE-APACHE +190 -190
  342. data/vendor/rb-sys/LICENSE-MIT +21 -21
  343. data/vendor/rb-sys/build/features.rs +111 -111
  344. data/vendor/rb-sys/build/main.rs +286 -286
  345. data/vendor/rb-sys/build/stable_api_config.rs +155 -155
  346. data/vendor/rb-sys/build/version.rs +50 -50
  347. data/vendor/rb-sys/readme.md +36 -36
  348. data/vendor/rb-sys/src/bindings.rs +21 -21
  349. data/vendor/rb-sys/src/hidden.rs +11 -11
  350. data/vendor/rb-sys/src/lib.rs +35 -35
  351. data/vendor/rb-sys/src/macros.rs +371 -371
  352. data/vendor/rb-sys/src/memory.rs +53 -53
  353. data/vendor/rb-sys/src/ruby_abi_version.rs +38 -38
  354. data/vendor/rb-sys/src/special_consts.rs +31 -31
  355. data/vendor/rb-sys/src/stable_api/compiled.c +179 -179
  356. data/vendor/rb-sys/src/stable_api/compiled.rs +257 -257
  357. data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +324 -324
  358. data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +332 -332
  359. data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +325 -325
  360. data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +323 -323
  361. data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +339 -339
  362. data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +339 -339
  363. data/vendor/rb-sys/src/stable_api.rs +260 -260
  364. data/vendor/rb-sys/src/symbol.rs +31 -31
  365. data/vendor/rb-sys/src/tracking_allocator.rs +330 -330
  366. data/vendor/rb-sys/src/utils.rs +89 -89
  367. data/vendor/rb-sys/src/value_type.rs +7 -7
  368. metadata +73 -4
  369. data/vendor/kreuzberg-ffi/kreuzberg-ffi-install.pc +0 -12
@@ -1,1156 +1,1156 @@
1
- //! Main extraction entry points.
2
- //!
3
- //! This module provides the primary API for extracting content from files and byte arrays.
4
- //! It orchestrates the entire extraction pipeline: cache checking, MIME detection,
5
- //! extractor selection, extraction, post-processing, and cache storage.
6
- //!
7
- //! # Functions
8
- //!
9
- //! - [`extract_file`] - Extract content from a file path
10
- //! - [`extract_bytes`] - Extract content from a byte array
11
- //! - [`batch_extract_file`] - Extract content from multiple files concurrently
12
- //! - [`batch_extract_bytes`] - Extract content from multiple byte arrays concurrently
13
-
14
- use crate::core::config::ExtractionConfig;
15
- use crate::core::mime::{LEGACY_POWERPOINT_MIME_TYPE, LEGACY_WORD_MIME_TYPE};
16
- #[cfg(feature = "office")]
17
- use crate::extraction::libreoffice::{convert_doc_to_docx, convert_ppt_to_pptx};
18
- use crate::plugins::DocumentExtractor;
19
- use crate::types::ExtractionResult;
20
- #[cfg(feature = "office")]
21
- use crate::types::LibreOfficeConversionResult;
22
- use crate::{KreuzbergError, Result};
23
- #[cfg(feature = "tokio-runtime")]
24
- use once_cell::sync::Lazy;
25
- #[cfg(feature = "office")]
26
- use serde_json::json;
27
- use std::path::Path;
28
- use std::sync::Arc;
29
-
30
- /// Record error information in the current OpenTelemetry span.
31
- ///
32
- /// This function records error details in the current span when the `otel` feature is enabled.
33
- /// It marks the span with `otel.status_code=ERROR` and adds error type and message fields.
34
- ///
35
- /// # Arguments
36
- ///
37
- /// * `error` - The error to record in the span
38
- ///
39
- /// # Example
40
- ///
41
- /// ```rust,ignore
42
- /// let result = extract_file("doc.pdf", None, &config).await;
43
- /// #[cfg(feature = "otel")]
44
- /// if let Err(ref e) = result {
45
- /// record_error(e);
46
- /// }
47
- /// result
48
- /// ```
49
- #[cfg(feature = "otel")]
50
- fn record_error(error: &KreuzbergError) {
51
- let span = tracing::Span::current();
52
- span.record("otel.status_code", "ERROR");
53
- span.record("error.type", format!("{:?}", error));
54
- span.record("error.message", error.to_string());
55
- }
56
-
57
- /// Sanitize a file path to return only the filename.
58
- ///
59
- /// This function extracts the filename from a path to avoid recording
60
- /// potentially sensitive full file paths in telemetry data.
61
- ///
62
- /// # Arguments
63
- ///
64
- /// * `path` - The path to sanitize
65
- ///
66
- /// # Returns
67
- ///
68
- /// The filename as a string, or "unknown" if extraction fails
69
- ///
70
- /// # Security
71
- ///
72
- /// This prevents PII (personally identifiable information) from appearing in
73
- /// traces by only recording filenames instead of full paths.
74
- ///
75
- /// # Example
76
- ///
77
- /// ```rust,ignore
78
- /// let path = Path::new("/home/user/documents/secret.pdf");
79
- /// assert_eq!(sanitize_path(path), "secret.pdf");
80
- /// ```
81
- #[cfg(feature = "otel")]
82
- fn sanitize_path(path: &Path) -> String {
83
- path.file_name()
84
- .and_then(|n| n.to_str())
85
- .unwrap_or("unknown")
86
- .to_string()
87
- }
88
-
89
- /// Global Tokio runtime for synchronous operations.
90
- ///
91
- /// This runtime is lazily initialized on first use and shared across all sync wrappers.
92
- /// Using a global runtime instead of creating one per call provides 100x+ performance improvement.
93
- ///
94
- /// # Safety
95
- ///
96
- /// The `.expect()` here is justified because:
97
- /// 1. Runtime creation can only fail due to system resource exhaustion (OOM, thread limit)
98
- /// 2. If runtime creation fails, the process is already in a critical state
99
- /// 3. This is a one-time initialization - if it fails, nothing will work
100
- /// 4. Better to fail fast than return errors from every sync operation
101
- ///
102
- /// # Availability
103
- ///
104
- /// This static is only available when the `tokio-runtime` feature is enabled.
105
- /// For WASM targets, use the truly synchronous extraction functions instead.
106
- #[cfg(feature = "tokio-runtime")]
107
- static GLOBAL_RUNTIME: Lazy<tokio::runtime::Runtime> = Lazy::new(|| {
108
- tokio::runtime::Builder::new_multi_thread()
109
- .enable_all()
110
- .build()
111
- .expect("Failed to create global Tokio runtime - system may be out of resources")
112
- });
113
-
114
- /// Get an extractor from the registry.
115
- ///
116
- /// This function acquires the registry read lock and retrieves the appropriate
117
- /// extractor for the given MIME type.
118
- ///
119
- /// # Performance
120
- ///
121
- /// RwLock read + HashMap lookup is ~100ns, fast enough without caching.
122
- /// Removed thread-local cache to avoid Tokio work-stealing scheduler issues.
123
- fn get_extractor(mime_type: &str) -> Result<Arc<dyn DocumentExtractor>> {
124
- let registry = crate::plugins::registry::get_document_extractor_registry();
125
- let registry_read = registry
126
- .read()
127
- .map_err(|e| KreuzbergError::Other(format!("Document extractor registry lock poisoned: {}", e)))?;
128
- registry_read.get(mime_type)
129
- }
130
-
131
- /// Extract content from a file.
132
- ///
133
- /// This is the main entry point for file-based extraction. It performs the following steps:
134
- /// 1. Check cache for existing result (if caching enabled)
135
- /// 2. Detect or validate MIME type
136
- /// 3. Select appropriate extractor from registry
137
- /// 4. Extract content
138
- /// 5. Run post-processing pipeline
139
- /// 6. Store result in cache (if caching enabled)
140
- ///
141
- /// # Arguments
142
- ///
143
- /// * `path` - Path to the file to extract
144
- /// * `mime_type` - Optional MIME type override. If None, will be auto-detected
145
- /// * `config` - Extraction configuration
146
- ///
147
- /// # Returns
148
- ///
149
- /// An `ExtractionResult` containing the extracted content and metadata.
150
- ///
151
- /// # Errors
152
- ///
153
- /// Returns `KreuzbergError::Validation` if the file doesn't exist or path is invalid.
154
- /// Returns `KreuzbergError::UnsupportedFormat` if MIME type is not supported.
155
- /// Returns `KreuzbergError::Io` for file I/O errors (these always bubble up).
156
- ///
157
- /// # Example
158
- ///
159
- /// ```rust,no_run
160
- /// use kreuzberg::core::extractor::extract_file;
161
- /// use kreuzberg::core::config::ExtractionConfig;
162
- ///
163
- /// # async fn example() -> kreuzberg::Result<()> {
164
- /// let config = ExtractionConfig::default();
165
- /// let result = extract_file("document.pdf", None, &config).await?;
166
- /// println!("Content: {}", result.content);
167
- /// # Ok(())
168
- /// # }
169
- /// ```
170
- #[cfg_attr(feature = "otel", tracing::instrument(
171
- skip(config, path),
172
- fields(
173
- extraction.filename = tracing::field::Empty,
174
- )
175
- ))]
176
- pub async fn extract_file(
177
- path: impl AsRef<Path>,
178
- mime_type: Option<&str>,
179
- config: &ExtractionConfig,
180
- ) -> Result<ExtractionResult> {
181
- use crate::core::{io, mime};
182
-
183
- let path = path.as_ref();
184
-
185
- #[cfg(feature = "otel")]
186
- {
187
- let span = tracing::Span::current();
188
- span.record("extraction.filename", sanitize_path(path));
189
- }
190
-
191
- let result = async {
192
- io::validate_file_exists(path)?;
193
-
194
- let detected_mime = mime::detect_or_validate(Some(path), mime_type)?;
195
-
196
- match detected_mime.as_str() {
197
- #[cfg(feature = "office")]
198
- LEGACY_WORD_MIME_TYPE => {
199
- let original_bytes = tokio::fs::read(path).await?;
200
- let conversion = convert_doc_to_docx(&original_bytes).await?;
201
- let mut result =
202
- extract_bytes_with_extractor(&conversion.converted_bytes, &conversion.target_mime, config).await?;
203
- apply_libreoffice_metadata(&mut result, LEGACY_WORD_MIME_TYPE, &conversion);
204
- return Ok(result);
205
- }
206
- #[cfg(not(feature = "office"))]
207
- LEGACY_WORD_MIME_TYPE => {
208
- return Err(KreuzbergError::UnsupportedFormat(
209
- "Legacy Word conversion requires the `office` feature or LibreOffice support".to_string(),
210
- ));
211
- }
212
- #[cfg(feature = "office")]
213
- LEGACY_POWERPOINT_MIME_TYPE => {
214
- let original_bytes = tokio::fs::read(path).await?;
215
- let conversion = convert_ppt_to_pptx(&original_bytes).await?;
216
- let mut result =
217
- extract_bytes_with_extractor(&conversion.converted_bytes, &conversion.target_mime, config).await?;
218
- apply_libreoffice_metadata(&mut result, LEGACY_POWERPOINT_MIME_TYPE, &conversion);
219
- return Ok(result);
220
- }
221
- #[cfg(not(feature = "office"))]
222
- LEGACY_POWERPOINT_MIME_TYPE => {
223
- return Err(KreuzbergError::UnsupportedFormat(
224
- "Legacy PowerPoint conversion requires the `office` feature or LibreOffice support".to_string(),
225
- ));
226
- }
227
- _ => {}
228
- }
229
-
230
- extract_file_with_extractor(path, &detected_mime, config).await
231
- }
232
- .await;
233
-
234
- #[cfg(feature = "otel")]
235
- if let Err(ref e) = result {
236
- record_error(e);
237
- }
238
-
239
- result
240
- }
241
-
242
- /// Extract content from a byte array.
243
- #[cfg_attr(feature = "otel", tracing::instrument(
244
- skip(config, content),
245
- fields(
246
- extraction.mime_type = mime_type,
247
- extraction.size_bytes = content.len(),
248
- )
249
- ))]
250
- pub async fn extract_bytes(content: &[u8], mime_type: &str, config: &ExtractionConfig) -> Result<ExtractionResult> {
251
- use crate::core::mime;
252
-
253
- let result = async {
254
- let validated_mime = mime::validate_mime_type(mime_type)?;
255
-
256
- match validated_mime.as_str() {
257
- #[cfg(feature = "office")]
258
- LEGACY_WORD_MIME_TYPE => {
259
- let conversion = convert_doc_to_docx(content).await?;
260
- let mut result =
261
- extract_bytes_with_extractor(&conversion.converted_bytes, &conversion.target_mime, config).await?;
262
- apply_libreoffice_metadata(&mut result, LEGACY_WORD_MIME_TYPE, &conversion);
263
- return Ok(result);
264
- }
265
- #[cfg(not(feature = "office"))]
266
- LEGACY_WORD_MIME_TYPE => {
267
- return Err(KreuzbergError::UnsupportedFormat(
268
- "Legacy Word conversion requires the `office` feature or LibreOffice support".to_string(),
269
- ));
270
- }
271
- #[cfg(feature = "office")]
272
- LEGACY_POWERPOINT_MIME_TYPE => {
273
- let conversion = convert_ppt_to_pptx(content).await?;
274
- let mut result =
275
- extract_bytes_with_extractor(&conversion.converted_bytes, &conversion.target_mime, config).await?;
276
- apply_libreoffice_metadata(&mut result, LEGACY_POWERPOINT_MIME_TYPE, &conversion);
277
- return Ok(result);
278
- }
279
- #[cfg(not(feature = "office"))]
280
- LEGACY_POWERPOINT_MIME_TYPE => {
281
- return Err(KreuzbergError::UnsupportedFormat(
282
- "Legacy PowerPoint conversion requires the `office` feature or LibreOffice support".to_string(),
283
- ));
284
- }
285
- _ => {}
286
- }
287
-
288
- extract_bytes_with_extractor(content, &validated_mime, config).await
289
- }
290
- .await;
291
-
292
- #[cfg(feature = "otel")]
293
- if let Err(ref e) = result {
294
- record_error(e);
295
- }
296
-
297
- result
298
- }
299
-
300
- /// Extract content from multiple files concurrently.
301
- ///
302
- /// This function processes multiple files in parallel, automatically managing
303
- /// concurrency to prevent resource exhaustion. The concurrency limit can be
304
- /// configured via `ExtractionConfig::max_concurrent_extractions` or defaults
305
- /// to `num_cpus * 2`.
306
- ///
307
- /// # Arguments
308
- ///
309
- /// * `paths` - Vector of file paths to extract
310
- /// * `config` - Extraction configuration
311
- ///
312
- /// # Returns
313
- ///
314
- /// A vector of `ExtractionResult` in the same order as the input paths.
315
- ///
316
- /// # Errors
317
- ///
318
- /// Individual file errors are captured in the result metadata. System errors
319
- /// (IO, RuntimeError equivalents) will bubble up and fail the entire batch.
320
- #[cfg(feature = "tokio-runtime")]
321
- #[cfg_attr(feature = "otel", tracing::instrument(
322
- skip(config, paths),
323
- fields(
324
- extraction.batch_size = paths.len(),
325
- )
326
- ))]
327
- pub async fn batch_extract_file(
328
- paths: Vec<impl AsRef<Path>>,
329
- config: &ExtractionConfig,
330
- ) -> Result<Vec<ExtractionResult>> {
331
- use std::sync::Arc;
332
- use tokio::sync::Semaphore;
333
- use tokio::task::JoinSet;
334
-
335
- if paths.is_empty() {
336
- return Ok(vec![]);
337
- }
338
-
339
- let config = Arc::new(config.clone());
340
-
341
- let max_concurrent = config.max_concurrent_extractions.unwrap_or_else(|| num_cpus::get() * 2);
342
- let semaphore = Arc::new(Semaphore::new(max_concurrent));
343
-
344
- let mut tasks = JoinSet::new();
345
-
346
- for (index, path) in paths.into_iter().enumerate() {
347
- let path_buf = path.as_ref().to_path_buf();
348
- let config_clone = Arc::clone(&config);
349
- let semaphore_clone = Arc::clone(&semaphore);
350
-
351
- tasks.spawn(async move {
352
- let _permit = semaphore_clone.acquire().await.unwrap();
353
- let result =
354
- crate::core::batch_mode::with_batch_mode(async { extract_file(&path_buf, None, &config_clone).await })
355
- .await;
356
- (index, result)
357
- });
358
- }
359
-
360
- let mut results: Vec<Option<ExtractionResult>> = vec![None; tasks.len()];
361
-
362
- while let Some(task_result) = tasks.join_next().await {
363
- match task_result {
364
- Ok((index, Ok(result))) => {
365
- results[index] = Some(result);
366
- }
367
- Ok((index, Err(e))) => {
368
- // OSError/RuntimeError must bubble up - system errors need user reports ~keep
369
- if matches!(e, KreuzbergError::Io(_)) {
370
- return Err(e);
371
- }
372
-
373
- use crate::types::{ErrorMetadata, Metadata};
374
- let metadata = Metadata {
375
- error: Some(ErrorMetadata {
376
- error_type: format!("{:?}", e),
377
- message: e.to_string(),
378
- }),
379
- ..Default::default()
380
- };
381
-
382
- results[index] = Some(ExtractionResult {
383
- content: format!("Error: {}", e),
384
- mime_type: "text/plain".to_string(),
385
- metadata,
386
- tables: vec![],
387
- detected_languages: None,
388
- chunks: None,
389
- images: None,
390
- pages: None,
391
- });
392
- }
393
- Err(join_err) => {
394
- return Err(KreuzbergError::Other(format!("Task panicked: {}", join_err)));
395
- }
396
- }
397
- }
398
-
399
- #[allow(clippy::unwrap_used)]
400
- Ok(results.into_iter().map(|r| r.unwrap()).collect())
401
- }
402
-
403
- /// Extract content from multiple byte arrays concurrently.
404
- ///
405
- /// This function processes multiple byte arrays in parallel, automatically managing
406
- /// concurrency to prevent resource exhaustion. The concurrency limit can be
407
- /// configured via `ExtractionConfig::max_concurrent_extractions` or defaults
408
- /// to `num_cpus * 2`.
409
- ///
410
- /// # Arguments
411
- ///
412
- /// * `contents` - Vector of (bytes, mime_type) tuples
413
- /// * `config` - Extraction configuration
414
- ///
415
- /// # Returns
416
- ///
417
- /// A vector of `ExtractionResult` in the same order as the input.
418
- #[cfg(feature = "tokio-runtime")]
419
- #[cfg_attr(feature = "otel", tracing::instrument(
420
- skip(config, contents),
421
- fields(
422
- extraction.batch_size = contents.len(),
423
- )
424
- ))]
425
- pub async fn batch_extract_bytes(
426
- contents: Vec<(&[u8], &str)>,
427
- config: &ExtractionConfig,
428
- ) -> Result<Vec<ExtractionResult>> {
429
- use std::sync::Arc;
430
- use tokio::sync::Semaphore;
431
- use tokio::task::JoinSet;
432
-
433
- if contents.is_empty() {
434
- return Ok(vec![]);
435
- }
436
-
437
- let batch_config = config.clone();
438
- let config = Arc::new(batch_config);
439
-
440
- let max_concurrent = config.max_concurrent_extractions.unwrap_or_else(|| num_cpus::get() * 2);
441
- let semaphore = Arc::new(Semaphore::new(max_concurrent));
442
-
443
- let owned_contents: Vec<(Vec<u8>, String)> = contents
444
- .into_iter()
445
- .map(|(bytes, mime)| (bytes.to_vec(), mime.to_string()))
446
- .collect();
447
-
448
- let mut tasks = JoinSet::new();
449
-
450
- for (index, (bytes, mime_type)) in owned_contents.into_iter().enumerate() {
451
- let config_clone = Arc::clone(&config);
452
- let semaphore_clone = Arc::clone(&semaphore);
453
-
454
- tasks.spawn(async move {
455
- let _permit = semaphore_clone.acquire().await.unwrap();
456
- let result = crate::core::batch_mode::with_batch_mode(async {
457
- extract_bytes(&bytes, &mime_type, &config_clone).await
458
- })
459
- .await;
460
- (index, result)
461
- });
462
- }
463
-
464
- let mut results: Vec<Option<ExtractionResult>> = vec![None; tasks.len()];
465
-
466
- while let Some(task_result) = tasks.join_next().await {
467
- match task_result {
468
- Ok((index, Ok(result))) => {
469
- results[index] = Some(result);
470
- }
471
- Ok((index, Err(e))) => {
472
- // OSError/RuntimeError must bubble up - system errors need user reports ~keep
473
- if matches!(e, KreuzbergError::Io(_)) {
474
- return Err(e);
475
- }
476
-
477
- use crate::types::{ErrorMetadata, Metadata};
478
- let metadata = Metadata {
479
- error: Some(ErrorMetadata {
480
- error_type: format!("{:?}", e),
481
- message: e.to_string(),
482
- }),
483
- ..Default::default()
484
- };
485
-
486
- results[index] = Some(ExtractionResult {
487
- content: format!("Error: {}", e),
488
- mime_type: "text/plain".to_string(),
489
- metadata,
490
- tables: vec![],
491
- detected_languages: None,
492
- chunks: None,
493
- images: None,
494
- pages: None,
495
- });
496
- }
497
- Err(join_err) => {
498
- return Err(KreuzbergError::Other(format!("Task panicked: {}", join_err)));
499
- }
500
- }
501
- }
502
-
503
- #[allow(clippy::unwrap_used)]
504
- Ok(results.into_iter().map(|r| r.unwrap()).collect())
505
- }
506
-
507
- /// Synchronous wrapper for `extract_file`.
508
- ///
509
- /// This is a convenience function that blocks the current thread until extraction completes.
510
- /// For async code, use `extract_file` directly.
511
- ///
512
- /// Uses the global Tokio runtime for 100x+ performance improvement over creating
513
- /// a new runtime per call. Always uses the global runtime to avoid nested runtime issues.
514
- ///
515
- /// This function is only available with the `tokio-runtime` feature. For WASM targets,
516
- /// use a truly synchronous extraction approach instead.
517
- #[cfg(feature = "tokio-runtime")]
518
- pub fn extract_file_sync(
519
- path: impl AsRef<Path>,
520
- mime_type: Option<&str>,
521
- config: &ExtractionConfig,
522
- ) -> Result<ExtractionResult> {
523
- GLOBAL_RUNTIME.block_on(extract_file(path, mime_type, config))
524
- }
525
-
526
- /// Synchronous wrapper for `extract_bytes`.
527
- ///
528
- /// Uses the global Tokio runtime for 100x+ performance improvement over creating
529
- /// a new runtime per call.
530
- ///
531
- /// With the `tokio-runtime` feature, this blocks the current thread using the global
532
- /// Tokio runtime. Without it (WASM), this calls a truly synchronous implementation.
533
- #[cfg(feature = "tokio-runtime")]
534
- pub fn extract_bytes_sync(content: &[u8], mime_type: &str, config: &ExtractionConfig) -> Result<ExtractionResult> {
535
- GLOBAL_RUNTIME.block_on(extract_bytes(content, mime_type, config))
536
- }
537
-
538
- /// Synchronous wrapper for `extract_bytes` (WASM-compatible version).
539
- ///
540
- /// This is a truly synchronous implementation without tokio runtime dependency.
541
- /// It calls `extract_bytes_sync_impl()` to perform the extraction.
542
- #[cfg(not(feature = "tokio-runtime"))]
543
- pub fn extract_bytes_sync(content: &[u8], mime_type: &str, config: &ExtractionConfig) -> Result<ExtractionResult> {
544
- extract_bytes_sync_impl(content.to_vec(), Some(mime_type.to_string()), Some(config.clone()))
545
- }
546
-
547
- /// Synchronous wrapper for `batch_extract_file`.
548
- ///
549
- /// Uses the global Tokio runtime for 100x+ performance improvement over creating
550
- /// a new runtime per call.
551
- ///
552
- /// This function is only available with the `tokio-runtime` feature. For WASM targets,
553
- /// use a truly synchronous extraction approach instead.
554
- #[cfg(feature = "tokio-runtime")]
555
- pub fn batch_extract_file_sync(
556
- paths: Vec<impl AsRef<Path>>,
557
- config: &ExtractionConfig,
558
- ) -> Result<Vec<ExtractionResult>> {
559
- GLOBAL_RUNTIME.block_on(batch_extract_file(paths, config))
560
- }
561
-
562
- /// Synchronous wrapper for `batch_extract_bytes`.
563
- ///
564
- /// Uses the global Tokio runtime for 100x+ performance improvement over creating
565
- /// a new runtime per call.
566
- ///
567
- /// With the `tokio-runtime` feature, this blocks the current thread using the global
568
- /// Tokio runtime. Without it (WASM), this calls a truly synchronous implementation
569
- /// that iterates through items and calls `extract_bytes_sync()`.
570
- #[cfg(feature = "tokio-runtime")]
571
- pub fn batch_extract_bytes_sync(
572
- contents: Vec<(&[u8], &str)>,
573
- config: &ExtractionConfig,
574
- ) -> Result<Vec<ExtractionResult>> {
575
- GLOBAL_RUNTIME.block_on(batch_extract_bytes(contents, config))
576
- }
577
-
578
- /// Synchronous wrapper for `batch_extract_bytes` (WASM-compatible version).
579
- ///
580
- /// This is a truly synchronous implementation that iterates through items
581
- /// and calls `extract_bytes_sync()` for each.
582
- #[cfg(not(feature = "tokio-runtime"))]
583
- pub fn batch_extract_bytes_sync(
584
- contents: Vec<(&[u8], &str)>,
585
- config: &ExtractionConfig,
586
- ) -> Result<Vec<ExtractionResult>> {
587
- let mut results = Vec::with_capacity(contents.len());
588
- for (content, mime_type) in contents {
589
- let result = extract_bytes_sync(content, mime_type, config);
590
- results.push(result.unwrap_or_else(|e| {
591
- use crate::types::{ErrorMetadata, Metadata};
592
- ExtractionResult {
593
- content: format!("Error: {}", e),
594
- mime_type: "text/plain".to_string(),
595
- metadata: Metadata {
596
- error: Some(ErrorMetadata {
597
- error_type: format!("{:?}", e),
598
- message: e.to_string(),
599
- }),
600
- ..Default::default()
601
- },
602
- tables: vec![],
603
- detected_languages: None,
604
- chunks: None,
605
- images: None,
606
- pages: None,
607
- }
608
- }));
609
- }
610
- Ok(results)
611
- }
612
-
613
- /// Synchronous extraction implementation for WASM compatibility.
614
- ///
615
- /// This function performs extraction without requiring a tokio runtime.
616
- /// It calls the sync extractor methods directly.
617
- ///
618
- /// # Arguments
619
- ///
620
- /// * `content` - The byte content to extract
621
- /// * `mime_type` - Optional MIME type to validate/use
622
- /// * `config` - Optional extraction configuration
623
- ///
624
- /// # Returns
625
- ///
626
- /// An `ExtractionResult` or a `KreuzbergError`
627
- ///
628
- /// # Implementation Notes
629
- ///
630
- /// This is called when the `tokio-runtime` feature is disabled.
631
- /// It replicates the logic of `extract_bytes` but uses synchronous extractor methods.
632
- #[cfg(not(feature = "tokio-runtime"))]
633
- fn extract_bytes_sync_impl(
634
- content: Vec<u8>,
635
- mime_type: Option<String>,
636
- config: Option<ExtractionConfig>,
637
- ) -> Result<ExtractionResult> {
638
- use crate::core::mime;
639
-
640
- let config = config.unwrap_or_default();
641
-
642
- // Validate MIME type if provided
643
- let validated_mime = if let Some(mime) = mime_type {
644
- mime::validate_mime_type(&mime)?
645
- } else {
646
- return Err(KreuzbergError::Validation {
647
- message: "MIME type is required for synchronous extraction".to_string(),
648
- source: None,
649
- });
650
- };
651
-
652
- // Ensure extractors are initialized
653
- crate::extractors::ensure_initialized()?;
654
-
655
- // Get the appropriate extractor
656
- let extractor = get_extractor(&validated_mime)?;
657
-
658
- // Check if extractor supports synchronous extraction
659
- let sync_extractor = extractor.as_sync_extractor().ok_or_else(|| {
660
- KreuzbergError::UnsupportedFormat(format!(
661
- "Extractor for '{}' does not support synchronous extraction",
662
- validated_mime
663
- ))
664
- })?;
665
-
666
- // Call the sync extract method
667
- let mut result = sync_extractor.extract_sync(&content, &validated_mime, &config)?;
668
-
669
- // Run post-processing pipeline (sync version)
670
- result = crate::core::pipeline::run_pipeline_sync(result, &config)?;
671
-
672
- Ok(result)
673
- }
674
-
675
- async fn extract_file_with_extractor(
676
- path: &Path,
677
- mime_type: &str,
678
- config: &ExtractionConfig,
679
- ) -> Result<ExtractionResult> {
680
- crate::extractors::ensure_initialized()?;
681
-
682
- let extractor = get_extractor(mime_type)?;
683
- let mut result = extractor.extract_file(path, mime_type, config).await?;
684
- result = crate::core::pipeline::run_pipeline(result, config).await?;
685
- Ok(result)
686
- }
687
-
688
- async fn extract_bytes_with_extractor(
689
- content: &[u8],
690
- mime_type: &str,
691
- config: &ExtractionConfig,
692
- ) -> Result<ExtractionResult> {
693
- crate::extractors::ensure_initialized()?;
694
-
695
- let extractor = get_extractor(mime_type)?;
696
- let mut result = extractor.extract_bytes(content, mime_type, config).await?;
697
- result = crate::core::pipeline::run_pipeline(result, config).await?;
698
- Ok(result)
699
- }
700
-
701
- #[cfg(feature = "office")]
702
- fn apply_libreoffice_metadata(
703
- result: &mut ExtractionResult,
704
- legacy_mime: &str,
705
- conversion: &LibreOfficeConversionResult,
706
- ) {
707
- result.mime_type = legacy_mime.to_string();
708
- result.metadata.additional.insert(
709
- "libreoffice_conversion".to_string(),
710
- json!({
711
- "converter": "libreoffice",
712
- "original_format": conversion.original_format,
713
- "target_format": conversion.target_format,
714
- "target_mime": conversion.target_mime,
715
- }),
716
- );
717
- }
718
-
719
- #[cfg(test)]
720
- mod tests {
721
- use super::*;
722
- use serial_test::serial;
723
- use std::fs::File;
724
- use std::io::Write;
725
- use tempfile::tempdir;
726
-
727
- fn assert_text_content(actual: &str, expected: &str) {
728
- assert_eq!(actual.trim_end_matches('\n'), expected);
729
- }
730
-
731
- #[tokio::test]
732
- async fn test_extract_file_basic() {
733
- let dir = tempdir().unwrap();
734
- let file_path = dir.path().join("test.txt");
735
- let mut file = File::create(&file_path).unwrap();
736
- file.write_all(b"Hello, world!").unwrap();
737
-
738
- let config = ExtractionConfig::default();
739
- let result = extract_file(&file_path, None, &config).await;
740
-
741
- assert!(result.is_ok());
742
- let result = result.unwrap();
743
- assert_text_content(&result.content, "Hello, world!");
744
- assert_eq!(result.mime_type, "text/plain");
745
- }
746
-
747
- #[tokio::test]
748
- async fn test_extract_file_with_mime_override() {
749
- let dir = tempdir().unwrap();
750
- let file_path = dir.path().join("test.dat");
751
- let mut file = File::create(&file_path).unwrap();
752
- file.write_all(b"test content").unwrap();
753
-
754
- let config = ExtractionConfig::default();
755
- let result = extract_file(&file_path, Some("text/plain"), &config).await;
756
-
757
- assert!(result.is_ok());
758
- let result = result.unwrap();
759
- assert_eq!(result.mime_type, "text/plain");
760
- }
761
-
762
- #[tokio::test]
763
- async fn test_extract_file_nonexistent() {
764
- let config = ExtractionConfig::default();
765
- let result = extract_file("/nonexistent/file.txt", None, &config).await;
766
- assert!(result.is_err());
767
- }
768
-
769
- #[tokio::test]
770
- async fn test_extract_bytes_basic() {
771
- let config = ExtractionConfig::default();
772
- let result = extract_bytes(b"test content", "text/plain", &config).await;
773
-
774
- assert!(result.is_ok());
775
- let result = result.unwrap();
776
- assert_text_content(&result.content, "test content");
777
- assert_eq!(result.mime_type, "text/plain");
778
- }
779
-
780
- #[tokio::test]
781
- async fn test_extract_bytes_invalid_mime() {
782
- let config = ExtractionConfig::default();
783
- let result = extract_bytes(b"test", "invalid/mime", &config).await;
784
- assert!(result.is_err());
785
- }
786
-
787
- #[tokio::test]
788
- async fn test_batch_extract_file() {
789
- let dir = tempdir().unwrap();
790
-
791
- let file1 = dir.path().join("test1.txt");
792
- let file2 = dir.path().join("test2.txt");
793
-
794
- File::create(&file1).unwrap().write_all(b"content 1").unwrap();
795
- File::create(&file2).unwrap().write_all(b"content 2").unwrap();
796
-
797
- let config = ExtractionConfig::default();
798
- let paths = vec![file1, file2];
799
- let results = batch_extract_file(paths, &config).await;
800
-
801
- assert!(results.is_ok());
802
- let results = results.unwrap();
803
- assert_eq!(results.len(), 2);
804
- assert_text_content(&results[0].content, "content 1");
805
- assert_text_content(&results[1].content, "content 2");
806
- }
807
-
808
- #[tokio::test]
809
- async fn test_batch_extract_file_empty() {
810
- let config = ExtractionConfig::default();
811
- let paths: Vec<std::path::PathBuf> = vec![];
812
- let results = batch_extract_file(paths, &config).await;
813
-
814
- assert!(results.is_ok());
815
- assert_eq!(results.unwrap().len(), 0);
816
- }
817
-
818
- #[tokio::test]
819
- async fn test_batch_extract_bytes() {
820
- let config = ExtractionConfig::default();
821
- let contents = vec![
822
- (b"content 1".as_slice(), "text/plain"),
823
- (b"content 2".as_slice(), "text/plain"),
824
- ];
825
- let results = batch_extract_bytes(contents, &config).await;
826
-
827
- assert!(results.is_ok());
828
- let results = results.unwrap();
829
- assert_eq!(results.len(), 2);
830
- assert_text_content(&results[0].content, "content 1");
831
- assert_text_content(&results[1].content, "content 2");
832
- }
833
-
834
- #[test]
835
- fn test_sync_wrappers() {
836
- let dir = tempdir().unwrap();
837
- let file_path = dir.path().join("test.txt");
838
- File::create(&file_path).unwrap().write_all(b"sync test").unwrap();
839
-
840
- let config = ExtractionConfig::default();
841
-
842
- let result = extract_file_sync(&file_path, None, &config);
843
- assert!(result.is_ok());
844
- let result = result.unwrap();
845
- assert_text_content(&result.content, "sync test");
846
-
847
- let result = extract_bytes_sync(b"test", "text/plain", &config);
848
- assert!(result.is_ok());
849
- }
850
-
851
- #[tokio::test]
852
- async fn test_extractor_cache() {
853
- let config = ExtractionConfig::default();
854
-
855
- let result1 = extract_bytes(b"test 1", "text/plain", &config).await;
856
- assert!(result1.is_ok());
857
- let result1 = result1.unwrap();
858
-
859
- let result2 = extract_bytes(b"test 2", "text/plain", &config).await;
860
- assert!(result2.is_ok());
861
- let result2 = result2.unwrap();
862
-
863
- assert_text_content(&result1.content, "test 1");
864
- assert_text_content(&result2.content, "test 2");
865
-
866
- let result3 = extract_bytes(b"# test 3", "text/markdown", &config).await;
867
- assert!(result3.is_ok());
868
- }
869
-
870
- #[tokio::test]
871
- async fn test_extract_file_empty() {
872
- let dir = tempdir().unwrap();
873
- let file_path = dir.path().join("empty.txt");
874
- File::create(&file_path).unwrap();
875
-
876
- let config = ExtractionConfig::default();
877
- let result = extract_file(&file_path, None, &config).await;
878
-
879
- assert!(result.is_ok());
880
- let result = result.unwrap();
881
- assert_eq!(result.content, "");
882
- }
883
-
884
- #[tokio::test]
885
- async fn test_extract_bytes_empty() {
886
- let config = ExtractionConfig::default();
887
- let result = extract_bytes(b"", "text/plain", &config).await;
888
-
889
- assert!(result.is_ok());
890
- let result = result.unwrap();
891
- assert_eq!(result.content, "");
892
- }
893
-
894
- #[tokio::test]
895
- async fn test_extract_file_whitespace_only() {
896
- let dir = tempdir().unwrap();
897
- let file_path = dir.path().join("whitespace.txt");
898
- File::create(&file_path).unwrap().write_all(b" \n\t \n ").unwrap();
899
-
900
- let config = ExtractionConfig::default();
901
- let result = extract_file(&file_path, None, &config).await;
902
-
903
- assert!(result.is_ok());
904
- }
905
-
906
- #[tokio::test]
907
- async fn test_extract_file_very_long_path() {
908
- let dir = tempdir().unwrap();
909
- let long_name = "a".repeat(200);
910
- let file_path = dir.path().join(format!("{}.txt", long_name));
911
-
912
- if let Ok(mut f) = File::create(&file_path) {
913
- f.write_all(b"content").unwrap();
914
- let config = ExtractionConfig::default();
915
- let result = extract_file(&file_path, None, &config).await;
916
- assert!(result.is_ok() || result.is_err());
917
- }
918
- }
919
-
920
- #[tokio::test]
921
- async fn test_extract_file_special_characters_in_path() {
922
- let dir = tempdir().unwrap();
923
- let file_path = dir.path().join("test with spaces & symbols!.txt");
924
- File::create(&file_path).unwrap().write_all(b"content").unwrap();
925
-
926
- let config = ExtractionConfig::default();
927
- let result = extract_file(&file_path, None, &config).await;
928
-
929
- assert!(result.is_ok());
930
- let result = result.unwrap();
931
- assert_text_content(&result.content, "content");
932
- }
933
-
934
- #[tokio::test]
935
- async fn test_extract_file_unicode_filename() {
936
- let dir = tempdir().unwrap();
937
- let file_path = dir.path().join("测试文件名.txt");
938
- File::create(&file_path).unwrap().write_all(b"content").unwrap();
939
-
940
- let config = ExtractionConfig::default();
941
- let result = extract_file(&file_path, None, &config).await;
942
-
943
- assert!(result.is_ok());
944
- }
945
-
946
- #[tokio::test]
947
- async fn test_extract_bytes_unsupported_mime() {
948
- let config = ExtractionConfig::default();
949
- let result = extract_bytes(b"test", "application/x-unknown-format", &config).await;
950
-
951
- assert!(result.is_err());
952
- assert!(matches!(result.unwrap_err(), KreuzbergError::UnsupportedFormat(_)));
953
- }
954
-
955
- #[tokio::test]
956
- async fn test_batch_extract_file_with_errors() {
957
- let dir = tempdir().unwrap();
958
-
959
- let valid_file = dir.path().join("valid.txt");
960
- File::create(&valid_file).unwrap().write_all(b"valid content").unwrap();
961
-
962
- let invalid_file = dir.path().join("nonexistent.txt");
963
-
964
- let config = ExtractionConfig::default();
965
- let paths = vec![valid_file, invalid_file];
966
- let results = batch_extract_file(paths, &config).await;
967
-
968
- assert!(results.is_ok());
969
- let results = results.unwrap();
970
- assert_eq!(results.len(), 2);
971
- assert_text_content(&results[0].content, "valid content");
972
- assert!(results[1].metadata.error.is_some());
973
- }
974
-
975
- #[tokio::test]
976
- async fn test_batch_extract_bytes_mixed_valid_invalid() {
977
- let config = ExtractionConfig::default();
978
- let contents = vec![
979
- (b"valid 1".as_slice(), "text/plain"),
980
- (b"invalid".as_slice(), "invalid/mime"),
981
- (b"valid 2".as_slice(), "text/plain"),
982
- ];
983
- let results = batch_extract_bytes(contents, &config).await;
984
-
985
- assert!(results.is_ok());
986
- let results = results.unwrap();
987
- assert_eq!(results.len(), 3);
988
- assert_text_content(&results[0].content, "valid 1");
989
- assert!(results[1].metadata.error.is_some());
990
- assert_text_content(&results[2].content, "valid 2");
991
- }
992
-
993
- #[tokio::test]
994
- async fn test_batch_extract_bytes_all_invalid() {
995
- let config = ExtractionConfig::default();
996
- let contents = vec![
997
- (b"test 1".as_slice(), "invalid/mime1"),
998
- (b"test 2".as_slice(), "invalid/mime2"),
999
- ];
1000
- let results = batch_extract_bytes(contents, &config).await;
1001
-
1002
- assert!(results.is_ok());
1003
- let results = results.unwrap();
1004
- assert_eq!(results.len(), 2);
1005
- assert!(results[0].metadata.error.is_some());
1006
- assert!(results[1].metadata.error.is_some());
1007
- }
1008
-
1009
- #[tokio::test]
1010
- async fn test_extract_bytes_very_large() {
1011
- let large_content = vec![b'a'; 10_000_000];
1012
- let config = ExtractionConfig::default();
1013
- let result = extract_bytes(&large_content, "text/plain", &config).await;
1014
-
1015
- assert!(result.is_ok());
1016
- let result = result.unwrap();
1017
- let trimmed_len = result.content.trim_end_matches('\n').len();
1018
- assert_eq!(trimmed_len, 10_000_000);
1019
- }
1020
-
1021
- #[tokio::test]
1022
- async fn test_batch_extract_large_count() {
1023
- let dir = tempdir().unwrap();
1024
- let mut paths = Vec::new();
1025
-
1026
- for i in 0..100 {
1027
- let file_path = dir.path().join(format!("file{}.txt", i));
1028
- File::create(&file_path)
1029
- .unwrap()
1030
- .write_all(format!("content {}", i).as_bytes())
1031
- .unwrap();
1032
- paths.push(file_path);
1033
- }
1034
-
1035
- let config = ExtractionConfig::default();
1036
- let results = batch_extract_file(paths, &config).await;
1037
-
1038
- assert!(results.is_ok());
1039
- let results = results.unwrap();
1040
- assert_eq!(results.len(), 100);
1041
-
1042
- for (i, result) in results.iter().enumerate() {
1043
- assert_text_content(&result.content, &format!("content {}", i));
1044
- }
1045
- }
1046
-
1047
- #[tokio::test]
1048
- async fn test_extract_file_mime_detection_fallback() {
1049
- let dir = tempdir().unwrap();
1050
- let file_path = dir.path().join("testfile");
1051
- File::create(&file_path)
1052
- .unwrap()
1053
- .write_all(b"plain text content")
1054
- .unwrap();
1055
-
1056
- let config = ExtractionConfig::default();
1057
- let result = extract_file(&file_path, None, &config).await;
1058
-
1059
- assert!(result.is_ok() || result.is_err());
1060
- }
1061
-
1062
- #[tokio::test]
1063
- async fn test_extract_file_wrong_mime_override() {
1064
- let dir = tempdir().unwrap();
1065
- let file_path = dir.path().join("test.txt");
1066
- File::create(&file_path).unwrap().write_all(b"plain text").unwrap();
1067
-
1068
- let config = ExtractionConfig::default();
1069
- let result = extract_file(&file_path, Some("application/pdf"), &config).await;
1070
-
1071
- assert!(result.is_err() || result.is_ok());
1072
- }
1073
-
1074
- #[test]
1075
- fn test_sync_wrapper_nonexistent_file() {
1076
- let config = ExtractionConfig::default();
1077
- let result = extract_file_sync("/nonexistent/path.txt", None, &config);
1078
-
1079
- assert!(result.is_err());
1080
- assert!(matches!(result.unwrap_err(), KreuzbergError::Validation { .. }));
1081
- }
1082
-
1083
- #[test]
1084
- fn test_sync_wrapper_batch_empty() {
1085
- let config = ExtractionConfig::default();
1086
- let paths: Vec<std::path::PathBuf> = vec![];
1087
- let results = batch_extract_file_sync(paths, &config);
1088
-
1089
- assert!(results.is_ok());
1090
- assert_eq!(results.unwrap().len(), 0);
1091
- }
1092
-
1093
- #[test]
1094
- fn test_sync_wrapper_batch_bytes_empty() {
1095
- let config = ExtractionConfig::default();
1096
- let contents: Vec<(&[u8], &str)> = vec![];
1097
- let results = batch_extract_bytes_sync(contents, &config);
1098
-
1099
- assert!(results.is_ok());
1100
- assert_eq!(results.unwrap().len(), 0);
1101
- }
1102
-
1103
- #[tokio::test]
1104
- async fn test_concurrent_extractions_same_mime() {
1105
- use tokio::task::JoinSet;
1106
-
1107
- let config = Arc::new(ExtractionConfig::default());
1108
- let mut tasks = JoinSet::new();
1109
-
1110
- for i in 0..50 {
1111
- let config_clone = Arc::clone(&config);
1112
- tasks.spawn(async move {
1113
- let content = format!("test content {}", i);
1114
- extract_bytes(content.as_bytes(), "text/plain", &config_clone).await
1115
- });
1116
- }
1117
-
1118
- let mut success_count = 0;
1119
- while let Some(task_result) = tasks.join_next().await {
1120
- if let Ok(Ok(_)) = task_result {
1121
- success_count += 1;
1122
- }
1123
- }
1124
-
1125
- assert_eq!(success_count, 50);
1126
- }
1127
-
1128
- #[serial]
1129
- #[tokio::test]
1130
- async fn test_concurrent_extractions_different_mimes() {
1131
- use tokio::task::JoinSet;
1132
-
1133
- let config = Arc::new(ExtractionConfig::default());
1134
- let mut tasks = JoinSet::new();
1135
-
1136
- let mime_types = ["text/plain", "text/markdown"];
1137
-
1138
- for i in 0..30 {
1139
- let config_clone = Arc::clone(&config);
1140
- let mime = mime_types[i % mime_types.len()];
1141
- tasks.spawn(async move {
1142
- let content = format!("test {}", i);
1143
- extract_bytes(content.as_bytes(), mime, &config_clone).await
1144
- });
1145
- }
1146
-
1147
- let mut success_count = 0;
1148
- while let Some(task_result) = tasks.join_next().await {
1149
- if let Ok(Ok(_)) = task_result {
1150
- success_count += 1;
1151
- }
1152
- }
1153
-
1154
- assert_eq!(success_count, 30);
1155
- }
1156
- }
1
+ //! Main extraction entry points.
2
+ //!
3
+ //! This module provides the primary API for extracting content from files and byte arrays.
4
+ //! It orchestrates the entire extraction pipeline: cache checking, MIME detection,
5
+ //! extractor selection, extraction, post-processing, and cache storage.
6
+ //!
7
+ //! # Functions
8
+ //!
9
+ //! - [`extract_file`] - Extract content from a file path
10
+ //! - [`extract_bytes`] - Extract content from a byte array
11
+ //! - [`batch_extract_file`] - Extract content from multiple files concurrently
12
+ //! - [`batch_extract_bytes`] - Extract content from multiple byte arrays concurrently
13
+
14
+ use crate::core::config::ExtractionConfig;
15
+ use crate::core::mime::{LEGACY_POWERPOINT_MIME_TYPE, LEGACY_WORD_MIME_TYPE};
16
+ #[cfg(feature = "office")]
17
+ use crate::extraction::libreoffice::{convert_doc_to_docx, convert_ppt_to_pptx};
18
+ use crate::plugins::DocumentExtractor;
19
+ use crate::types::ExtractionResult;
20
+ #[cfg(feature = "office")]
21
+ use crate::types::LibreOfficeConversionResult;
22
+ use crate::{KreuzbergError, Result};
23
+ #[cfg(feature = "tokio-runtime")]
24
+ use once_cell::sync::Lazy;
25
+ #[cfg(feature = "office")]
26
+ use serde_json::json;
27
+ use std::path::Path;
28
+ use std::sync::Arc;
29
+
30
/// Flags the current tracing span as failed and attaches the error's details.
///
/// Three values are recorded on the span returned by `tracing::Span::current()`:
///
/// * `otel.status_code` - always the string `"ERROR"`
/// * `error.type` - the `Debug` rendering of the error
/// * `error.message` - the `Display` rendering of the error
///
/// Only compiled when the `otel` feature is enabled.
///
/// # Arguments
///
/// * `error` - The error whose details are written to the span
///
/// # Example
///
/// ```rust,ignore
/// let outcome = extract_file("doc.pdf", None, &config).await;
/// if let Err(ref e) = outcome {
///     record_error(e);
/// }
/// ```
// NOTE(review): `Span::record` only affects fields that were declared when the
// span was created - confirm the instrumented spans declare these three fields.
#[cfg(feature = "otel")]
fn record_error(error: &KreuzbergError) {
    let debug_repr = format!("{:?}", error);
    let message = error.to_string();

    tracing::Span::current()
        .record("otel.status_code", "ERROR")
        .record("error.type", debug_repr)
        .record("error.message", message);
}
56
+
57
/// Reduces a path to its final component so full paths never reach telemetry.
///
/// Only compiled when the `otel` feature is enabled.
///
/// # Arguments
///
/// * `path` - The path to reduce
///
/// # Returns
///
/// The file name as an owned `String`. If the path has no final component, or
/// that component is not valid UTF-8, the literal `"unknown"` is returned.
///
/// # Example
///
/// ```rust,ignore
/// let path = Path::new("/home/user/documents/secret.pdf");
/// assert_eq!(sanitize_path(path), "secret.pdf");
/// ```
#[cfg(feature = "otel")]
fn sanitize_path(path: &Path) -> String {
    match path.file_name().and_then(|name| name.to_str()) {
        Some(name) => name.to_owned(),
        None => String::from("unknown"),
    }
}
88
+
89
/// Process-wide Tokio runtime backing the blocking (`*_sync`) entry points.
///
/// The runtime is built lazily on first access as a multi-threaded runtime with
/// all drivers enabled, and the same instance is reused by every sync wrapper
/// instead of constructing a new runtime per call.
///
/// # Panics
///
/// The first access panics if the runtime cannot be built. This is deliberate:
/// a failure here means nothing that depends on the runtime can work, so failing
/// fast is preferred over returning an error from every sync operation.
///
/// # Availability
///
/// Only defined when the `tokio-runtime` feature is enabled.
#[cfg(feature = "tokio-runtime")]
static GLOBAL_RUNTIME: Lazy<tokio::runtime::Runtime> = Lazy::new(|| {
    let mut builder = tokio::runtime::Builder::new_multi_thread();
    builder.enable_all();
    builder
        .build()
        .expect("Failed to create global Tokio runtime - system may be out of resources")
});
113
+
114
+ /// Get an extractor from the registry.
115
+ ///
116
+ /// This function acquires the registry read lock and retrieves the appropriate
117
+ /// extractor for the given MIME type.
118
+ ///
119
+ /// # Performance
120
+ ///
121
+ /// RwLock read + HashMap lookup is ~100ns, fast enough without caching.
122
+ /// Removed thread-local cache to avoid Tokio work-stealing scheduler issues.
123
+ fn get_extractor(mime_type: &str) -> Result<Arc<dyn DocumentExtractor>> {
124
+ let registry = crate::plugins::registry::get_document_extractor_registry();
125
+ let registry_read = registry
126
+ .read()
127
+ .map_err(|e| KreuzbergError::Other(format!("Document extractor registry lock poisoned: {}", e)))?;
128
+ registry_read.get(mime_type)
129
+ }
130
+
131
+ /// Extract content from a file.
132
+ ///
133
+ /// This is the main entry point for file-based extraction. It performs the following steps:
134
+ /// 1. Check cache for existing result (if caching enabled)
135
+ /// 2. Detect or validate MIME type
136
+ /// 3. Select appropriate extractor from registry
137
+ /// 4. Extract content
138
+ /// 5. Run post-processing pipeline
139
+ /// 6. Store result in cache (if caching enabled)
140
+ ///
141
+ /// # Arguments
142
+ ///
143
+ /// * `path` - Path to the file to extract
144
+ /// * `mime_type` - Optional MIME type override. If None, will be auto-detected
145
+ /// * `config` - Extraction configuration
146
+ ///
147
+ /// # Returns
148
+ ///
149
+ /// An `ExtractionResult` containing the extracted content and metadata.
150
+ ///
151
+ /// # Errors
152
+ ///
153
+ /// Returns `KreuzbergError::Validation` if the file doesn't exist or path is invalid.
154
+ /// Returns `KreuzbergError::UnsupportedFormat` if MIME type is not supported.
155
+ /// Returns `KreuzbergError::Io` for file I/O errors (these always bubble up).
156
+ ///
157
+ /// # Example
158
+ ///
159
+ /// ```rust,no_run
160
+ /// use kreuzberg::core::extractor::extract_file;
161
+ /// use kreuzberg::core::config::ExtractionConfig;
162
+ ///
163
+ /// # async fn example() -> kreuzberg::Result<()> {
164
+ /// let config = ExtractionConfig::default();
165
+ /// let result = extract_file("document.pdf", None, &config).await?;
166
+ /// println!("Content: {}", result.content);
167
+ /// # Ok(())
168
+ /// # }
169
+ /// ```
170
+ #[cfg_attr(feature = "otel", tracing::instrument(
171
+ skip(config, path),
172
+ fields(
173
+ extraction.filename = tracing::field::Empty,
174
+ )
175
+ ))]
176
+ pub async fn extract_file(
177
+ path: impl AsRef<Path>,
178
+ mime_type: Option<&str>,
179
+ config: &ExtractionConfig,
180
+ ) -> Result<ExtractionResult> {
181
+ use crate::core::{io, mime};
182
+
183
+ let path = path.as_ref();
184
+
185
+ #[cfg(feature = "otel")]
186
+ {
187
+ let span = tracing::Span::current();
188
+ span.record("extraction.filename", sanitize_path(path));
189
+ }
190
+
191
+ let result = async {
192
+ io::validate_file_exists(path)?;
193
+
194
+ let detected_mime = mime::detect_or_validate(Some(path), mime_type)?;
195
+
196
+ match detected_mime.as_str() {
197
+ #[cfg(feature = "office")]
198
+ LEGACY_WORD_MIME_TYPE => {
199
+ let original_bytes = tokio::fs::read(path).await?;
200
+ let conversion = convert_doc_to_docx(&original_bytes).await?;
201
+ let mut result =
202
+ extract_bytes_with_extractor(&conversion.converted_bytes, &conversion.target_mime, config).await?;
203
+ apply_libreoffice_metadata(&mut result, LEGACY_WORD_MIME_TYPE, &conversion);
204
+ return Ok(result);
205
+ }
206
+ #[cfg(not(feature = "office"))]
207
+ LEGACY_WORD_MIME_TYPE => {
208
+ return Err(KreuzbergError::UnsupportedFormat(
209
+ "Legacy Word conversion requires the `office` feature or LibreOffice support".to_string(),
210
+ ));
211
+ }
212
+ #[cfg(feature = "office")]
213
+ LEGACY_POWERPOINT_MIME_TYPE => {
214
+ let original_bytes = tokio::fs::read(path).await?;
215
+ let conversion = convert_ppt_to_pptx(&original_bytes).await?;
216
+ let mut result =
217
+ extract_bytes_with_extractor(&conversion.converted_bytes, &conversion.target_mime, config).await?;
218
+ apply_libreoffice_metadata(&mut result, LEGACY_POWERPOINT_MIME_TYPE, &conversion);
219
+ return Ok(result);
220
+ }
221
+ #[cfg(not(feature = "office"))]
222
+ LEGACY_POWERPOINT_MIME_TYPE => {
223
+ return Err(KreuzbergError::UnsupportedFormat(
224
+ "Legacy PowerPoint conversion requires the `office` feature or LibreOffice support".to_string(),
225
+ ));
226
+ }
227
+ _ => {}
228
+ }
229
+
230
+ extract_file_with_extractor(path, &detected_mime, config).await
231
+ }
232
+ .await;
233
+
234
+ #[cfg(feature = "otel")]
235
+ if let Err(ref e) = result {
236
+ record_error(e);
237
+ }
238
+
239
+ result
240
+ }
241
+
242
+ /// Extract content from a byte array.
243
+ #[cfg_attr(feature = "otel", tracing::instrument(
244
+ skip(config, content),
245
+ fields(
246
+ extraction.mime_type = mime_type,
247
+ extraction.size_bytes = content.len(),
248
+ )
249
+ ))]
250
+ pub async fn extract_bytes(content: &[u8], mime_type: &str, config: &ExtractionConfig) -> Result<ExtractionResult> {
251
+ use crate::core::mime;
252
+
253
+ let result = async {
254
+ let validated_mime = mime::validate_mime_type(mime_type)?;
255
+
256
+ match validated_mime.as_str() {
257
+ #[cfg(feature = "office")]
258
+ LEGACY_WORD_MIME_TYPE => {
259
+ let conversion = convert_doc_to_docx(content).await?;
260
+ let mut result =
261
+ extract_bytes_with_extractor(&conversion.converted_bytes, &conversion.target_mime, config).await?;
262
+ apply_libreoffice_metadata(&mut result, LEGACY_WORD_MIME_TYPE, &conversion);
263
+ return Ok(result);
264
+ }
265
+ #[cfg(not(feature = "office"))]
266
+ LEGACY_WORD_MIME_TYPE => {
267
+ return Err(KreuzbergError::UnsupportedFormat(
268
+ "Legacy Word conversion requires the `office` feature or LibreOffice support".to_string(),
269
+ ));
270
+ }
271
+ #[cfg(feature = "office")]
272
+ LEGACY_POWERPOINT_MIME_TYPE => {
273
+ let conversion = convert_ppt_to_pptx(content).await?;
274
+ let mut result =
275
+ extract_bytes_with_extractor(&conversion.converted_bytes, &conversion.target_mime, config).await?;
276
+ apply_libreoffice_metadata(&mut result, LEGACY_POWERPOINT_MIME_TYPE, &conversion);
277
+ return Ok(result);
278
+ }
279
+ #[cfg(not(feature = "office"))]
280
+ LEGACY_POWERPOINT_MIME_TYPE => {
281
+ return Err(KreuzbergError::UnsupportedFormat(
282
+ "Legacy PowerPoint conversion requires the `office` feature or LibreOffice support".to_string(),
283
+ ));
284
+ }
285
+ _ => {}
286
+ }
287
+
288
+ extract_bytes_with_extractor(content, &validated_mime, config).await
289
+ }
290
+ .await;
291
+
292
+ #[cfg(feature = "otel")]
293
+ if let Err(ref e) = result {
294
+ record_error(e);
295
+ }
296
+
297
+ result
298
+ }
299
+
300
/// Extract content from multiple files concurrently.
///
/// Each path is extracted in its own spawned task, wrapped in batch mode. A
/// semaphore bounds how many extractions run at once. The limit comes from
/// `ExtractionConfig::max_concurrent_extractions`, defaulting to `num_cpus * 2`,
/// and is clamped to at least 1 so that a configured limit of `0` cannot stall
/// the batch.
///
/// # Arguments
///
/// * `paths` - Vector of file paths to extract
/// * `config` - Extraction configuration (cloned once and shared by all tasks)
///
/// # Returns
///
/// A vector of `ExtractionResult` in the same order as the input paths. An empty
/// input yields an empty vector.
///
/// # Errors
///
/// * A `KreuzbergError::Io` from any file fails the whole batch and is returned as-is.
/// * A task that could not be joined fails the whole batch with `KreuzbergError::Other`.
/// * Any other per-file error does not fail the batch: that file's slot holds a
///   placeholder result whose `content` is `"Error: <message>"`, whose `mime_type`
///   is `"text/plain"`, and whose `metadata.error` carries the error type and message.
#[cfg(feature = "tokio-runtime")]
#[cfg_attr(feature = "otel", tracing::instrument(
    skip(config, paths),
    fields(
        extraction.batch_size = paths.len(),
    )
))]
pub async fn batch_extract_file(
    paths: Vec<impl AsRef<Path>>,
    config: &ExtractionConfig,
) -> Result<Vec<ExtractionResult>> {
    use std::sync::Arc;
    use tokio::sync::Semaphore;
    use tokio::task::JoinSet;

    if paths.is_empty() {
        return Ok(vec![]);
    }

    let config = Arc::new(config.clone());

    // A limit of 0 would create a semaphore without permits: every task would
    // wait on `acquire()` forever and the batch would never finish.
    let max_concurrent = config
        .max_concurrent_extractions
        .unwrap_or_else(|| num_cpus::get() * 2)
        .max(1);
    let semaphore = Arc::new(Semaphore::new(max_concurrent));

    let mut tasks = JoinSet::new();

    for (index, path) in paths.into_iter().enumerate() {
        let path_buf = path.as_ref().to_path_buf();
        let config_clone = Arc::clone(&config);
        let semaphore_clone = Arc::clone(&semaphore);

        tasks.spawn(async move {
            // The semaphore is never closed, so acquiring a permit cannot fail.
            let _permit = semaphore_clone.acquire().await.unwrap();
            let result =
                crate::core::batch_mode::with_batch_mode(async { extract_file(&path_buf, None, &config_clone).await })
                    .await;
            // Carry the input index along so results can be restored to input order.
            (index, result)
        });
    }

    let mut results: Vec<Option<ExtractionResult>> = vec![None; tasks.len()];

    while let Some(task_result) = tasks.join_next().await {
        match task_result {
            Ok((index, Ok(result))) => {
                results[index] = Some(result);
            }
            Ok((index, Err(e))) => {
                // OSError/RuntimeError must bubble up - system errors need user reports ~keep
                if matches!(e, KreuzbergError::Io(_)) {
                    return Err(e);
                }

                use crate::types::{ErrorMetadata, Metadata};
                let metadata = Metadata {
                    error: Some(ErrorMetadata {
                        error_type: format!("{:?}", e),
                        message: e.to_string(),
                    }),
                    ..Default::default()
                };

                results[index] = Some(ExtractionResult {
                    content: format!("Error: {}", e),
                    mime_type: "text/plain".to_string(),
                    metadata,
                    tables: vec![],
                    detected_languages: None,
                    chunks: None,
                    images: None,
                    pages: None,
                });
            }
            Err(join_err) => {
                return Err(KreuzbergError::Other(format!("Task panicked: {}", join_err)));
            }
        }
    }

    // Every task either filled its slot above or caused an early return, so no
    // slot can still be `None` here.
    #[allow(clippy::unwrap_used)]
    Ok(results.into_iter().map(|r| r.unwrap()).collect())
}
402
+
403
/// Extract content from multiple byte arrays concurrently.
///
/// Each `(bytes, mime_type)` pair is copied into owned storage and extracted in
/// its own spawned task, wrapped in batch mode. A semaphore bounds how many
/// extractions run at once. The limit comes from
/// `ExtractionConfig::max_concurrent_extractions`, defaulting to `num_cpus * 2`,
/// and is clamped to at least 1 so that a configured limit of `0` cannot stall
/// the batch.
///
/// # Arguments
///
/// * `contents` - Vector of (bytes, mime_type) tuples
/// * `config` - Extraction configuration (cloned once and shared by all tasks)
///
/// # Returns
///
/// A vector of `ExtractionResult` in the same order as the input. An empty input
/// yields an empty vector.
///
/// # Errors
///
/// * A `KreuzbergError::Io` from any item fails the whole batch and is returned as-is.
/// * A task that could not be joined fails the whole batch with `KreuzbergError::Other`.
/// * Any other per-item error does not fail the batch: that item's slot holds a
///   placeholder result whose `content` is `"Error: <message>"`, whose `mime_type`
///   is `"text/plain"`, and whose `metadata.error` carries the error type and message.
#[cfg(feature = "tokio-runtime")]
#[cfg_attr(feature = "otel", tracing::instrument(
    skip(config, contents),
    fields(
        extraction.batch_size = contents.len(),
    )
))]
pub async fn batch_extract_bytes(
    contents: Vec<(&[u8], &str)>,
    config: &ExtractionConfig,
) -> Result<Vec<ExtractionResult>> {
    use std::sync::Arc;
    use tokio::sync::Semaphore;
    use tokio::task::JoinSet;

    if contents.is_empty() {
        return Ok(vec![]);
    }

    let batch_config = config.clone();
    let config = Arc::new(batch_config);

    // A limit of 0 would create a semaphore without permits: every task would
    // wait on `acquire()` forever and the batch would never finish.
    let max_concurrent = config
        .max_concurrent_extractions
        .unwrap_or_else(|| num_cpus::get() * 2)
        .max(1);
    let semaphore = Arc::new(Semaphore::new(max_concurrent));

    // Spawned tasks cannot borrow from the caller, so each input is copied into
    // owned buffers before being moved into its task.
    let owned_contents: Vec<(Vec<u8>, String)> = contents
        .into_iter()
        .map(|(bytes, mime)| (bytes.to_vec(), mime.to_string()))
        .collect();

    let mut tasks = JoinSet::new();

    for (index, (bytes, mime_type)) in owned_contents.into_iter().enumerate() {
        let config_clone = Arc::clone(&config);
        let semaphore_clone = Arc::clone(&semaphore);

        tasks.spawn(async move {
            // The semaphore is never closed, so acquiring a permit cannot fail.
            let _permit = semaphore_clone.acquire().await.unwrap();
            let result = crate::core::batch_mode::with_batch_mode(async {
                extract_bytes(&bytes, &mime_type, &config_clone).await
            })
            .await;
            // Carry the input index along so results can be restored to input order.
            (index, result)
        });
    }

    let mut results: Vec<Option<ExtractionResult>> = vec![None; tasks.len()];

    while let Some(task_result) = tasks.join_next().await {
        match task_result {
            Ok((index, Ok(result))) => {
                results[index] = Some(result);
            }
            Ok((index, Err(e))) => {
                // OSError/RuntimeError must bubble up - system errors need user reports ~keep
                if matches!(e, KreuzbergError::Io(_)) {
                    return Err(e);
                }

                use crate::types::{ErrorMetadata, Metadata};
                let metadata = Metadata {
                    error: Some(ErrorMetadata {
                        error_type: format!("{:?}", e),
                        message: e.to_string(),
                    }),
                    ..Default::default()
                };

                results[index] = Some(ExtractionResult {
                    content: format!("Error: {}", e),
                    mime_type: "text/plain".to_string(),
                    metadata,
                    tables: vec![],
                    detected_languages: None,
                    chunks: None,
                    images: None,
                    pages: None,
                });
            }
            Err(join_err) => {
                return Err(KreuzbergError::Other(format!("Task panicked: {}", join_err)));
            }
        }
    }

    // Every task either filled its slot above or caused an early return, so no
    // slot can still be `None` here.
    #[allow(clippy::unwrap_used)]
    Ok(results.into_iter().map(|r| r.unwrap()).collect())
}
506
+
507
/// Blocking counterpart of [`extract_file`].
///
/// Drives the async `extract_file` future to completion on the shared global
/// Tokio runtime, blocking the calling thread until extraction finishes. Async
/// callers should await `extract_file` directly instead.
///
/// Only available with the `tokio-runtime` feature.
///
/// # Arguments
///
/// * `path` - Path to the file to extract
/// * `mime_type` - Optional MIME type override
/// * `config` - Extraction configuration
///
/// # Errors
///
/// Returns whatever `extract_file` returns.
// NOTE(review): tokio documents that `Runtime::block_on` panics when called from
// inside an async execution context - confirm callers never do that.
#[cfg(feature = "tokio-runtime")]
pub fn extract_file_sync(
    path: impl AsRef<Path>,
    mime_type: Option<&str>,
    config: &ExtractionConfig,
) -> Result<ExtractionResult> {
    let runtime: &tokio::runtime::Runtime = &GLOBAL_RUNTIME;
    runtime.block_on(extract_file(path, mime_type, config))
}
525
+
526
/// Blocking counterpart of [`extract_bytes`] (Tokio-backed version).
///
/// Drives the async `extract_bytes` future to completion on the shared global
/// Tokio runtime, blocking the calling thread until extraction finishes.
///
/// This variant is compiled when the `tokio-runtime` feature is enabled; a
/// runtime-free variant with the same signature exists for builds without it.
///
/// # Errors
///
/// Returns whatever `extract_bytes` returns.
#[cfg(feature = "tokio-runtime")]
pub fn extract_bytes_sync(content: &[u8], mime_type: &str, config: &ExtractionConfig) -> Result<ExtractionResult> {
    let runtime: &tokio::runtime::Runtime = &GLOBAL_RUNTIME;
    runtime.block_on(extract_bytes(content, mime_type, config))
}
537
+
538
+ /// Synchronous wrapper for `extract_bytes` (WASM-compatible version).
539
+ ///
540
+ /// This is a truly synchronous implementation without tokio runtime dependency.
541
+ /// It calls `extract_bytes_sync_impl()` to perform the extraction.
542
+ #[cfg(not(feature = "tokio-runtime"))]
543
+ pub fn extract_bytes_sync(content: &[u8], mime_type: &str, config: &ExtractionConfig) -> Result<ExtractionResult> {
544
+ extract_bytes_sync_impl(content.to_vec(), Some(mime_type.to_string()), Some(config.clone()))
545
+ }
546
+
547
/// Blocking counterpart of [`batch_extract_file`].
///
/// Drives the async `batch_extract_file` future to completion on the shared
/// global Tokio runtime, blocking the calling thread until the batch finishes.
///
/// Only available with the `tokio-runtime` feature.
///
/// # Arguments
///
/// * `paths` - Vector of file paths to extract
/// * `config` - Extraction configuration
///
/// # Errors
///
/// Returns whatever `batch_extract_file` returns.
#[cfg(feature = "tokio-runtime")]
pub fn batch_extract_file_sync(
    paths: Vec<impl AsRef<Path>>,
    config: &ExtractionConfig,
) -> Result<Vec<ExtractionResult>> {
    let runtime: &tokio::runtime::Runtime = &GLOBAL_RUNTIME;
    runtime.block_on(batch_extract_file(paths, config))
}
561
+
562
/// Blocking counterpart of [`batch_extract_bytes`] (Tokio-backed version).
///
/// Drives the async `batch_extract_bytes` future to completion on the shared
/// global Tokio runtime, blocking the calling thread until the batch finishes.
///
/// This variant is compiled when the `tokio-runtime` feature is enabled; a
/// runtime-free variant with the same signature exists for builds without it.
///
/// # Arguments
///
/// * `contents` - Vector of (bytes, mime_type) tuples
/// * `config` - Extraction configuration
///
/// # Errors
///
/// Returns whatever `batch_extract_bytes` returns.
#[cfg(feature = "tokio-runtime")]
pub fn batch_extract_bytes_sync(
    contents: Vec<(&[u8], &str)>,
    config: &ExtractionConfig,
) -> Result<Vec<ExtractionResult>> {
    let runtime: &tokio::runtime::Runtime = &GLOBAL_RUNTIME;
    runtime.block_on(batch_extract_bytes(contents, config))
}
577
+
578
+ /// Synchronous wrapper for `batch_extract_bytes` (WASM-compatible version).
579
+ ///
580
+ /// This is a truly synchronous implementation that iterates through items
581
+ /// and calls `extract_bytes_sync()` for each.
582
+ #[cfg(not(feature = "tokio-runtime"))]
583
+ pub fn batch_extract_bytes_sync(
584
+ contents: Vec<(&[u8], &str)>,
585
+ config: &ExtractionConfig,
586
+ ) -> Result<Vec<ExtractionResult>> {
587
+ let mut results = Vec::with_capacity(contents.len());
588
+ for (content, mime_type) in contents {
589
+ let result = extract_bytes_sync(content, mime_type, config);
590
+ results.push(result.unwrap_or_else(|e| {
591
+ use crate::types::{ErrorMetadata, Metadata};
592
+ ExtractionResult {
593
+ content: format!("Error: {}", e),
594
+ mime_type: "text/plain".to_string(),
595
+ metadata: Metadata {
596
+ error: Some(ErrorMetadata {
597
+ error_type: format!("{:?}", e),
598
+ message: e.to_string(),
599
+ }),
600
+ ..Default::default()
601
+ },
602
+ tables: vec![],
603
+ detected_languages: None,
604
+ chunks: None,
605
+ images: None,
606
+ pages: None,
607
+ }
608
+ }));
609
+ }
610
+ Ok(results)
611
+ }
612
+
613
+ /// Synchronous extraction implementation for WASM compatibility.
614
+ ///
615
+ /// This function performs extraction without requiring a tokio runtime.
616
+ /// It calls the sync extractor methods directly.
617
+ ///
618
+ /// # Arguments
619
+ ///
620
+ /// * `content` - The byte content to extract
621
+ /// * `mime_type` - Optional MIME type to validate/use
622
+ /// * `config` - Optional extraction configuration
623
+ ///
624
+ /// # Returns
625
+ ///
626
+ /// An `ExtractionResult` or a `KreuzbergError`
627
+ ///
628
+ /// # Implementation Notes
629
+ ///
630
+ /// This is called when the `tokio-runtime` feature is disabled.
631
+ /// It replicates the logic of `extract_bytes` but uses synchronous extractor methods.
632
+ #[cfg(not(feature = "tokio-runtime"))]
633
+ fn extract_bytes_sync_impl(
634
+ content: Vec<u8>,
635
+ mime_type: Option<String>,
636
+ config: Option<ExtractionConfig>,
637
+ ) -> Result<ExtractionResult> {
638
+ use crate::core::mime;
639
+
640
+ let config = config.unwrap_or_default();
641
+
642
+ // Validate MIME type if provided
643
+ let validated_mime = if let Some(mime) = mime_type {
644
+ mime::validate_mime_type(&mime)?
645
+ } else {
646
+ return Err(KreuzbergError::Validation {
647
+ message: "MIME type is required for synchronous extraction".to_string(),
648
+ source: None,
649
+ });
650
+ };
651
+
652
+ // Ensure extractors are initialized
653
+ crate::extractors::ensure_initialized()?;
654
+
655
+ // Get the appropriate extractor
656
+ let extractor = get_extractor(&validated_mime)?;
657
+
658
+ // Check if extractor supports synchronous extraction
659
+ let sync_extractor = extractor.as_sync_extractor().ok_or_else(|| {
660
+ KreuzbergError::UnsupportedFormat(format!(
661
+ "Extractor for '{}' does not support synchronous extraction",
662
+ validated_mime
663
+ ))
664
+ })?;
665
+
666
+ // Call the sync extract method
667
+ let mut result = sync_extractor.extract_sync(&content, &validated_mime, &config)?;
668
+
669
+ // Run post-processing pipeline (sync version)
670
+ result = crate::core::pipeline::run_pipeline_sync(result, &config)?;
671
+
672
+ Ok(result)
673
+ }
674
+
675
+ async fn extract_file_with_extractor(
676
+ path: &Path,
677
+ mime_type: &str,
678
+ config: &ExtractionConfig,
679
+ ) -> Result<ExtractionResult> {
680
+ crate::extractors::ensure_initialized()?;
681
+
682
+ let extractor = get_extractor(mime_type)?;
683
+ let mut result = extractor.extract_file(path, mime_type, config).await?;
684
+ result = crate::core::pipeline::run_pipeline(result, config).await?;
685
+ Ok(result)
686
+ }
687
+
688
+ async fn extract_bytes_with_extractor(
689
+ content: &[u8],
690
+ mime_type: &str,
691
+ config: &ExtractionConfig,
692
+ ) -> Result<ExtractionResult> {
693
+ crate::extractors::ensure_initialized()?;
694
+
695
+ let extractor = get_extractor(mime_type)?;
696
+ let mut result = extractor.extract_bytes(content, mime_type, config).await?;
697
+ result = crate::core::pipeline::run_pipeline(result, config).await?;
698
+ Ok(result)
699
+ }
700
+
701
/// Stamps a converted document's result with its original (legacy) identity.
///
/// The result's MIME type is overwritten with `legacy_mime`, and a
/// `libreoffice_conversion` entry describing the conversion (converter name,
/// original format, target format, target MIME type) is inserted into the
/// result's additional metadata.
///
/// Only compiled when the `office` feature is enabled.
///
/// # Arguments
///
/// * `result` - Extraction result produced from the converted bytes; modified in place
/// * `legacy_mime` - MIME type of the original, pre-conversion document
/// * `conversion` - Details of the LibreOffice conversion that was performed
#[cfg(feature = "office")]
fn apply_libreoffice_metadata(
    result: &mut ExtractionResult,
    legacy_mime: &str,
    conversion: &LibreOfficeConversionResult,
) {
    let conversion_details = json!({
        "converter": "libreoffice",
        "original_format": conversion.original_format,
        "target_format": conversion.target_format,
        "target_mime": conversion.target_mime,
    });

    result.mime_type = legacy_mime.to_owned();
    result
        .metadata
        .additional
        .insert(String::from("libreoffice_conversion"), conversion_details);
}
718
+
719
+ #[cfg(test)]
720
+ mod tests {
721
+ use super::*;
722
+ use serial_test::serial;
723
+ use std::fs::File;
724
+ use std::io::Write;
725
+ use tempfile::tempdir;
726
+
727
+ fn assert_text_content(actual: &str, expected: &str) {
728
+ assert_eq!(actual.trim_end_matches('\n'), expected);
729
+ }
730
+
731
+ #[tokio::test]
732
+ async fn test_extract_file_basic() {
733
+ let dir = tempdir().unwrap();
734
+ let file_path = dir.path().join("test.txt");
735
+ let mut file = File::create(&file_path).unwrap();
736
+ file.write_all(b"Hello, world!").unwrap();
737
+
738
+ let config = ExtractionConfig::default();
739
+ let result = extract_file(&file_path, None, &config).await;
740
+
741
+ assert!(result.is_ok());
742
+ let result = result.unwrap();
743
+ assert_text_content(&result.content, "Hello, world!");
744
+ assert_eq!(result.mime_type, "text/plain");
745
+ }
746
+
747
+ #[tokio::test]
748
+ async fn test_extract_file_with_mime_override() {
749
+ let dir = tempdir().unwrap();
750
+ let file_path = dir.path().join("test.dat");
751
+ let mut file = File::create(&file_path).unwrap();
752
+ file.write_all(b"test content").unwrap();
753
+
754
+ let config = ExtractionConfig::default();
755
+ let result = extract_file(&file_path, Some("text/plain"), &config).await;
756
+
757
+ assert!(result.is_ok());
758
+ let result = result.unwrap();
759
+ assert_eq!(result.mime_type, "text/plain");
760
+ }
761
+
762
+ #[tokio::test]
763
+ async fn test_extract_file_nonexistent() {
764
+ let config = ExtractionConfig::default();
765
+ let result = extract_file("/nonexistent/file.txt", None, &config).await;
766
+ assert!(result.is_err());
767
+ }
768
+
769
+ #[tokio::test]
770
+ async fn test_extract_bytes_basic() {
771
+ let config = ExtractionConfig::default();
772
+ let result = extract_bytes(b"test content", "text/plain", &config).await;
773
+
774
+ assert!(result.is_ok());
775
+ let result = result.unwrap();
776
+ assert_text_content(&result.content, "test content");
777
+ assert_eq!(result.mime_type, "text/plain");
778
+ }
779
+
780
+ #[tokio::test]
781
+ async fn test_extract_bytes_invalid_mime() {
782
+ let config = ExtractionConfig::default();
783
+ let result = extract_bytes(b"test", "invalid/mime", &config).await;
784
+ assert!(result.is_err());
785
+ }
786
+
787
+ #[tokio::test]
788
+ async fn test_batch_extract_file() {
789
+ let dir = tempdir().unwrap();
790
+
791
+ let file1 = dir.path().join("test1.txt");
792
+ let file2 = dir.path().join("test2.txt");
793
+
794
+ File::create(&file1).unwrap().write_all(b"content 1").unwrap();
795
+ File::create(&file2).unwrap().write_all(b"content 2").unwrap();
796
+
797
+ let config = ExtractionConfig::default();
798
+ let paths = vec![file1, file2];
799
+ let results = batch_extract_file(paths, &config).await;
800
+
801
+ assert!(results.is_ok());
802
+ let results = results.unwrap();
803
+ assert_eq!(results.len(), 2);
804
+ assert_text_content(&results[0].content, "content 1");
805
+ assert_text_content(&results[1].content, "content 2");
806
+ }
807
+
808
+ #[tokio::test]
809
+ async fn test_batch_extract_file_empty() {
810
+ let config = ExtractionConfig::default();
811
+ let paths: Vec<std::path::PathBuf> = vec![];
812
+ let results = batch_extract_file(paths, &config).await;
813
+
814
+ assert!(results.is_ok());
815
+ assert_eq!(results.unwrap().len(), 0);
816
+ }
817
+
818
+ #[tokio::test]
819
+ async fn test_batch_extract_bytes() {
820
+ let config = ExtractionConfig::default();
821
+ let contents = vec![
822
+ (b"content 1".as_slice(), "text/plain"),
823
+ (b"content 2".as_slice(), "text/plain"),
824
+ ];
825
+ let results = batch_extract_bytes(contents, &config).await;
826
+
827
+ assert!(results.is_ok());
828
+ let results = results.unwrap();
829
+ assert_eq!(results.len(), 2);
830
+ assert_text_content(&results[0].content, "content 1");
831
+ assert_text_content(&results[1].content, "content 2");
832
+ }
833
+
834
+ #[test]
835
+ fn test_sync_wrappers() {
836
+ let dir = tempdir().unwrap();
837
+ let file_path = dir.path().join("test.txt");
838
+ File::create(&file_path).unwrap().write_all(b"sync test").unwrap();
839
+
840
+ let config = ExtractionConfig::default();
841
+
842
+ let result = extract_file_sync(&file_path, None, &config);
843
+ assert!(result.is_ok());
844
+ let result = result.unwrap();
845
+ assert_text_content(&result.content, "sync test");
846
+
847
+ let result = extract_bytes_sync(b"test", "text/plain", &config);
848
+ assert!(result.is_ok());
849
+ }
850
+
851
+ #[tokio::test]
852
+ async fn test_extractor_cache() {
853
+ let config = ExtractionConfig::default();
854
+
855
+ let result1 = extract_bytes(b"test 1", "text/plain", &config).await;
856
+ assert!(result1.is_ok());
857
+ let result1 = result1.unwrap();
858
+
859
+ let result2 = extract_bytes(b"test 2", "text/plain", &config).await;
860
+ assert!(result2.is_ok());
861
+ let result2 = result2.unwrap();
862
+
863
+ assert_text_content(&result1.content, "test 1");
864
+ assert_text_content(&result2.content, "test 2");
865
+
866
+ let result3 = extract_bytes(b"# test 3", "text/markdown", &config).await;
867
+ assert!(result3.is_ok());
868
+ }
869
+
870
+ #[tokio::test]
871
+ async fn test_extract_file_empty() {
872
+ let dir = tempdir().unwrap();
873
+ let file_path = dir.path().join("empty.txt");
874
+ File::create(&file_path).unwrap();
875
+
876
+ let config = ExtractionConfig::default();
877
+ let result = extract_file(&file_path, None, &config).await;
878
+
879
+ assert!(result.is_ok());
880
+ let result = result.unwrap();
881
+ assert_eq!(result.content, "");
882
+ }
883
+
884
+ #[tokio::test]
885
+ async fn test_extract_bytes_empty() {
886
+ let config = ExtractionConfig::default();
887
+ let result = extract_bytes(b"", "text/plain", &config).await;
888
+
889
+ assert!(result.is_ok());
890
+ let result = result.unwrap();
891
+ assert_eq!(result.content, "");
892
+ }
893
+
894
+ #[tokio::test]
895
+ async fn test_extract_file_whitespace_only() {
896
+ let dir = tempdir().unwrap();
897
+ let file_path = dir.path().join("whitespace.txt");
898
+ File::create(&file_path).unwrap().write_all(b" \n\t \n ").unwrap();
899
+
900
+ let config = ExtractionConfig::default();
901
+ let result = extract_file(&file_path, None, &config).await;
902
+
903
+ assert!(result.is_ok());
904
+ }
905
+
906
+ #[tokio::test]
907
+ async fn test_extract_file_very_long_path() {
908
+ let dir = tempdir().unwrap();
909
+ let long_name = "a".repeat(200);
910
+ let file_path = dir.path().join(format!("{}.txt", long_name));
911
+
912
+ if let Ok(mut f) = File::create(&file_path) {
913
+ f.write_all(b"content").unwrap();
914
+ let config = ExtractionConfig::default();
915
+ let result = extract_file(&file_path, None, &config).await;
916
+ assert!(result.is_ok() || result.is_err());
917
+ }
918
+ }
919
+
920
+ #[tokio::test]
921
+ async fn test_extract_file_special_characters_in_path() {
922
+ let dir = tempdir().unwrap();
923
+ let file_path = dir.path().join("test with spaces & symbols!.txt");
924
+ File::create(&file_path).unwrap().write_all(b"content").unwrap();
925
+
926
+ let config = ExtractionConfig::default();
927
+ let result = extract_file(&file_path, None, &config).await;
928
+
929
+ assert!(result.is_ok());
930
+ let result = result.unwrap();
931
+ assert_text_content(&result.content, "content");
932
+ }
933
+
934
+ #[tokio::test]
935
+ async fn test_extract_file_unicode_filename() {
936
+ let dir = tempdir().unwrap();
937
+ let file_path = dir.path().join("测试文件名.txt");
938
+ File::create(&file_path).unwrap().write_all(b"content").unwrap();
939
+
940
+ let config = ExtractionConfig::default();
941
+ let result = extract_file(&file_path, None, &config).await;
942
+
943
+ assert!(result.is_ok());
944
+ }
945
+
946
+ #[tokio::test]
947
+ async fn test_extract_bytes_unsupported_mime() {
948
+ let config = ExtractionConfig::default();
949
+ let result = extract_bytes(b"test", "application/x-unknown-format", &config).await;
950
+
951
+ assert!(result.is_err());
952
+ assert!(matches!(result.unwrap_err(), KreuzbergError::UnsupportedFormat(_)));
953
+ }
954
+
955
+ #[tokio::test]
956
+ async fn test_batch_extract_file_with_errors() {
957
+ let dir = tempdir().unwrap();
958
+
959
+ let valid_file = dir.path().join("valid.txt");
960
+ File::create(&valid_file).unwrap().write_all(b"valid content").unwrap();
961
+
962
+ let invalid_file = dir.path().join("nonexistent.txt");
963
+
964
+ let config = ExtractionConfig::default();
965
+ let paths = vec![valid_file, invalid_file];
966
+ let results = batch_extract_file(paths, &config).await;
967
+
968
+ assert!(results.is_ok());
969
+ let results = results.unwrap();
970
+ assert_eq!(results.len(), 2);
971
+ assert_text_content(&results[0].content, "valid content");
972
+ assert!(results[1].metadata.error.is_some());
973
+ }
974
+
975
+ #[tokio::test]
976
+ async fn test_batch_extract_bytes_mixed_valid_invalid() {
977
+ let config = ExtractionConfig::default();
978
+ let contents = vec![
979
+ (b"valid 1".as_slice(), "text/plain"),
980
+ (b"invalid".as_slice(), "invalid/mime"),
981
+ (b"valid 2".as_slice(), "text/plain"),
982
+ ];
983
+ let results = batch_extract_bytes(contents, &config).await;
984
+
985
+ assert!(results.is_ok());
986
+ let results = results.unwrap();
987
+ assert_eq!(results.len(), 3);
988
+ assert_text_content(&results[0].content, "valid 1");
989
+ assert!(results[1].metadata.error.is_some());
990
+ assert_text_content(&results[2].content, "valid 2");
991
+ }
992
+
993
+ #[tokio::test]
994
+ async fn test_batch_extract_bytes_all_invalid() {
995
+ let config = ExtractionConfig::default();
996
+ let contents = vec![
997
+ (b"test 1".as_slice(), "invalid/mime1"),
998
+ (b"test 2".as_slice(), "invalid/mime2"),
999
+ ];
1000
+ let results = batch_extract_bytes(contents, &config).await;
1001
+
1002
+ assert!(results.is_ok());
1003
+ let results = results.unwrap();
1004
+ assert_eq!(results.len(), 2);
1005
+ assert!(results[0].metadata.error.is_some());
1006
+ assert!(results[1].metadata.error.is_some());
1007
+ }
1008
+
1009
+ #[tokio::test]
1010
+ async fn test_extract_bytes_very_large() {
1011
+ let large_content = vec![b'a'; 10_000_000];
1012
+ let config = ExtractionConfig::default();
1013
+ let result = extract_bytes(&large_content, "text/plain", &config).await;
1014
+
1015
+ assert!(result.is_ok());
1016
+ let result = result.unwrap();
1017
+ let trimmed_len = result.content.trim_end_matches('\n').len();
1018
+ assert_eq!(trimmed_len, 10_000_000);
1019
+ }
1020
+
1021
+ #[tokio::test]
1022
+ async fn test_batch_extract_large_count() {
1023
+ let dir = tempdir().unwrap();
1024
+ let mut paths = Vec::new();
1025
+
1026
+ for i in 0..100 {
1027
+ let file_path = dir.path().join(format!("file{}.txt", i));
1028
+ File::create(&file_path)
1029
+ .unwrap()
1030
+ .write_all(format!("content {}", i).as_bytes())
1031
+ .unwrap();
1032
+ paths.push(file_path);
1033
+ }
1034
+
1035
+ let config = ExtractionConfig::default();
1036
+ let results = batch_extract_file(paths, &config).await;
1037
+
1038
+ assert!(results.is_ok());
1039
+ let results = results.unwrap();
1040
+ assert_eq!(results.len(), 100);
1041
+
1042
+ for (i, result) in results.iter().enumerate() {
1043
+ assert_text_content(&result.content, &format!("content {}", i));
1044
+ }
1045
+ }
1046
+
1047
+ #[tokio::test]
1048
+ async fn test_extract_file_mime_detection_fallback() {
1049
+ let dir = tempdir().unwrap();
1050
+ let file_path = dir.path().join("testfile");
1051
+ File::create(&file_path)
1052
+ .unwrap()
1053
+ .write_all(b"plain text content")
1054
+ .unwrap();
1055
+
1056
+ let config = ExtractionConfig::default();
1057
+ let result = extract_file(&file_path, None, &config).await;
1058
+
1059
+ assert!(result.is_ok() || result.is_err());
1060
+ }
1061
+
1062
+ #[tokio::test]
1063
+ async fn test_extract_file_wrong_mime_override() {
1064
+ let dir = tempdir().unwrap();
1065
+ let file_path = dir.path().join("test.txt");
1066
+ File::create(&file_path).unwrap().write_all(b"plain text").unwrap();
1067
+
1068
+ let config = ExtractionConfig::default();
1069
+ let result = extract_file(&file_path, Some("application/pdf"), &config).await;
1070
+
1071
+ assert!(result.is_err() || result.is_ok());
1072
+ }
1073
+
1074
/// The synchronous wrapper reports a missing file as
/// `KreuzbergError::Validation`.
#[test]
fn test_sync_wrapper_nonexistent_file() {
    let config = ExtractionConfig::default();
    let outcome = extract_file_sync("/nonexistent/path.txt", None, &config);

    assert!(outcome.is_err());
    let failure = outcome.unwrap_err();
    assert!(matches!(failure, KreuzbergError::Validation { .. }));
}
1082
+
1083
/// The synchronous file-batch wrapper accepts an empty path list and returns
/// an empty result list.
#[test]
fn test_sync_wrapper_batch_empty() {
    let config = ExtractionConfig::default();
    let no_paths: Vec<std::path::PathBuf> = Vec::new();

    let batch = batch_extract_file_sync(no_paths, &config);
    assert!(batch.is_ok());

    let entries = batch.unwrap();
    assert_eq!(entries.len(), 0);
}
1092
+
1093
/// The synchronous byte-batch wrapper accepts an empty input list and returns
/// an empty result list.
#[test]
fn test_sync_wrapper_batch_bytes_empty() {
    let config = ExtractionConfig::default();
    let no_inputs: Vec<(&[u8], &str)> = Vec::new();

    let batch = batch_extract_bytes_sync(no_inputs, &config);
    assert!(batch.is_ok());

    let entries = batch.unwrap();
    assert_eq!(entries.len(), 0);
}
1102
+
1103
+ #[tokio::test]
1104
+ async fn test_concurrent_extractions_same_mime() {
1105
+ use tokio::task::JoinSet;
1106
+
1107
+ let config = Arc::new(ExtractionConfig::default());
1108
+ let mut tasks = JoinSet::new();
1109
+
1110
+ for i in 0..50 {
1111
+ let config_clone = Arc::clone(&config);
1112
+ tasks.spawn(async move {
1113
+ let content = format!("test content {}", i);
1114
+ extract_bytes(content.as_bytes(), "text/plain", &config_clone).await
1115
+ });
1116
+ }
1117
+
1118
+ let mut success_count = 0;
1119
+ while let Some(task_result) = tasks.join_next().await {
1120
+ if let Ok(Ok(_)) = task_result {
1121
+ success_count += 1;
1122
+ }
1123
+ }
1124
+
1125
+ assert_eq!(success_count, 50);
1126
+ }
1127
+
1128
+ #[serial]
1129
+ #[tokio::test]
1130
+ async fn test_concurrent_extractions_different_mimes() {
1131
+ use tokio::task::JoinSet;
1132
+
1133
+ let config = Arc::new(ExtractionConfig::default());
1134
+ let mut tasks = JoinSet::new();
1135
+
1136
+ let mime_types = ["text/plain", "text/markdown"];
1137
+
1138
+ for i in 0..30 {
1139
+ let config_clone = Arc::clone(&config);
1140
+ let mime = mime_types[i % mime_types.len()];
1141
+ tasks.spawn(async move {
1142
+ let content = format!("test {}", i);
1143
+ extract_bytes(content.as_bytes(), mime, &config_clone).await
1144
+ });
1145
+ }
1146
+
1147
+ let mut success_count = 0;
1148
+ while let Some(task_result) = tasks.join_next().await {
1149
+ if let Ok(Ok(_)) = task_result {
1150
+ success_count += 1;
1151
+ }
1152
+ }
1153
+
1154
+ assert_eq!(success_count, 30);
1155
+ }
1156
+ }