kreuzberg 4.0.0.pre.rc.29 → 4.0.0.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (321) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +0 -6
  3. data/.rubocop.yaml +534 -1
  4. data/Gemfile +2 -1
  5. data/Gemfile.lock +28 -116
  6. data/README.md +269 -629
  7. data/Rakefile +0 -9
  8. data/Steepfile +4 -8
  9. data/examples/async_patterns.rb +58 -1
  10. data/ext/kreuzberg_rb/extconf.rb +5 -35
  11. data/ext/kreuzberg_rb/native/Cargo.toml +16 -55
  12. data/ext/kreuzberg_rb/native/build.rs +14 -12
  13. data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
  14. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
  15. data/ext/kreuzberg_rb/native/include/strings.h +2 -2
  16. data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
  17. data/ext/kreuzberg_rb/native/src/lib.rs +34 -897
  18. data/extconf.rb +6 -38
  19. data/kreuzberg.gemspec +20 -114
  20. data/lib/kreuzberg/api_proxy.rb +18 -2
  21. data/lib/kreuzberg/cache_api.rb +0 -22
  22. data/lib/kreuzberg/cli.rb +10 -2
  23. data/lib/kreuzberg/cli_proxy.rb +10 -0
  24. data/lib/kreuzberg/config.rb +22 -274
  25. data/lib/kreuzberg/errors.rb +7 -73
  26. data/lib/kreuzberg/extraction_api.rb +8 -237
  27. data/lib/kreuzberg/mcp_proxy.rb +11 -2
  28. data/lib/kreuzberg/ocr_backend_protocol.rb +73 -0
  29. data/lib/kreuzberg/post_processor_protocol.rb +71 -0
  30. data/lib/kreuzberg/result.rb +33 -151
  31. data/lib/kreuzberg/setup_lib_path.rb +2 -22
  32. data/lib/kreuzberg/validator_protocol.rb +73 -0
  33. data/lib/kreuzberg/version.rb +1 -1
  34. data/lib/kreuzberg.rb +13 -27
  35. data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
  36. data/sig/kreuzberg.rbs +12 -105
  37. data/spec/binding/cache_spec.rb +22 -22
  38. data/spec/binding/cli_proxy_spec.rb +4 -2
  39. data/spec/binding/cli_spec.rb +11 -12
  40. data/spec/binding/config_spec.rb +0 -74
  41. data/spec/binding/config_validation_spec.rb +6 -100
  42. data/spec/binding/error_handling_spec.rb +97 -283
  43. data/spec/binding/plugins/ocr_backend_spec.rb +8 -8
  44. data/spec/binding/plugins/postprocessor_spec.rb +11 -11
  45. data/spec/binding/plugins/validator_spec.rb +13 -12
  46. data/spec/examples.txt +104 -0
  47. data/spec/fixtures/config.toml +1 -0
  48. data/spec/fixtures/config.yaml +1 -0
  49. data/spec/fixtures/invalid_config.toml +1 -0
  50. data/spec/smoke/package_spec.rb +3 -2
  51. data/spec/spec_helper.rb +3 -1
  52. data/vendor/kreuzberg/Cargo.toml +67 -192
  53. data/vendor/kreuzberg/README.md +9 -97
  54. data/vendor/kreuzberg/build.rs +194 -516
  55. data/vendor/kreuzberg/src/api/handlers.rs +9 -130
  56. data/vendor/kreuzberg/src/api/mod.rs +3 -18
  57. data/vendor/kreuzberg/src/api/server.rs +71 -236
  58. data/vendor/kreuzberg/src/api/types.rs +7 -43
  59. data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
  60. data/vendor/kreuzberg/src/cache/mod.rs +3 -27
  61. data/vendor/kreuzberg/src/chunking/mod.rs +79 -1705
  62. data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
  63. data/vendor/kreuzberg/src/core/config.rs +23 -905
  64. data/vendor/kreuzberg/src/core/extractor.rs +106 -403
  65. data/vendor/kreuzberg/src/core/io.rs +2 -4
  66. data/vendor/kreuzberg/src/core/mime.rs +12 -2
  67. data/vendor/kreuzberg/src/core/mod.rs +3 -22
  68. data/vendor/kreuzberg/src/core/pipeline.rs +78 -395
  69. data/vendor/kreuzberg/src/embeddings.rs +21 -169
  70. data/vendor/kreuzberg/src/error.rs +2 -2
  71. data/vendor/kreuzberg/src/extraction/archive.rs +31 -36
  72. data/vendor/kreuzberg/src/extraction/docx.rs +1 -365
  73. data/vendor/kreuzberg/src/extraction/email.rs +11 -12
  74. data/vendor/kreuzberg/src/extraction/excel.rs +129 -138
  75. data/vendor/kreuzberg/src/extraction/html.rs +170 -1447
  76. data/vendor/kreuzberg/src/extraction/image.rs +14 -138
  77. data/vendor/kreuzberg/src/extraction/libreoffice.rs +3 -13
  78. data/vendor/kreuzberg/src/extraction/mod.rs +5 -21
  79. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
  80. data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
  81. data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
  82. data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
  83. data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
  84. data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
  85. data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
  86. data/vendor/kreuzberg/src/extraction/pptx.rs +94 -196
  87. data/vendor/kreuzberg/src/extraction/structured.rs +4 -5
  88. data/vendor/kreuzberg/src/extraction/table.rs +1 -2
  89. data/vendor/kreuzberg/src/extraction/text.rs +10 -18
  90. data/vendor/kreuzberg/src/extractors/archive.rs +0 -22
  91. data/vendor/kreuzberg/src/extractors/docx.rs +148 -69
  92. data/vendor/kreuzberg/src/extractors/email.rs +9 -37
  93. data/vendor/kreuzberg/src/extractors/excel.rs +40 -81
  94. data/vendor/kreuzberg/src/extractors/html.rs +173 -182
  95. data/vendor/kreuzberg/src/extractors/image.rs +8 -32
  96. data/vendor/kreuzberg/src/extractors/mod.rs +10 -171
  97. data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
  98. data/vendor/kreuzberg/src/extractors/pdf.rs +64 -329
  99. data/vendor/kreuzberg/src/extractors/pptx.rs +34 -79
  100. data/vendor/kreuzberg/src/extractors/structured.rs +0 -16
  101. data/vendor/kreuzberg/src/extractors/text.rs +7 -30
  102. data/vendor/kreuzberg/src/extractors/xml.rs +8 -27
  103. data/vendor/kreuzberg/src/keywords/processor.rs +1 -9
  104. data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
  105. data/vendor/kreuzberg/src/language_detection/mod.rs +51 -94
  106. data/vendor/kreuzberg/src/lib.rs +5 -17
  107. data/vendor/kreuzberg/src/mcp/mod.rs +1 -4
  108. data/vendor/kreuzberg/src/mcp/server.rs +21 -145
  109. data/vendor/kreuzberg/src/ocr/mod.rs +0 -2
  110. data/vendor/kreuzberg/src/ocr/processor.rs +8 -19
  111. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +0 -2
  112. data/vendor/kreuzberg/src/pdf/error.rs +1 -93
  113. data/vendor/kreuzberg/src/pdf/metadata.rs +100 -263
  114. data/vendor/kreuzberg/src/pdf/mod.rs +2 -33
  115. data/vendor/kreuzberg/src/pdf/rendering.rs +12 -12
  116. data/vendor/kreuzberg/src/pdf/table.rs +64 -61
  117. data/vendor/kreuzberg/src/pdf/text.rs +24 -416
  118. data/vendor/kreuzberg/src/plugins/extractor.rs +8 -40
  119. data/vendor/kreuzberg/src/plugins/mod.rs +0 -3
  120. data/vendor/kreuzberg/src/plugins/ocr.rs +14 -22
  121. data/vendor/kreuzberg/src/plugins/processor.rs +1 -10
  122. data/vendor/kreuzberg/src/plugins/registry.rs +0 -15
  123. data/vendor/kreuzberg/src/plugins/validator.rs +8 -20
  124. data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
  125. data/vendor/kreuzberg/src/text/mod.rs +0 -8
  126. data/vendor/kreuzberg/src/text/quality.rs +15 -28
  127. data/vendor/kreuzberg/src/text/string_utils.rs +10 -22
  128. data/vendor/kreuzberg/src/text/token_reduction/core.rs +50 -86
  129. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +16 -37
  130. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +1 -2
  131. data/vendor/kreuzberg/src/types.rs +67 -907
  132. data/vendor/kreuzberg/src/utils/mod.rs +0 -14
  133. data/vendor/kreuzberg/src/utils/quality.rs +3 -12
  134. data/vendor/kreuzberg/tests/api_tests.rs +0 -506
  135. data/vendor/kreuzberg/tests/archive_integration.rs +0 -2
  136. data/vendor/kreuzberg/tests/batch_orchestration.rs +12 -57
  137. data/vendor/kreuzberg/tests/batch_processing.rs +8 -32
  138. data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
  139. data/vendor/kreuzberg/tests/concurrency_stress.rs +8 -40
  140. data/vendor/kreuzberg/tests/config_features.rs +1 -33
  141. data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -16
  142. data/vendor/kreuzberg/tests/core_integration.rs +9 -35
  143. data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
  144. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
  145. data/vendor/kreuzberg/tests/email_integration.rs +1 -3
  146. data/vendor/kreuzberg/tests/error_handling.rs +34 -43
  147. data/vendor/kreuzberg/tests/format_integration.rs +1 -7
  148. data/vendor/kreuzberg/tests/helpers/mod.rs +0 -60
  149. data/vendor/kreuzberg/tests/image_integration.rs +0 -2
  150. data/vendor/kreuzberg/tests/mime_detection.rs +16 -17
  151. data/vendor/kreuzberg/tests/ocr_configuration.rs +0 -4
  152. data/vendor/kreuzberg/tests/ocr_errors.rs +0 -22
  153. data/vendor/kreuzberg/tests/ocr_quality.rs +0 -2
  154. data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
  155. data/vendor/kreuzberg/tests/pdf_integration.rs +0 -2
  156. data/vendor/kreuzberg/tests/pipeline_integration.rs +2 -36
  157. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +0 -5
  158. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -17
  159. data/vendor/kreuzberg/tests/plugin_system.rs +0 -6
  160. data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -2
  161. data/vendor/kreuzberg/tests/security_validation.rs +1 -13
  162. data/vendor/kreuzberg/tests/test_fastembed.rs +23 -45
  163. metadata +25 -171
  164. data/.rubocop.yml +0 -543
  165. data/ext/kreuzberg_rb/native/.cargo/config.toml +0 -23
  166. data/ext/kreuzberg_rb/native/Cargo.lock +0 -7619
  167. data/lib/kreuzberg/error_context.rb +0 -136
  168. data/lib/kreuzberg/types.rb +0 -170
  169. data/lib/libpdfium.so +0 -0
  170. data/spec/binding/async_operations_spec.rb +0 -473
  171. data/spec/binding/batch_operations_spec.rb +0 -595
  172. data/spec/binding/batch_spec.rb +0 -359
  173. data/spec/binding/config_result_spec.rb +0 -377
  174. data/spec/binding/embeddings_spec.rb +0 -816
  175. data/spec/binding/error_recovery_spec.rb +0 -488
  176. data/spec/binding/font_config_spec.rb +0 -220
  177. data/spec/binding/images_spec.rb +0 -738
  178. data/spec/binding/keywords_extraction_spec.rb +0 -600
  179. data/spec/binding/metadata_types_spec.rb +0 -1228
  180. data/spec/binding/pages_extraction_spec.rb +0 -471
  181. data/spec/binding/tables_spec.rb +0 -641
  182. data/spec/unit/config/chunking_config_spec.rb +0 -213
  183. data/spec/unit/config/embedding_config_spec.rb +0 -343
  184. data/spec/unit/config/extraction_config_spec.rb +0 -438
  185. data/spec/unit/config/font_config_spec.rb +0 -285
  186. data/spec/unit/config/hierarchy_config_spec.rb +0 -314
  187. data/spec/unit/config/image_extraction_config_spec.rb +0 -209
  188. data/spec/unit/config/image_preprocessing_config_spec.rb +0 -249
  189. data/spec/unit/config/keyword_config_spec.rb +0 -229
  190. data/spec/unit/config/language_detection_config_spec.rb +0 -258
  191. data/spec/unit/config/ocr_config_spec.rb +0 -171
  192. data/spec/unit/config/page_config_spec.rb +0 -221
  193. data/spec/unit/config/pdf_config_spec.rb +0 -267
  194. data/spec/unit/config/postprocessor_config_spec.rb +0 -290
  195. data/spec/unit/config/tesseract_config_spec.rb +0 -181
  196. data/spec/unit/config/token_reduction_config_spec.rb +0 -251
  197. data/test/metadata_types_test.rb +0 -959
  198. data/vendor/Cargo.toml +0 -61
  199. data/vendor/kreuzberg/examples/bench_fixes.rs +0 -71
  200. data/vendor/kreuzberg/examples/test_pdfium_fork.rs +0 -62
  201. data/vendor/kreuzberg/src/chunking/processor.rs +0 -219
  202. data/vendor/kreuzberg/src/core/batch_optimizations.rs +0 -385
  203. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  204. data/vendor/kreuzberg/src/core/formats.rs +0 -235
  205. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  206. data/vendor/kreuzberg/src/extraction/capacity.rs +0 -263
  207. data/vendor/kreuzberg/src/extraction/markdown.rs +0 -216
  208. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -284
  209. data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -470
  210. data/vendor/kreuzberg/src/extractors/docbook.rs +0 -504
  211. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  212. data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -492
  213. data/vendor/kreuzberg/src/extractors/jats.rs +0 -1054
  214. data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -368
  215. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  216. data/vendor/kreuzberg/src/extractors/markdown.rs +0 -701
  217. data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
  218. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  219. data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -529
  220. data/vendor/kreuzberg/src/extractors/rst.rs +0 -577
  221. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  222. data/vendor/kreuzberg/src/extractors/security.rs +0 -484
  223. data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
  224. data/vendor/kreuzberg/src/extractors/typst.rs +0 -651
  225. data/vendor/kreuzberg/src/language_detection/processor.rs +0 -218
  226. data/vendor/kreuzberg/src/ocr/language_registry.rs +0 -520
  227. data/vendor/kreuzberg/src/panic_context.rs +0 -154
  228. data/vendor/kreuzberg/src/pdf/bindings.rs +0 -306
  229. data/vendor/kreuzberg/src/pdf/bundled.rs +0 -408
  230. data/vendor/kreuzberg/src/pdf/fonts.rs +0 -358
  231. data/vendor/kreuzberg/src/pdf/hierarchy.rs +0 -903
  232. data/vendor/kreuzberg/src/text/quality_processor.rs +0 -231
  233. data/vendor/kreuzberg/src/text/utf8_validation.rs +0 -193
  234. data/vendor/kreuzberg/src/utils/pool.rs +0 -503
  235. data/vendor/kreuzberg/src/utils/pool_sizing.rs +0 -364
  236. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -761
  237. data/vendor/kreuzberg/tests/api_embed.rs +0 -360
  238. data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
  239. data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +0 -471
  240. data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +0 -289
  241. data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +0 -154
  242. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
  243. data/vendor/kreuzberg/tests/config_integration_test.rs +0 -753
  244. data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +0 -294
  245. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -500
  246. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
  247. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
  248. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
  249. data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
  250. data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
  251. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
  252. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
  253. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
  254. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
  255. data/vendor/kreuzberg/tests/ocr_language_registry.rs +0 -191
  256. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -674
  257. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
  258. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
  259. data/vendor/kreuzberg/tests/page_markers.rs +0 -297
  260. data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +0 -301
  261. data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +0 -589
  262. data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +0 -301
  263. data/vendor/kreuzberg/tests/pdf_text_merging.rs +0 -475
  264. data/vendor/kreuzberg/tests/pdfium_linking.rs +0 -340
  265. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -694
  266. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -775
  267. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1260
  268. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -648
  269. data/vendor/kreuzberg-ffi/Cargo.toml +0 -67
  270. data/vendor/kreuzberg-ffi/README.md +0 -851
  271. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +0 -227
  272. data/vendor/kreuzberg-ffi/build.rs +0 -168
  273. data/vendor/kreuzberg-ffi/cbindgen.toml +0 -37
  274. data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +0 -12
  275. data/vendor/kreuzberg-ffi/kreuzberg.h +0 -3012
  276. data/vendor/kreuzberg-ffi/src/batch_streaming.rs +0 -588
  277. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
  278. data/vendor/kreuzberg-ffi/src/error.rs +0 -901
  279. data/vendor/kreuzberg-ffi/src/extraction.rs +0 -555
  280. data/vendor/kreuzberg-ffi/src/helpers.rs +0 -879
  281. data/vendor/kreuzberg-ffi/src/lib.rs +0 -977
  282. data/vendor/kreuzberg-ffi/src/memory.rs +0 -493
  283. data/vendor/kreuzberg-ffi/src/mime.rs +0 -329
  284. data/vendor/kreuzberg-ffi/src/panic_shield.rs +0 -265
  285. data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +0 -442
  286. data/vendor/kreuzberg-ffi/src/plugins/mod.rs +0 -14
  287. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +0 -628
  288. data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +0 -438
  289. data/vendor/kreuzberg-ffi/src/plugins/validator.rs +0 -329
  290. data/vendor/kreuzberg-ffi/src/result.rs +0 -510
  291. data/vendor/kreuzberg-ffi/src/result_pool.rs +0 -639
  292. data/vendor/kreuzberg-ffi/src/result_view.rs +0 -773
  293. data/vendor/kreuzberg-ffi/src/string_intern.rs +0 -568
  294. data/vendor/kreuzberg-ffi/src/types.rs +0 -363
  295. data/vendor/kreuzberg-ffi/src/util.rs +0 -210
  296. data/vendor/kreuzberg-ffi/src/validation.rs +0 -848
  297. data/vendor/kreuzberg-ffi/tests.disabled/README.md +0 -48
  298. data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +0 -299
  299. data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +0 -346
  300. data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +0 -232
  301. data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +0 -470
  302. data/vendor/kreuzberg-tesseract/.commitlintrc.json +0 -13
  303. data/vendor/kreuzberg-tesseract/.crate-ignore +0 -2
  304. data/vendor/kreuzberg-tesseract/Cargo.lock +0 -2933
  305. data/vendor/kreuzberg-tesseract/Cargo.toml +0 -57
  306. data/vendor/kreuzberg-tesseract/LICENSE +0 -22
  307. data/vendor/kreuzberg-tesseract/README.md +0 -399
  308. data/vendor/kreuzberg-tesseract/build.rs +0 -1127
  309. data/vendor/kreuzberg-tesseract/patches/README.md +0 -71
  310. data/vendor/kreuzberg-tesseract/patches/tesseract.diff +0 -199
  311. data/vendor/kreuzberg-tesseract/src/api.rs +0 -1371
  312. data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +0 -77
  313. data/vendor/kreuzberg-tesseract/src/enums.rs +0 -297
  314. data/vendor/kreuzberg-tesseract/src/error.rs +0 -81
  315. data/vendor/kreuzberg-tesseract/src/lib.rs +0 -145
  316. data/vendor/kreuzberg-tesseract/src/monitor.rs +0 -57
  317. data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +0 -197
  318. data/vendor/kreuzberg-tesseract/src/page_iterator.rs +0 -253
  319. data/vendor/kreuzberg-tesseract/src/result_iterator.rs +0 -286
  320. data/vendor/kreuzberg-tesseract/src/result_renderer.rs +0 -183
  321. data/vendor/kreuzberg-tesseract/tests/integration_test.rs +0 -211
@@ -31,7 +31,6 @@
31
31
  //! # }
32
32
  //! ```
33
33
  use crate::error::{KreuzbergError, Result};
34
- use crate::text::utf8_validation;
35
34
  use serde::{Deserialize, Serialize};
36
35
  use std::collections::HashMap;
37
36
 
@@ -240,8 +239,8 @@ fn is_text_field(key: &str, custom_patterns: &[String]) -> bool {
240
239
  }
241
240
 
242
241
  pub fn parse_yaml(data: &[u8]) -> Result<StructuredDataResult> {
243
- let yaml_str = utf8_validation::from_utf8(data)
244
- .map_err(|e| KreuzbergError::parsing(format!("Invalid UTF-8 in YAML: {}", e)))?;
242
+ let yaml_str =
243
+ std::str::from_utf8(data).map_err(|e| KreuzbergError::parsing(format!("Invalid UTF-8 in YAML: {}", e)))?;
245
244
 
246
245
  let value: serde_json::Value = serde_yaml_ng::from_str(yaml_str)
247
246
  .map_err(|e| KreuzbergError::parsing(format!("Failed to parse YAML: {}", e)))?;
@@ -312,8 +311,8 @@ fn extract_from_value(
312
311
  }
313
312
 
314
313
  pub fn parse_toml(data: &[u8]) -> Result<StructuredDataResult> {
315
- let toml_str = utf8_validation::from_utf8(data)
316
- .map_err(|e| KreuzbergError::parsing(format!("Invalid UTF-8 in TOML: {}", e)))?;
314
+ let toml_str =
315
+ std::str::from_utf8(data).map_err(|e| KreuzbergError::parsing(format!("Invalid UTF-8 in TOML: {}", e)))?;
317
316
 
318
317
  let value: toml::Value =
319
318
  toml::from_str(toml_str).map_err(|e| KreuzbergError::parsing(format!("Failed to parse TOML: {}", e)))?;
@@ -60,8 +60,7 @@ fn dataframe_to_markdown(df: &DataFrame) -> Result<String> {
60
60
  return Ok(String::new());
61
61
  }
62
62
 
63
- let estimated_capacity = df.height().saturating_mul(df.width()).saturating_mul(12).max(64);
64
- let mut markdown = String::with_capacity(estimated_capacity);
63
+ let mut markdown = String::new();
65
64
 
66
65
  markdown.push_str("| ");
67
66
  for col_name in df.get_column_names() {
@@ -26,7 +26,6 @@ use once_cell::sync::Lazy;
26
26
  use regex::Regex;
27
27
 
28
28
  use crate::error::Result;
29
- use crate::text::utf8_validation;
30
29
  use crate::types::TextExtractionResult;
31
30
 
32
31
  static MARKDOWN_HEADER: Lazy<Regex> =
@@ -39,25 +38,18 @@ static CODE_BLOCK_DELIMITER: Lazy<Regex> = Lazy::new(|| {
39
38
  });
40
39
 
41
40
  pub fn parse_text(text_bytes: &[u8], is_markdown: bool) -> Result<TextExtractionResult> {
42
- let text: std::borrow::Cow<'_, str> = match utf8_validation::from_utf8(text_bytes) {
43
- Ok(s) => std::borrow::Cow::Borrowed(s),
44
- Err(_) => std::borrow::Cow::Owned(String::from_utf8_lossy(text_bytes).into_owned()),
45
- };
41
+ let text = String::from_utf8_lossy(text_bytes).into_owned();
46
42
 
47
43
  let mut line_count = 0;
48
44
  let mut word_count = 0;
49
45
  let character_count = text.len();
50
46
 
51
- let estimated_headers_capacity = text.len().saturating_div(20).max(16);
52
- let estimated_links_capacity = text.lines().count().saturating_div(20).max(4);
53
- let estimated_code_blocks_capacity = 8;
54
-
55
- let mut headers = Vec::with_capacity(estimated_headers_capacity);
56
- let mut links = Vec::with_capacity(estimated_links_capacity);
57
- let mut code_blocks = Vec::with_capacity(estimated_code_blocks_capacity);
47
+ let mut headers = Vec::new();
48
+ let mut links = Vec::new();
49
+ let mut code_blocks = Vec::new();
58
50
  let mut in_code_block = false;
59
- let mut current_code_lang = String::with_capacity(16);
60
- let mut current_code = String::with_capacity(128);
51
+ let mut current_code_lang = String::new();
52
+ let mut current_code = String::new();
61
53
 
62
54
  for line in text.lines() {
63
55
  line_count += 1;
@@ -73,7 +65,7 @@ pub fn parse_text(text_bytes: &[u8], is_markdown: bool) -> Result<TextExtraction
73
65
  if current_code_lang.is_empty() {
74
66
  "plain".to_string()
75
67
  } else {
76
- std::mem::take(&mut current_code_lang)
68
+ current_code_lang.clone()
77
69
  },
78
70
  current_code.trim_end().to_string(),
79
71
  ));
@@ -102,14 +94,14 @@ pub fn parse_text(text_bytes: &[u8], is_markdown: bool) -> Result<TextExtraction
102
94
  }
103
95
 
104
96
  for caps in MARKDOWN_LINK.captures_iter(line) {
105
- if let (Some(text_match), Some(url)) = (caps.get(1), caps.get(2)) {
106
- links.push((text_match.as_str().to_string(), url.as_str().to_string()));
97
+ if let (Some(text), Some(url)) = (caps.get(1), caps.get(2)) {
98
+ links.push((text.as_str().to_string(), url.as_str().to_string()));
107
99
  }
108
100
  }
109
101
  }
110
102
 
111
103
  Ok(TextExtractionResult {
112
- content: text.into_owned(),
104
+ content: text,
113
105
  line_count,
114
106
  word_count,
115
107
  character_count,
@@ -77,7 +77,6 @@ fn build_archive_result(
77
77
  detected_languages: None,
78
78
  chunks: None,
79
79
  images: None,
80
- pages: None,
81
80
  }
82
81
  }
83
82
 
@@ -127,13 +126,6 @@ impl Plugin for ZipExtractor {
127
126
 
128
127
  #[async_trait]
129
128
  impl DocumentExtractor for ZipExtractor {
130
- #[cfg_attr(feature = "otel", tracing::instrument(
131
- skip(self, content, _config),
132
- fields(
133
- extractor.name = self.name(),
134
- content.size_bytes = content.len(),
135
- )
136
- ))]
137
129
  async fn extract_bytes(
138
130
  &self,
139
131
  content: &[u8],
@@ -205,13 +197,6 @@ impl Plugin for TarExtractor {
205
197
 
206
198
  #[async_trait]
207
199
  impl DocumentExtractor for TarExtractor {
208
- #[cfg_attr(feature = "otel", tracing::instrument(
209
- skip(self, content, _config),
210
- fields(
211
- extractor.name = self.name(),
212
- content.size_bytes = content.len(),
213
- )
214
- ))]
215
200
  async fn extract_bytes(
216
201
  &self,
217
202
  content: &[u8],
@@ -288,13 +273,6 @@ impl Plugin for SevenZExtractor {
288
273
 
289
274
  #[async_trait]
290
275
  impl DocumentExtractor for SevenZExtractor {
291
- #[cfg_attr(feature = "otel", tracing::instrument(
292
- skip(self, content, _config),
293
- fields(
294
- extractor.name = self.name(),
295
- content.size_bytes = content.len(),
296
- )
297
- ))]
298
276
  async fn extract_bytes(
299
277
  &self,
300
278
  content: &[u8],
@@ -1,14 +1,12 @@
1
- #![cfg(all(feature = "tokio-runtime", feature = "office"))]
2
-
3
1
  //! DOCX extractor using docx-lite for high-performance text extraction.
4
2
  //!
5
3
  //! Supports: Microsoft Word (.docx)
6
4
 
7
5
  use crate::Result;
8
6
  use crate::core::config::ExtractionConfig;
9
- use crate::extraction::{cells_to_markdown, office_metadata};
7
+ use crate::extraction::office_metadata;
10
8
  use crate::plugins::{DocumentExtractor, Plugin};
11
- use crate::types::{ExtractionResult, Metadata, PageBoundary, PageInfo, PageStructure, PageUnitType, Table};
9
+ use crate::types::{ExtractionResult, Metadata, Table};
12
10
  use async_trait::async_trait;
13
11
  use std::io::Cursor;
14
12
 
@@ -17,6 +15,7 @@ use std::io::Cursor;
17
15
  /// This extractor provides:
18
16
  /// - Fast text extraction via streaming XML parsing (~160 MB/s average)
19
17
  /// - Comprehensive metadata extraction (core.xml, app.xml, custom.xml)
18
+ /// - ~400x faster than Pandoc subprocess approach
20
19
  pub struct DocxExtractor;
21
20
 
22
21
  impl DocxExtractor {
@@ -67,6 +66,7 @@ impl Plugin for DocxExtractor {
67
66
  /// # Returns
68
67
  /// * `Table` - Converted table with cells and markdown representation
69
68
  fn convert_docx_table_to_table(docx_table: &docx_lite::Table, table_index: usize) -> Table {
69
+ // Extract cells as 2D vector
70
70
  let cells: Vec<Vec<String>> = docx_table
71
71
  .rows
72
72
  .iter()
@@ -74,6 +74,7 @@ fn convert_docx_table_to_table(docx_table: &docx_lite::Table, table_index: usize
74
74
  row.cells
75
75
  .iter()
76
76
  .map(|cell| {
77
+ // Extract text from all paragraphs in the cell
77
78
  cell.paragraphs
78
79
  .iter()
79
80
  .map(|para| para.to_text())
@@ -86,12 +87,13 @@ fn convert_docx_table_to_table(docx_table: &docx_lite::Table, table_index: usize
86
87
  })
87
88
  .collect();
88
89
 
90
+ // Generate markdown representation
89
91
  let markdown = cells_to_markdown(&cells);
90
92
 
91
93
  Table {
92
94
  cells,
93
95
  markdown,
94
- page_number: table_index + 1,
96
+ page_number: table_index + 1, // 1-indexed
95
97
  }
96
98
  }
97
99
 
@@ -102,55 +104,103 @@ fn convert_docx_table_to_table(docx_table: &docx_lite::Table, table_index: usize
102
104
  ///
103
105
  /// # Returns
104
106
  /// * `String` - Markdown formatted table
107
+ fn cells_to_markdown(cells: &[Vec<String>]) -> String {
108
+ if cells.is_empty() {
109
+ return String::new();
110
+ }
111
+
112
+ let mut markdown = String::new();
113
+
114
+ // Determine number of columns from first row
115
+ let num_cols = cells.first().map(|r| r.len()).unwrap_or(0);
116
+ if num_cols == 0 {
117
+ return String::new();
118
+ }
119
+
120
+ // Header row (first row)
121
+ if let Some(header) = cells.first() {
122
+ markdown.push_str("| ");
123
+ for cell in header {
124
+ // Escape pipe characters in cell content
125
+ let escaped = cell.replace('|', "\\|");
126
+ markdown.push_str(&escaped);
127
+ markdown.push_str(" | ");
128
+ }
129
+ markdown.push('\n');
130
+
131
+ // Separator row
132
+ markdown.push('|');
133
+ for _ in 0..num_cols {
134
+ markdown.push_str("------|");
135
+ }
136
+ markdown.push('\n');
137
+ }
138
+
139
+ // Data rows (skip first row as it's the header)
140
+ for row in cells.iter().skip(1) {
141
+ markdown.push_str("| ");
142
+ for (idx, cell) in row.iter().enumerate() {
143
+ if idx >= num_cols {
144
+ break; // Handle irregular tables
145
+ }
146
+ // Escape pipe characters in cell content
147
+ let escaped = cell.replace('|', "\\|");
148
+ markdown.push_str(&escaped);
149
+ markdown.push_str(" | ");
150
+ }
151
+ // Pad with empty cells if row is shorter than expected
152
+ for _ in row.len()..num_cols {
153
+ markdown.push_str(" | ");
154
+ }
155
+ markdown.push('\n');
156
+ }
157
+
158
+ markdown
159
+ }
105
160
 
106
161
  #[async_trait]
107
162
  impl DocumentExtractor for DocxExtractor {
108
- #[cfg_attr(feature = "otel", tracing::instrument(
109
- skip(self, content, _config),
110
- fields(
111
- extractor.name = self.name(),
112
- content.size_bytes = content.len(),
113
- )
114
- ))]
115
163
  async fn extract_bytes(
116
164
  &self,
117
165
  content: &[u8],
118
166
  mime_type: &str,
119
167
  _config: &ExtractionConfig,
120
168
  ) -> Result<ExtractionResult> {
121
- let (text, tables, page_boundaries) = if crate::core::batch_mode::is_batch_mode() {
169
+ // Parse the DOCX document to extract both text and tables
170
+ let (text, tables) = if crate::core::batch_mode::is_batch_mode() {
171
+ // Batch mode: Use spawn_blocking for parallelism
122
172
  let content_owned = content.to_vec();
123
- let span = tracing::Span::current();
124
- tokio::task::spawn_blocking(
125
- move || -> crate::error::Result<(String, Vec<Table>, Option<Vec<PageBoundary>>)> {
126
- let _guard = span.entered();
127
- let cursor = Cursor::new(&content_owned);
128
- let doc = docx_lite::parse_document(cursor)
129
- .map_err(|e| crate::error::KreuzbergError::parsing(format!("DOCX parsing failed: {}", e)))?;
130
-
131
- let text = doc.extract_text();
132
-
133
- let tables: Vec<Table> = doc
134
- .tables
135
- .iter()
136
- .enumerate()
137
- .map(|(idx, table)| convert_docx_table_to_table(table, idx))
138
- .collect();
139
-
140
- let page_boundaries = crate::extraction::docx::detect_page_breaks_from_docx(&content_owned)?;
141
-
142
- Ok((text, tables, page_boundaries))
143
- },
144
- )
173
+ tokio::task::spawn_blocking(move || -> crate::error::Result<(String, Vec<Table>)> {
174
+ // Parse document structure
175
+ let cursor = Cursor::new(&content_owned);
176
+ let doc = docx_lite::parse_document(cursor)
177
+ .map_err(|e| crate::error::KreuzbergError::parsing(format!("DOCX parsing failed: {}", e)))?;
178
+
179
+ // Extract text
180
+ let text = doc.extract_text();
181
+
182
+ // Extract tables
183
+ let tables: Vec<Table> = doc
184
+ .tables
185
+ .iter()
186
+ .enumerate()
187
+ .map(|(idx, table)| convert_docx_table_to_table(table, idx))
188
+ .collect();
189
+
190
+ Ok((text, tables))
191
+ })
145
192
  .await
146
193
  .map_err(|e| crate::error::KreuzbergError::parsing(format!("DOCX extraction task failed: {}", e)))??
147
194
  } else {
195
+ // Single-file mode: Direct extraction (no spawn overhead)
148
196
  let cursor = Cursor::new(content);
149
197
  let doc = docx_lite::parse_document(cursor)
150
198
  .map_err(|e| crate::error::KreuzbergError::parsing(format!("DOCX parsing failed: {}", e)))?;
151
199
 
200
+ // Extract text
152
201
  let text = doc.extract_text();
153
202
 
203
+ // Extract tables
154
204
  let tables: Vec<Table> = doc
155
205
  .tables
156
206
  .iter()
@@ -158,16 +208,14 @@ impl DocumentExtractor for DocxExtractor {
158
208
  .map(|(idx, table)| convert_docx_table_to_table(table, idx))
159
209
  .collect();
160
210
 
161
- let page_boundaries = crate::extraction::docx::detect_page_breaks_from_docx(content)?;
162
-
163
- (text, tables, page_boundaries)
211
+ (text, tables)
164
212
  };
165
213
 
214
+ // Extract metadata using existing office_metadata module
166
215
  let mut archive = if crate::core::batch_mode::is_batch_mode() {
216
+ // Batch mode: Use spawn_blocking for parallelism
167
217
  let content_owned = content.to_vec();
168
- let span = tracing::Span::current();
169
218
  tokio::task::spawn_blocking(move || -> crate::error::Result<_> {
170
- let _guard = span.entered();
171
219
  let cursor = Cursor::new(content_owned);
172
220
  zip::ZipArchive::new(cursor)
173
221
  .map_err(|e| crate::error::KreuzbergError::parsing(format!("Failed to open ZIP archive: {}", e)))
@@ -175,6 +223,8 @@ impl DocumentExtractor for DocxExtractor {
175
223
  .await
176
224
  .map_err(|e| crate::error::KreuzbergError::parsing(format!("Task join error: {}", e)))??
177
225
  } else {
226
+ // Single-file mode: Direct extraction (no spawn overhead)
227
+ // Note: We still need to clone for ZipArchive type consistency with batch mode
178
228
  let content_owned = content.to_vec();
179
229
  let cursor = Cursor::new(content_owned);
180
230
  zip::ZipArchive::new(cursor)
@@ -183,6 +233,7 @@ impl DocumentExtractor for DocxExtractor {
183
233
 
184
234
  let mut metadata_map = std::collections::HashMap::new();
185
235
 
236
+ // Extract core properties (title, creator, dates, keywords, etc.)
186
237
  if let Ok(core) = office_metadata::extract_core_properties(&mut archive) {
187
238
  if let Some(title) = core.title {
188
239
  metadata_map.insert("title".to_string(), serde_json::Value::String(title));
@@ -226,6 +277,7 @@ impl DocumentExtractor for DocxExtractor {
226
277
  }
227
278
  }
228
279
 
280
+ // Extract app properties (page count, word count, etc.)
229
281
  if let Ok(app) = office_metadata::extract_docx_app_properties(&mut archive) {
230
282
  if let Some(pages) = app.pages {
231
283
  metadata_map.insert("page_count".to_string(), serde_json::Value::Number(pages.into()));
@@ -262,48 +314,24 @@ impl DocumentExtractor for DocxExtractor {
262
314
  }
263
315
  }
264
316
 
317
+ // Extract custom properties
265
318
  if let Ok(custom) = office_metadata::extract_custom_properties(&mut archive) {
266
319
  for (key, value) in custom {
267
320
  metadata_map.insert(format!("custom_{}", key), value);
268
321
  }
269
322
  }
270
323
 
271
- let page_structure = if let Some(boundaries) = page_boundaries {
272
- let total_count = boundaries.len();
273
- Some(PageStructure {
274
- total_count,
275
- unit_type: PageUnitType::Page,
276
- boundaries: Some(boundaries),
277
- pages: Some(
278
- (1..=total_count)
279
- .map(|page_num| PageInfo {
280
- number: page_num,
281
- title: None,
282
- dimensions: None,
283
- image_count: None,
284
- table_count: None,
285
- hidden: None,
286
- })
287
- .collect(),
288
- ),
289
- })
290
- } else {
291
- None
292
- };
293
-
294
324
  Ok(ExtractionResult {
295
325
  content: text,
296
326
  mime_type: mime_type.to_string(),
297
327
  metadata: Metadata {
298
- pages: page_structure,
299
328
  additional: metadata_map,
300
329
  ..Default::default()
301
330
  },
302
- pages: None,
303
331
  tables,
304
332
  detected_languages: None,
305
333
  chunks: None,
306
- images: Some(vec![]),
334
+ images: None,
307
335
  })
308
336
  }
309
337
 
@@ -312,7 +340,7 @@ impl DocumentExtractor for DocxExtractor {
312
340
  }
313
341
 
314
342
  fn priority(&self) -> i32 {
315
- 50
343
+ 50 // Higher priority than Pandoc (40) to take precedence
316
344
  }
317
345
  }
318
346
 
@@ -352,12 +380,61 @@ mod tests {
352
380
  assert!(extractor.shutdown().is_ok());
353
381
  }
354
382
 
383
+ #[test]
384
+ fn test_cells_to_markdown_basic_table() {
385
+ let cells = vec![
386
+ vec!["Header1".to_string(), "Header2".to_string()],
387
+ vec!["Row1Col1".to_string(), "Row1Col2".to_string()],
388
+ vec!["Row2Col1".to_string(), "Row2Col2".to_string()],
389
+ ];
390
+
391
+ let markdown = cells_to_markdown(&cells);
392
+
393
+ assert!(markdown.contains("| Header1 | Header2 |"));
394
+ assert!(markdown.contains("|------|------|"));
395
+ assert!(markdown.contains("| Row1Col1 | Row1Col2 |"));
396
+ assert!(markdown.contains("| Row2Col1 | Row2Col2 |"));
397
+ }
398
+
399
+ #[test]
400
+ fn test_cells_to_markdown_empty() {
401
+ let cells: Vec<Vec<String>> = vec![];
402
+ let markdown = cells_to_markdown(&cells);
403
+ assert_eq!(markdown, "");
404
+ }
405
+
406
+ #[test]
407
+ fn test_cells_to_markdown_escape_pipes() {
408
+ let cells = vec![vec!["Header".to_string()], vec!["Cell with | pipe".to_string()]];
409
+
410
+ let markdown = cells_to_markdown(&cells);
411
+ assert!(markdown.contains("Cell with \\| pipe"));
412
+ }
413
+
414
+ #[test]
415
+ fn test_cells_to_markdown_irregular_rows() {
416
+ let cells = vec![
417
+ vec!["H1".to_string(), "H2".to_string(), "H3".to_string()],
418
+ vec!["R1C1".to_string(), "R1C2".to_string()], // Missing third column
419
+ vec!["R2C1".to_string(), "R2C2".to_string(), "R2C3".to_string()],
420
+ ];
421
+
422
+ let markdown = cells_to_markdown(&cells);
423
+
424
+ // Should have 3 columns in header
425
+ assert!(markdown.contains("| H1 | H2 | H3 |"));
426
+ // Should pad short rows
427
+ assert!(markdown.contains("| R1C1 | R1C2 | |"));
428
+ }
429
+
355
430
  #[test]
356
431
  fn test_convert_docx_table_to_table() {
357
432
  use docx_lite::{Paragraph, Run, Table as DocxTable, TableCell, TableRow};
358
433
 
434
+ // Create a simple docx-lite table
359
435
  let mut table = DocxTable::new();
360
436
 
437
+ // Header row
361
438
  let mut header_row = TableRow::default();
362
439
  let mut cell1 = TableCell::default();
363
440
  let mut para1 = Paragraph::new();
@@ -373,6 +450,7 @@ mod tests {
373
450
 
374
451
  table.rows.push(header_row);
375
452
 
453
+ // Data row
376
454
  let mut data_row = TableRow::default();
377
455
  let mut cell3 = TableCell::default();
378
456
  let mut para3 = Paragraph::new();
@@ -388,10 +466,11 @@ mod tests {
388
466
 
389
467
  table.rows.push(data_row);
390
468
 
469
+ // Convert to Kreuzberg Table
391
470
  let result = convert_docx_table_to_table(&table, 0);
392
471
 
393
- assert_eq!(result.page_number, 1);
394
- assert_eq!(result.cells.len(), 2);
472
+ assert_eq!(result.page_number, 1); // 0 + 1 = 1 (1-indexed)
473
+ assert_eq!(result.cells.len(), 2); // 2 rows
395
474
  assert_eq!(result.cells[0], vec!["Name", "Age"]);
396
475
  assert_eq!(result.cells[1], vec!["Alice", "30"]);
397
476
  assert!(result.markdown.contains("| Name | Age |"));
@@ -2,11 +2,9 @@
2
2
 
3
3
  use crate::Result;
4
4
  use crate::core::config::ExtractionConfig;
5
- use crate::extractors::SyncExtractor;
6
5
  use crate::plugins::{DocumentExtractor, Plugin};
7
6
  use crate::types::{EmailMetadata, ExtractionResult, Metadata};
8
7
  use async_trait::async_trait;
9
- #[cfg(feature = "tokio-runtime")]
10
8
  use std::path::Path;
11
9
 
12
10
  /// Email message extractor.
@@ -44,8 +42,14 @@ impl Plugin for EmailExtractor {
44
42
  }
45
43
  }
46
44
 
47
- impl SyncExtractor for EmailExtractor {
48
- fn extract_sync(&self, content: &[u8], mime_type: &str, _config: &ExtractionConfig) -> Result<ExtractionResult> {
45
+ #[async_trait]
46
+ impl DocumentExtractor for EmailExtractor {
47
+ async fn extract_bytes(
48
+ &self,
49
+ content: &[u8],
50
+ mime_type: &str,
51
+ _config: &ExtractionConfig,
52
+ ) -> Result<ExtractionResult> {
49
53
  let email_result = crate::extraction::email::extract_email_content(content, mime_type)?;
50
54
 
51
55
  let text = crate::extraction::email::build_email_text_output(&email_result);
@@ -77,7 +81,7 @@ impl SyncExtractor for EmailExtractor {
77
81
  metadata: Metadata {
78
82
  format: Some(crate::types::FormatMetadata::Email(email_metadata)),
79
83
  subject: email_result.subject.clone(),
80
- created_at: email_result.date.clone(),
84
+ date: email_result.date.clone(),
81
85
  additional,
82
86
  ..Default::default()
83
87
  },
@@ -85,37 +89,9 @@ impl SyncExtractor for EmailExtractor {
85
89
  detected_languages: None,
86
90
  chunks: None,
87
91
  images: None,
88
- pages: None,
89
92
  })
90
93
  }
91
- }
92
94
 
93
- #[async_trait]
94
- impl DocumentExtractor for EmailExtractor {
95
- #[cfg_attr(feature = "otel", tracing::instrument(
96
- skip(self, content, config),
97
- fields(
98
- extractor.name = self.name(),
99
- content.size_bytes = content.len(),
100
- )
101
- ))]
102
- async fn extract_bytes(
103
- &self,
104
- content: &[u8],
105
- mime_type: &str,
106
- config: &ExtractionConfig,
107
- ) -> Result<ExtractionResult> {
108
- self.extract_sync(content, mime_type, config)
109
- }
110
-
111
- #[cfg(feature = "tokio-runtime")]
112
- #[cfg_attr(feature = "otel", tracing::instrument(
113
- skip(self, path, config),
114
- fields(
115
- extractor.name = self.name(),
116
- )
117
- ))]
118
- #[cfg(feature = "tokio-runtime")]
119
95
  async fn extract_file(&self, path: &Path, mime_type: &str, config: &ExtractionConfig) -> Result<ExtractionResult> {
120
96
  let bytes = tokio::fs::read(path).await?;
121
97
  self.extract_bytes(&bytes, mime_type, config).await
@@ -128,10 +104,6 @@ impl DocumentExtractor for EmailExtractor {
128
104
  fn priority(&self) -> i32 {
129
105
  50
130
106
  }
131
-
132
- fn as_sync_extractor(&self) -> Option<&dyn crate::extractors::SyncExtractor> {
133
- Some(self)
134
- }
135
107
  }
136
108
 
137
109
  #[cfg(test)]