kreuzberg 4.0.0.pre.rc.29 → 4.0.0.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (321) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +0 -6
  3. data/.rubocop.yaml +534 -1
  4. data/Gemfile +2 -1
  5. data/Gemfile.lock +28 -116
  6. data/README.md +269 -629
  7. data/Rakefile +0 -9
  8. data/Steepfile +4 -8
  9. data/examples/async_patterns.rb +58 -1
  10. data/ext/kreuzberg_rb/extconf.rb +5 -35
  11. data/ext/kreuzberg_rb/native/Cargo.toml +16 -55
  12. data/ext/kreuzberg_rb/native/build.rs +14 -12
  13. data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
  14. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
  15. data/ext/kreuzberg_rb/native/include/strings.h +2 -2
  16. data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
  17. data/ext/kreuzberg_rb/native/src/lib.rs +34 -897
  18. data/extconf.rb +6 -38
  19. data/kreuzberg.gemspec +20 -114
  20. data/lib/kreuzberg/api_proxy.rb +18 -2
  21. data/lib/kreuzberg/cache_api.rb +0 -22
  22. data/lib/kreuzberg/cli.rb +10 -2
  23. data/lib/kreuzberg/cli_proxy.rb +10 -0
  24. data/lib/kreuzberg/config.rb +22 -274
  25. data/lib/kreuzberg/errors.rb +7 -73
  26. data/lib/kreuzberg/extraction_api.rb +8 -237
  27. data/lib/kreuzberg/mcp_proxy.rb +11 -2
  28. data/lib/kreuzberg/ocr_backend_protocol.rb +73 -0
  29. data/lib/kreuzberg/post_processor_protocol.rb +71 -0
  30. data/lib/kreuzberg/result.rb +33 -151
  31. data/lib/kreuzberg/setup_lib_path.rb +2 -22
  32. data/lib/kreuzberg/validator_protocol.rb +73 -0
  33. data/lib/kreuzberg/version.rb +1 -1
  34. data/lib/kreuzberg.rb +13 -27
  35. data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
  36. data/sig/kreuzberg.rbs +12 -105
  37. data/spec/binding/cache_spec.rb +22 -22
  38. data/spec/binding/cli_proxy_spec.rb +4 -2
  39. data/spec/binding/cli_spec.rb +11 -12
  40. data/spec/binding/config_spec.rb +0 -74
  41. data/spec/binding/config_validation_spec.rb +6 -100
  42. data/spec/binding/error_handling_spec.rb +97 -283
  43. data/spec/binding/plugins/ocr_backend_spec.rb +8 -8
  44. data/spec/binding/plugins/postprocessor_spec.rb +11 -11
  45. data/spec/binding/plugins/validator_spec.rb +13 -12
  46. data/spec/examples.txt +104 -0
  47. data/spec/fixtures/config.toml +1 -0
  48. data/spec/fixtures/config.yaml +1 -0
  49. data/spec/fixtures/invalid_config.toml +1 -0
  50. data/spec/smoke/package_spec.rb +3 -2
  51. data/spec/spec_helper.rb +3 -1
  52. data/vendor/kreuzberg/Cargo.toml +67 -192
  53. data/vendor/kreuzberg/README.md +9 -97
  54. data/vendor/kreuzberg/build.rs +194 -516
  55. data/vendor/kreuzberg/src/api/handlers.rs +9 -130
  56. data/vendor/kreuzberg/src/api/mod.rs +3 -18
  57. data/vendor/kreuzberg/src/api/server.rs +71 -236
  58. data/vendor/kreuzberg/src/api/types.rs +7 -43
  59. data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
  60. data/vendor/kreuzberg/src/cache/mod.rs +3 -27
  61. data/vendor/kreuzberg/src/chunking/mod.rs +79 -1705
  62. data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
  63. data/vendor/kreuzberg/src/core/config.rs +23 -905
  64. data/vendor/kreuzberg/src/core/extractor.rs +106 -403
  65. data/vendor/kreuzberg/src/core/io.rs +2 -4
  66. data/vendor/kreuzberg/src/core/mime.rs +12 -2
  67. data/vendor/kreuzberg/src/core/mod.rs +3 -22
  68. data/vendor/kreuzberg/src/core/pipeline.rs +78 -395
  69. data/vendor/kreuzberg/src/embeddings.rs +21 -169
  70. data/vendor/kreuzberg/src/error.rs +2 -2
  71. data/vendor/kreuzberg/src/extraction/archive.rs +31 -36
  72. data/vendor/kreuzberg/src/extraction/docx.rs +1 -365
  73. data/vendor/kreuzberg/src/extraction/email.rs +11 -12
  74. data/vendor/kreuzberg/src/extraction/excel.rs +129 -138
  75. data/vendor/kreuzberg/src/extraction/html.rs +170 -1447
  76. data/vendor/kreuzberg/src/extraction/image.rs +14 -138
  77. data/vendor/kreuzberg/src/extraction/libreoffice.rs +3 -13
  78. data/vendor/kreuzberg/src/extraction/mod.rs +5 -21
  79. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
  80. data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
  81. data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
  82. data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
  83. data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
  84. data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
  85. data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
  86. data/vendor/kreuzberg/src/extraction/pptx.rs +94 -196
  87. data/vendor/kreuzberg/src/extraction/structured.rs +4 -5
  88. data/vendor/kreuzberg/src/extraction/table.rs +1 -2
  89. data/vendor/kreuzberg/src/extraction/text.rs +10 -18
  90. data/vendor/kreuzberg/src/extractors/archive.rs +0 -22
  91. data/vendor/kreuzberg/src/extractors/docx.rs +148 -69
  92. data/vendor/kreuzberg/src/extractors/email.rs +9 -37
  93. data/vendor/kreuzberg/src/extractors/excel.rs +40 -81
  94. data/vendor/kreuzberg/src/extractors/html.rs +173 -182
  95. data/vendor/kreuzberg/src/extractors/image.rs +8 -32
  96. data/vendor/kreuzberg/src/extractors/mod.rs +10 -171
  97. data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
  98. data/vendor/kreuzberg/src/extractors/pdf.rs +64 -329
  99. data/vendor/kreuzberg/src/extractors/pptx.rs +34 -79
  100. data/vendor/kreuzberg/src/extractors/structured.rs +0 -16
  101. data/vendor/kreuzberg/src/extractors/text.rs +7 -30
  102. data/vendor/kreuzberg/src/extractors/xml.rs +8 -27
  103. data/vendor/kreuzberg/src/keywords/processor.rs +1 -9
  104. data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
  105. data/vendor/kreuzberg/src/language_detection/mod.rs +51 -94
  106. data/vendor/kreuzberg/src/lib.rs +5 -17
  107. data/vendor/kreuzberg/src/mcp/mod.rs +1 -4
  108. data/vendor/kreuzberg/src/mcp/server.rs +21 -145
  109. data/vendor/kreuzberg/src/ocr/mod.rs +0 -2
  110. data/vendor/kreuzberg/src/ocr/processor.rs +8 -19
  111. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +0 -2
  112. data/vendor/kreuzberg/src/pdf/error.rs +1 -93
  113. data/vendor/kreuzberg/src/pdf/metadata.rs +100 -263
  114. data/vendor/kreuzberg/src/pdf/mod.rs +2 -33
  115. data/vendor/kreuzberg/src/pdf/rendering.rs +12 -12
  116. data/vendor/kreuzberg/src/pdf/table.rs +64 -61
  117. data/vendor/kreuzberg/src/pdf/text.rs +24 -416
  118. data/vendor/kreuzberg/src/plugins/extractor.rs +8 -40
  119. data/vendor/kreuzberg/src/plugins/mod.rs +0 -3
  120. data/vendor/kreuzberg/src/plugins/ocr.rs +14 -22
  121. data/vendor/kreuzberg/src/plugins/processor.rs +1 -10
  122. data/vendor/kreuzberg/src/plugins/registry.rs +0 -15
  123. data/vendor/kreuzberg/src/plugins/validator.rs +8 -20
  124. data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
  125. data/vendor/kreuzberg/src/text/mod.rs +0 -8
  126. data/vendor/kreuzberg/src/text/quality.rs +15 -28
  127. data/vendor/kreuzberg/src/text/string_utils.rs +10 -22
  128. data/vendor/kreuzberg/src/text/token_reduction/core.rs +50 -86
  129. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +16 -37
  130. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +1 -2
  131. data/vendor/kreuzberg/src/types.rs +67 -907
  132. data/vendor/kreuzberg/src/utils/mod.rs +0 -14
  133. data/vendor/kreuzberg/src/utils/quality.rs +3 -12
  134. data/vendor/kreuzberg/tests/api_tests.rs +0 -506
  135. data/vendor/kreuzberg/tests/archive_integration.rs +0 -2
  136. data/vendor/kreuzberg/tests/batch_orchestration.rs +12 -57
  137. data/vendor/kreuzberg/tests/batch_processing.rs +8 -32
  138. data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
  139. data/vendor/kreuzberg/tests/concurrency_stress.rs +8 -40
  140. data/vendor/kreuzberg/tests/config_features.rs +1 -33
  141. data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -16
  142. data/vendor/kreuzberg/tests/core_integration.rs +9 -35
  143. data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
  144. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
  145. data/vendor/kreuzberg/tests/email_integration.rs +1 -3
  146. data/vendor/kreuzberg/tests/error_handling.rs +34 -43
  147. data/vendor/kreuzberg/tests/format_integration.rs +1 -7
  148. data/vendor/kreuzberg/tests/helpers/mod.rs +0 -60
  149. data/vendor/kreuzberg/tests/image_integration.rs +0 -2
  150. data/vendor/kreuzberg/tests/mime_detection.rs +16 -17
  151. data/vendor/kreuzberg/tests/ocr_configuration.rs +0 -4
  152. data/vendor/kreuzberg/tests/ocr_errors.rs +0 -22
  153. data/vendor/kreuzberg/tests/ocr_quality.rs +0 -2
  154. data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
  155. data/vendor/kreuzberg/tests/pdf_integration.rs +0 -2
  156. data/vendor/kreuzberg/tests/pipeline_integration.rs +2 -36
  157. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +0 -5
  158. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -17
  159. data/vendor/kreuzberg/tests/plugin_system.rs +0 -6
  160. data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -2
  161. data/vendor/kreuzberg/tests/security_validation.rs +1 -13
  162. data/vendor/kreuzberg/tests/test_fastembed.rs +23 -45
  163. metadata +25 -171
  164. data/.rubocop.yml +0 -543
  165. data/ext/kreuzberg_rb/native/.cargo/config.toml +0 -23
  166. data/ext/kreuzberg_rb/native/Cargo.lock +0 -7619
  167. data/lib/kreuzberg/error_context.rb +0 -136
  168. data/lib/kreuzberg/types.rb +0 -170
  169. data/lib/libpdfium.so +0 -0
  170. data/spec/binding/async_operations_spec.rb +0 -473
  171. data/spec/binding/batch_operations_spec.rb +0 -595
  172. data/spec/binding/batch_spec.rb +0 -359
  173. data/spec/binding/config_result_spec.rb +0 -377
  174. data/spec/binding/embeddings_spec.rb +0 -816
  175. data/spec/binding/error_recovery_spec.rb +0 -488
  176. data/spec/binding/font_config_spec.rb +0 -220
  177. data/spec/binding/images_spec.rb +0 -738
  178. data/spec/binding/keywords_extraction_spec.rb +0 -600
  179. data/spec/binding/metadata_types_spec.rb +0 -1228
  180. data/spec/binding/pages_extraction_spec.rb +0 -471
  181. data/spec/binding/tables_spec.rb +0 -641
  182. data/spec/unit/config/chunking_config_spec.rb +0 -213
  183. data/spec/unit/config/embedding_config_spec.rb +0 -343
  184. data/spec/unit/config/extraction_config_spec.rb +0 -438
  185. data/spec/unit/config/font_config_spec.rb +0 -285
  186. data/spec/unit/config/hierarchy_config_spec.rb +0 -314
  187. data/spec/unit/config/image_extraction_config_spec.rb +0 -209
  188. data/spec/unit/config/image_preprocessing_config_spec.rb +0 -249
  189. data/spec/unit/config/keyword_config_spec.rb +0 -229
  190. data/spec/unit/config/language_detection_config_spec.rb +0 -258
  191. data/spec/unit/config/ocr_config_spec.rb +0 -171
  192. data/spec/unit/config/page_config_spec.rb +0 -221
  193. data/spec/unit/config/pdf_config_spec.rb +0 -267
  194. data/spec/unit/config/postprocessor_config_spec.rb +0 -290
  195. data/spec/unit/config/tesseract_config_spec.rb +0 -181
  196. data/spec/unit/config/token_reduction_config_spec.rb +0 -251
  197. data/test/metadata_types_test.rb +0 -959
  198. data/vendor/Cargo.toml +0 -61
  199. data/vendor/kreuzberg/examples/bench_fixes.rs +0 -71
  200. data/vendor/kreuzberg/examples/test_pdfium_fork.rs +0 -62
  201. data/vendor/kreuzberg/src/chunking/processor.rs +0 -219
  202. data/vendor/kreuzberg/src/core/batch_optimizations.rs +0 -385
  203. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  204. data/vendor/kreuzberg/src/core/formats.rs +0 -235
  205. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  206. data/vendor/kreuzberg/src/extraction/capacity.rs +0 -263
  207. data/vendor/kreuzberg/src/extraction/markdown.rs +0 -216
  208. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -284
  209. data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -470
  210. data/vendor/kreuzberg/src/extractors/docbook.rs +0 -504
  211. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  212. data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -492
  213. data/vendor/kreuzberg/src/extractors/jats.rs +0 -1054
  214. data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -368
  215. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  216. data/vendor/kreuzberg/src/extractors/markdown.rs +0 -701
  217. data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
  218. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  219. data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -529
  220. data/vendor/kreuzberg/src/extractors/rst.rs +0 -577
  221. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  222. data/vendor/kreuzberg/src/extractors/security.rs +0 -484
  223. data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
  224. data/vendor/kreuzberg/src/extractors/typst.rs +0 -651
  225. data/vendor/kreuzberg/src/language_detection/processor.rs +0 -218
  226. data/vendor/kreuzberg/src/ocr/language_registry.rs +0 -520
  227. data/vendor/kreuzberg/src/panic_context.rs +0 -154
  228. data/vendor/kreuzberg/src/pdf/bindings.rs +0 -306
  229. data/vendor/kreuzberg/src/pdf/bundled.rs +0 -408
  230. data/vendor/kreuzberg/src/pdf/fonts.rs +0 -358
  231. data/vendor/kreuzberg/src/pdf/hierarchy.rs +0 -903
  232. data/vendor/kreuzberg/src/text/quality_processor.rs +0 -231
  233. data/vendor/kreuzberg/src/text/utf8_validation.rs +0 -193
  234. data/vendor/kreuzberg/src/utils/pool.rs +0 -503
  235. data/vendor/kreuzberg/src/utils/pool_sizing.rs +0 -364
  236. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -761
  237. data/vendor/kreuzberg/tests/api_embed.rs +0 -360
  238. data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
  239. data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +0 -471
  240. data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +0 -289
  241. data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +0 -154
  242. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
  243. data/vendor/kreuzberg/tests/config_integration_test.rs +0 -753
  244. data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +0 -294
  245. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -500
  246. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
  247. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
  248. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
  249. data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
  250. data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
  251. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
  252. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
  253. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
  254. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
  255. data/vendor/kreuzberg/tests/ocr_language_registry.rs +0 -191
  256. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -674
  257. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
  258. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
  259. data/vendor/kreuzberg/tests/page_markers.rs +0 -297
  260. data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +0 -301
  261. data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +0 -589
  262. data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +0 -301
  263. data/vendor/kreuzberg/tests/pdf_text_merging.rs +0 -475
  264. data/vendor/kreuzberg/tests/pdfium_linking.rs +0 -340
  265. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -694
  266. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -775
  267. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1260
  268. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -648
  269. data/vendor/kreuzberg-ffi/Cargo.toml +0 -67
  270. data/vendor/kreuzberg-ffi/README.md +0 -851
  271. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +0 -227
  272. data/vendor/kreuzberg-ffi/build.rs +0 -168
  273. data/vendor/kreuzberg-ffi/cbindgen.toml +0 -37
  274. data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +0 -12
  275. data/vendor/kreuzberg-ffi/kreuzberg.h +0 -3012
  276. data/vendor/kreuzberg-ffi/src/batch_streaming.rs +0 -588
  277. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
  278. data/vendor/kreuzberg-ffi/src/error.rs +0 -901
  279. data/vendor/kreuzberg-ffi/src/extraction.rs +0 -555
  280. data/vendor/kreuzberg-ffi/src/helpers.rs +0 -879
  281. data/vendor/kreuzberg-ffi/src/lib.rs +0 -977
  282. data/vendor/kreuzberg-ffi/src/memory.rs +0 -493
  283. data/vendor/kreuzberg-ffi/src/mime.rs +0 -329
  284. data/vendor/kreuzberg-ffi/src/panic_shield.rs +0 -265
  285. data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +0 -442
  286. data/vendor/kreuzberg-ffi/src/plugins/mod.rs +0 -14
  287. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +0 -628
  288. data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +0 -438
  289. data/vendor/kreuzberg-ffi/src/plugins/validator.rs +0 -329
  290. data/vendor/kreuzberg-ffi/src/result.rs +0 -510
  291. data/vendor/kreuzberg-ffi/src/result_pool.rs +0 -639
  292. data/vendor/kreuzberg-ffi/src/result_view.rs +0 -773
  293. data/vendor/kreuzberg-ffi/src/string_intern.rs +0 -568
  294. data/vendor/kreuzberg-ffi/src/types.rs +0 -363
  295. data/vendor/kreuzberg-ffi/src/util.rs +0 -210
  296. data/vendor/kreuzberg-ffi/src/validation.rs +0 -848
  297. data/vendor/kreuzberg-ffi/tests.disabled/README.md +0 -48
  298. data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +0 -299
  299. data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +0 -346
  300. data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +0 -232
  301. data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +0 -470
  302. data/vendor/kreuzberg-tesseract/.commitlintrc.json +0 -13
  303. data/vendor/kreuzberg-tesseract/.crate-ignore +0 -2
  304. data/vendor/kreuzberg-tesseract/Cargo.lock +0 -2933
  305. data/vendor/kreuzberg-tesseract/Cargo.toml +0 -57
  306. data/vendor/kreuzberg-tesseract/LICENSE +0 -22
  307. data/vendor/kreuzberg-tesseract/README.md +0 -399
  308. data/vendor/kreuzberg-tesseract/build.rs +0 -1127
  309. data/vendor/kreuzberg-tesseract/patches/README.md +0 -71
  310. data/vendor/kreuzberg-tesseract/patches/tesseract.diff +0 -199
  311. data/vendor/kreuzberg-tesseract/src/api.rs +0 -1371
  312. data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +0 -77
  313. data/vendor/kreuzberg-tesseract/src/enums.rs +0 -297
  314. data/vendor/kreuzberg-tesseract/src/error.rs +0 -81
  315. data/vendor/kreuzberg-tesseract/src/lib.rs +0 -145
  316. data/vendor/kreuzberg-tesseract/src/monitor.rs +0 -57
  317. data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +0 -197
  318. data/vendor/kreuzberg-tesseract/src/page_iterator.rs +0 -253
  319. data/vendor/kreuzberg-tesseract/src/result_iterator.rs +0 -286
  320. data/vendor/kreuzberg-tesseract/src/result_renderer.rs +0 -183
  321. data/vendor/kreuzberg-tesseract/tests/integration_test.rs +0 -211
@@ -1,7 +1,6 @@
1
1
  //! Image extraction functionality.
2
2
  //!
3
- //! This module provides functions for extracting metadata and EXIF data from images,
4
- //! including support for multi-frame TIFF files.
3
+ //! This module provides functions for extracting metadata and EXIF data from images.
5
4
 
6
5
  use crate::error::{KreuzbergError, Result};
7
6
  use exif::{In, Reader, Tag};
@@ -40,7 +39,7 @@ pub fn extract_image_metadata(bytes: &[u8]) -> Result<ImageMetadata> {
40
39
 
41
40
  let width = image.width();
42
41
  let height = image.height();
43
- let format_str = format!("{:?}", format).to_uppercase();
42
+ let format_str = format!("{:?}", format);
44
43
 
45
44
  let exif_data = extract_exif_data(bytes);
46
45
 
@@ -95,129 +94,6 @@ fn extract_exif_data(bytes: &[u8]) -> HashMap<String, String> {
95
94
  exif_map
96
95
  }
97
96
 
98
- /// Result of OCR extraction from an image with optional page tracking.
99
- #[derive(Debug, Clone)]
100
- pub struct ImageOcrResult {
101
- /// Extracted text content
102
- pub content: String,
103
- /// Character byte boundaries per frame (for multi-frame TIFFs)
104
- pub boundaries: Option<Vec<crate::types::PageBoundary>>,
105
- /// Per-frame content information
106
- pub page_contents: Option<Vec<crate::types::PageContent>>,
107
- }
108
-
109
- /// Detects the number of frames in a TIFF file.
110
- ///
111
- /// Returns the count of image frames/pages in a TIFF. Single-frame TIFFs return 1.
112
- /// Invalid or non-TIFF data returns an error.
113
- ///
114
- /// # Arguments
115
- /// * `bytes` - Raw TIFF file bytes
116
- ///
117
- /// # Returns
118
- /// Frame count if valid TIFF, error otherwise.
119
- #[cfg(feature = "ocr")]
120
- fn detect_tiff_frame_count(bytes: &[u8]) -> Result<usize> {
121
- use tiff::decoder::Decoder;
122
- let mut decoder =
123
- Decoder::new(Cursor::new(bytes)).map_err(|e| KreuzbergError::parsing(format!("TIFF decode: {}", e)))?;
124
-
125
- let mut count = 1;
126
- while decoder.next_image().is_ok() {
127
- count += 1;
128
- }
129
- Ok(count)
130
- }
131
-
132
- /// Extract text from image bytes using OCR with optional page tracking for multi-frame TIFFs.
133
- ///
134
- /// This function:
135
- /// - Detects if the image is a multi-frame TIFF
136
- /// - For multi-frame TIFFs with PageConfig enabled, iterates frames and tracks boundaries
137
- /// - For single-frame images or when page tracking is disabled, runs OCR on the whole image
138
- /// - Returns (content, boundaries, page_contents) tuple
139
- ///
140
- /// # Arguments
141
- /// * `bytes` - Image file bytes
142
- /// * `mime_type` - MIME type (e.g., "image/tiff")
143
- /// * `ocr_result` - OCR backend result containing the text
144
- /// * `page_config` - Optional page configuration for boundary tracking
145
- ///
146
- /// # Returns
147
- /// ImageOcrResult with content and optional boundaries for pagination
148
- #[cfg(feature = "ocr")]
149
- pub fn extract_text_from_image_with_ocr(
150
- bytes: &[u8],
151
- mime_type: &str,
152
- ocr_result: String,
153
- page_config: Option<&crate::core::config::PageConfig>,
154
- ) -> Result<ImageOcrResult> {
155
- let is_tiff = mime_type.to_lowercase().contains("tiff");
156
- let should_track_pages = page_config.is_some() && is_tiff;
157
-
158
- if !should_track_pages {
159
- return Ok(ImageOcrResult {
160
- content: ocr_result,
161
- boundaries: None,
162
- page_contents: None,
163
- });
164
- }
165
-
166
- let frame_count = detect_tiff_frame_count(bytes)?;
167
-
168
- if frame_count <= 1 {
169
- return Ok(ImageOcrResult {
170
- content: ocr_result,
171
- boundaries: None,
172
- page_contents: None,
173
- });
174
- }
175
-
176
- let content_len = ocr_result.len();
177
- let content_per_frame = if frame_count > 0 {
178
- content_len / frame_count
179
- } else {
180
- content_len
181
- };
182
-
183
- let mut boundaries = Vec::new();
184
- let mut page_contents = Vec::new();
185
- let mut byte_offset = 0;
186
-
187
- for frame_num in 1..=frame_count {
188
- let frame_end = if frame_num == frame_count {
189
- content_len
190
- } else {
191
- let raw_end = (frame_num * content_per_frame).min(content_len);
192
- (raw_end..=content_len)
193
- .find(|&i| ocr_result.is_char_boundary(i))
194
- .unwrap_or(content_len)
195
- };
196
-
197
- boundaries.push(crate::types::PageBoundary {
198
- byte_start: byte_offset,
199
- byte_end: frame_end,
200
- page_number: frame_num,
201
- });
202
-
203
- page_contents.push(crate::types::PageContent {
204
- page_number: frame_num,
205
- content: ocr_result[byte_offset..frame_end].to_string(),
206
- tables: vec![],
207
- images: vec![],
208
- hierarchy: None,
209
- });
210
-
211
- byte_offset = frame_end;
212
- }
213
-
214
- Ok(ImageOcrResult {
215
- content: ocr_result,
216
- boundaries: Some(boundaries),
217
- page_contents: Some(page_contents),
218
- })
219
- }
220
-
221
97
  #[cfg(test)]
222
98
  mod tests {
223
99
  use super::*;
@@ -247,7 +123,7 @@ mod tests {
247
123
  let metadata = result.unwrap();
248
124
  assert_eq!(metadata.width, 100);
249
125
  assert_eq!(metadata.height, 80);
250
- assert_eq!(metadata.format, "PNG");
126
+ assert_eq!(metadata.format, "Png");
251
127
  }
252
128
 
253
129
  #[test]
@@ -259,7 +135,7 @@ mod tests {
259
135
  let metadata = result.unwrap();
260
136
  assert_eq!(metadata.width, 200);
261
137
  assert_eq!(metadata.height, 150);
262
- assert_eq!(metadata.format, "JPEG");
138
+ assert_eq!(metadata.format, "Jpeg");
263
139
  }
264
140
 
265
141
  #[test]
@@ -271,7 +147,7 @@ mod tests {
271
147
  let metadata = result.unwrap();
272
148
  assert_eq!(metadata.width, 120);
273
149
  assert_eq!(metadata.height, 90);
274
- assert_eq!(metadata.format, "WEBP");
150
+ assert_eq!(metadata.format, "WebP");
275
151
  }
276
152
 
277
153
  #[test]
@@ -283,7 +159,7 @@ mod tests {
283
159
  let metadata = result.unwrap();
284
160
  assert_eq!(metadata.width, 50);
285
161
  assert_eq!(metadata.height, 50);
286
- assert_eq!(metadata.format, "BMP");
162
+ assert_eq!(metadata.format, "Bmp");
287
163
  }
288
164
 
289
165
  #[test]
@@ -295,7 +171,7 @@ mod tests {
295
171
  let metadata = result.unwrap();
296
172
  assert_eq!(metadata.width, 180);
297
173
  assert_eq!(metadata.height, 120);
298
- assert_eq!(metadata.format, "TIFF");
174
+ assert_eq!(metadata.format, "Tiff");
299
175
  }
300
176
 
301
177
  #[test]
@@ -307,7 +183,7 @@ mod tests {
307
183
  let metadata = result.unwrap();
308
184
  assert_eq!(metadata.width, 64);
309
185
  assert_eq!(metadata.height, 64);
310
- assert_eq!(metadata.format, "GIF");
186
+ assert_eq!(metadata.format, "Gif");
311
187
  }
312
188
 
313
189
  #[test]
@@ -341,8 +217,8 @@ mod tests {
341
217
  let png_metadata = extract_image_metadata(&png_bytes).unwrap();
342
218
  let jpeg_metadata = extract_image_metadata(&jpeg_bytes).unwrap();
343
219
 
344
- assert_eq!(png_metadata.format, "PNG");
345
- assert_eq!(jpeg_metadata.format, "JPEG");
220
+ assert_eq!(png_metadata.format, "Png");
221
+ assert_eq!(jpeg_metadata.format, "Jpeg");
346
222
  }
347
223
 
348
224
  #[test]
@@ -408,7 +284,7 @@ mod tests {
408
284
  let metadata = result.unwrap();
409
285
  assert_eq!(metadata.width, 1);
410
286
  assert_eq!(metadata.height, 1);
411
- assert_eq!(metadata.format, "PNG");
287
+ assert_eq!(metadata.format, "Png");
412
288
  }
413
289
 
414
290
  #[test]
@@ -485,8 +361,8 @@ mod tests {
485
361
  let jpeg_meta = extract_image_metadata(&jpeg_bytes).unwrap();
486
362
  let webp_meta = extract_image_metadata(&webp_bytes).unwrap();
487
363
 
488
- assert_eq!(png_meta.format, "PNG");
489
- assert_eq!(jpeg_meta.format, "JPEG");
490
- assert_eq!(webp_meta.format, "WEBP");
364
+ assert_eq!(png_meta.format, "Png");
365
+ assert_eq!(jpeg_meta.format, "Jpeg");
366
+ assert_eq!(webp_meta.format, "WebP");
491
367
  }
492
368
  }
@@ -255,6 +255,7 @@ pub async fn convert_office_doc(
255
255
  let stderr = String::from_utf8_lossy(&output.stderr);
256
256
  let stdout = String::from_utf8_lossy(&output.stdout);
257
257
 
258
+ // Build detailed error message with both stdout and stderr
258
259
  let mut error_details = format!(
259
260
  "LibreOffice process failed with return code {}",
260
261
  output.status.code().unwrap_or(-1)
@@ -361,7 +362,6 @@ mod tests {
361
362
  use super::*;
362
363
 
363
364
  #[tokio::test]
364
- #[cfg(not(target_os = "windows"))]
365
365
  async fn test_check_libreoffice_available() {
366
366
  let result = check_libreoffice_available().await;
367
367
  if result.is_err() {
@@ -371,7 +371,6 @@ mod tests {
371
371
  }
372
372
 
373
373
  #[tokio::test]
374
- #[cfg(not(target_os = "windows"))]
375
374
  async fn test_convert_office_doc_missing_file() {
376
375
  if check_libreoffice_available().await.is_err() {
377
376
  return;
@@ -393,7 +392,6 @@ mod tests {
393
392
  }
394
393
 
395
394
  #[tokio::test]
396
- #[cfg(not(target_os = "windows"))]
397
395
  async fn test_convert_doc_to_docx_empty_bytes() {
398
396
  if check_libreoffice_available().await.is_err() {
399
397
  return;
@@ -406,7 +404,6 @@ mod tests {
406
404
  }
407
405
 
408
406
  #[tokio::test]
409
- #[cfg(not(target_os = "windows"))]
410
407
  async fn test_convert_ppt_to_pptx_empty_bytes() {
411
408
  if check_libreoffice_available().await.is_err() {
412
409
  return;
@@ -419,7 +416,6 @@ mod tests {
419
416
  }
420
417
 
421
418
  #[tokio::test]
422
- #[cfg(not(target_os = "windows"))]
423
419
  async fn test_convert_doc_to_docx_invalid_doc() {
424
420
  if check_libreoffice_available().await.is_err() {
425
421
  return;
@@ -432,7 +428,6 @@ mod tests {
432
428
  }
433
429
 
434
430
  #[tokio::test]
435
- #[cfg(not(target_os = "windows"))]
436
431
  async fn test_convert_ppt_to_pptx_invalid_ppt() {
437
432
  if check_libreoffice_available().await.is_err() {
438
433
  return;
@@ -445,7 +440,6 @@ mod tests {
445
440
  }
446
441
 
447
442
  #[tokio::test]
448
- #[cfg(not(target_os = "windows"))]
449
443
  async fn test_convert_office_doc_invalid_target_format() {
450
444
  if check_libreoffice_available().await.is_err() {
451
445
  return;
@@ -466,11 +460,11 @@ mod tests {
466
460
  }
467
461
 
468
462
  #[tokio::test]
469
- #[cfg(not(target_os = "windows"))]
470
463
  async fn test_check_libreoffice_missing_dependency_error() {
471
464
  let result = check_libreoffice_available().await;
472
465
 
473
- if let Err(err) = result {
466
+ if result.is_err() {
467
+ let err = result.unwrap_err();
474
468
  match err {
475
469
  KreuzbergError::MissingDependency(msg) => {
476
470
  assert!(msg.contains("LibreOffice") || msg.contains("soffice"));
@@ -481,7 +475,6 @@ mod tests {
481
475
  }
482
476
 
483
477
  #[tokio::test]
484
- #[cfg(not(target_os = "windows"))]
485
478
  async fn test_convert_office_doc_creates_output_dir() {
486
479
  if check_libreoffice_available().await.is_err() {
487
480
  return;
@@ -516,7 +509,6 @@ mod tests {
516
509
  }
517
510
 
518
511
  #[tokio::test]
519
- #[cfg(not(target_os = "windows"))]
520
512
  async fn test_convert_doc_to_docx_temp_cleanup() {
521
513
  if check_libreoffice_available().await.is_err() {
522
514
  return;
@@ -527,7 +519,6 @@ mod tests {
527
519
  }
528
520
 
529
521
  #[tokio::test]
530
- #[cfg(not(target_os = "windows"))]
531
522
  async fn test_convert_ppt_to_pptx_temp_cleanup() {
532
523
  if check_libreoffice_available().await.is_err() {
533
524
  return;
@@ -538,7 +529,6 @@ mod tests {
538
529
  }
539
530
 
540
531
  #[tokio::test]
541
- #[cfg(not(target_os = "windows"))]
542
532
  async fn test_convert_office_doc_timeout_kills_process() {
543
533
  if check_libreoffice_available().await.is_err() {
544
534
  return;
@@ -4,13 +4,6 @@ pub mod text;
4
4
  #[cfg(feature = "ocr")]
5
5
  pub mod image;
6
6
 
7
- /// Capacity estimation utilities for string pre-allocation.
8
- ///
9
- /// This module provides functions to estimate the capacity needed for string buffers
10
- /// based on input file sizes and content types. This enables pre-allocation, reducing
11
- /// reallocation cycles during string building operations.
12
- pub mod capacity;
13
-
14
7
  #[cfg(feature = "archives")]
15
8
  pub mod archive;
16
9
 
@@ -32,6 +25,9 @@ pub mod libreoffice;
32
25
  #[cfg(feature = "office")]
33
26
  pub mod office_metadata;
34
27
 
28
+ #[cfg(feature = "office")]
29
+ pub mod pandoc;
30
+
35
31
  #[cfg(feature = "office")]
36
32
  pub mod pptx;
37
33
 
@@ -41,9 +37,6 @@ pub mod table;
41
37
  #[cfg(feature = "xml")]
42
38
  pub mod xml;
43
39
 
44
- #[cfg(any(feature = "office", feature = "html"))]
45
- pub mod markdown;
46
-
47
40
  pub use structured::{JsonExtractionConfig, StructuredDataResult, parse_json, parse_toml, parse_yaml};
48
41
  pub use text::parse_text;
49
42
 
@@ -70,9 +63,8 @@ pub use libreoffice::{check_libreoffice_available, convert_doc_to_docx, convert_
70
63
 
71
64
  #[cfg(feature = "office")]
72
65
  pub use office_metadata::{
73
- CoreProperties, CustomProperties, DocxAppProperties, OdtProperties, PptxAppProperties, XlsxAppProperties,
74
- extract_core_properties, extract_custom_properties, extract_docx_app_properties, extract_odt_properties,
75
- extract_pptx_app_properties, extract_xlsx_app_properties,
66
+ CoreProperties, CustomProperties, DocxAppProperties, PptxAppProperties, XlsxAppProperties, extract_core_properties,
67
+ extract_custom_properties, extract_docx_app_properties, extract_pptx_app_properties, extract_xlsx_app_properties,
76
68
  };
77
69
 
78
70
  #[cfg(feature = "office")]
@@ -83,11 +75,3 @@ pub use table::table_from_arrow_to_markdown;
83
75
 
84
76
  #[cfg(feature = "xml")]
85
77
  pub use xml::parse_xml;
86
-
87
- #[cfg(any(feature = "office", feature = "html"))]
88
- pub use markdown::cells_to_markdown;
89
-
90
- pub use capacity::{
91
- estimate_content_capacity, estimate_html_markdown_capacity, estimate_presentation_capacity,
92
- estimate_spreadsheet_capacity, estimate_table_markdown_capacity,
93
- };
@@ -35,7 +35,6 @@
35
35
  pub mod app_properties;
36
36
  pub mod core_properties;
37
37
  pub mod custom_properties;
38
- pub mod odt_properties;
39
38
 
40
39
  pub use app_properties::{
41
40
  DocxAppProperties, PptxAppProperties, XlsxAppProperties, extract_docx_app_properties, extract_pptx_app_properties,
@@ -43,7 +42,6 @@ pub use app_properties::{
43
42
  };
44
43
  pub use core_properties::{CoreProperties, extract_core_properties};
45
44
  pub use custom_properties::{CustomProperties, extract_custom_properties};
46
- pub use odt_properties::{OdtProperties, extract_odt_properties};
47
45
 
48
46
  use roxmltree::Node;
49
47
 
@@ -0,0 +1,275 @@
1
+ //! Batch extraction with automatic pandoc-server mode optimization
2
+ //!
3
+ //! This module provides intelligent batch processing for Pandoc extractions with automatic
4
+ //! server mode optimization. The `BatchExtractor` automatically detects if pandoc-server is
5
+ //! available and uses it when beneficial for performance.
6
+ //!
7
+ //! # Server Mode Detection
8
+ //!
9
+ //! The extractor checks for pandoc-server availability in two ways:
10
+ //! 1. Direct binary check: Looks for `pandoc-server` in PATH
11
+ //! 2. Version detection: Checks if pandoc 3.8+ is installed (supports server mode)
12
+ //!
13
+ //! # Optimization Heuristic
14
+ //!
15
+ //! - **>3 files**: Uses server mode (amortizes ~100-200ms startup overhead)
16
+ //! - **≤3 files**: Uses subprocess mode (avoids server startup cost)
17
+ //! - **Server unavailable**: Always uses subprocess mode (graceful fallback)
18
+ //!
19
+ //! # Example Usage
20
+ //!
21
+ //! ```no_run
22
+ //! use kreuzberg::extraction::pandoc::BatchExtractor;
23
+ //! use std::path::Path;
24
+ //!
25
+ //! #[tokio::main]
26
+ //! async fn main() -> kreuzberg::Result<()> {
27
+ //! // Create extractor (auto-detects server availability)
28
+ //! let extractor = BatchExtractor::new().await;
29
+ //!
30
+ //! // Extract multiple files
31
+ //! let paths = vec![
32
+ //! Path::new("doc1.docx"),
33
+ //! Path::new("doc2.docx"),
34
+ //! Path::new("doc3.docx"),
35
+ //! Path::new("doc4.docx"),
36
+ //! ];
37
+ //! let formats = vec!["docx", "docx", "docx", "docx"];
38
+ //!
39
+ //! let results = extractor.extract_files(&paths, &formats).await?;
40
+ //!
41
+ //! // Process results
42
+ //! for (i, result) in results.iter().enumerate() {
43
+ //! match result {
44
+ //! Ok(extraction) => println!("File {}: {} chars", i, extraction.content.len()),
45
+ //! Err(e) => eprintln!("File {}: Error: {}", i, e),
46
+ //! }
47
+ //! }
48
+ //!
49
+ //! // Cleanup (optional, happens automatically on drop)
50
+ //! extractor.shutdown().await?;
51
+ //!
52
+ //! Ok(())
53
+ //! }
54
+ //! ```
55
+ //!
56
+ //! # Logging
57
+ //!
58
+ //! Enable tracing to see server mode detection and usage:
59
+ //!
60
+ //! ```bash
61
+ //! RUST_LOG=kreuzberg=debug cargo run
62
+ //! ```
63
+ //!
64
+ //! Expected logs:
65
+ //! - `DEBUG`: Server detection results, file counts
66
+ //! - `INFO`: Server startup confirmation
67
+ //! - `WARN`: Server failures with troubleshooting guidance
68
+
69
+ use crate::error::{KreuzbergError, Result};
70
+ use crate::types::PandocExtractionResult;
71
+ use serde_json::Value;
72
+ use std::path::Path;
73
+ use std::sync::Arc;
74
+ use tokio::sync::Mutex;
75
+
76
+ use super::server::PandocServer;
77
+ use super::subprocess;
78
+
79
+ /// Batch extractor with automatic server mode optimization
80
+ ///
81
+ /// Automatically detects pandoc-server availability and uses it when beneficial for
82
+ /// performance (>3 files). Falls back to subprocess mode gracefully.
83
+ ///
84
+ /// # Performance
85
+ ///
86
+ /// - **Server mode**: ~100-200ms savings per file (startup overhead eliminated)
87
+ /// - **Subprocess mode**: Lightweight, suitable for small batches
88
+ ///
89
+ /// # Thread Safety
90
+ ///
91
+ /// This struct is safe to use across multiple async tasks. The server instance is
92
+ /// protected by an Arc<Mutex> and can be safely shared.
93
+ pub struct BatchExtractor {
94
+ server: Arc<Mutex<Option<PandocServer>>>,
95
+ use_server: bool,
96
+ }
97
+
98
+ impl BatchExtractor {
99
+ /// Create a new batch extractor
100
+ ///
101
+ /// Automatically detects if pandoc-server is available and uses it if beneficial
102
+ pub async fn new() -> Self {
103
+ let use_server = PandocServer::is_server_available().await;
104
+
105
+ if use_server {
106
+ tracing::info!("Pandoc server mode available - will use for batch processing");
107
+ } else {
108
+ tracing::debug!("Pandoc server mode not available - using subprocess mode");
109
+ tracing::debug!("To enable server mode:");
110
+ tracing::debug!(" 1. Install pandoc 3.8+ (current version may be older)");
111
+ tracing::debug!(" 2. OR create symlink: ln -s $(which pandoc) /usr/local/bin/pandoc-server");
112
+ }
113
+
114
+ Self {
115
+ server: Arc::new(Mutex::new(None)),
116
+ use_server,
117
+ }
118
+ }
119
+
120
+ /// Extract multiple files using optimal strategy
121
+ ///
122
+ /// Automatically starts server if available and beneficial (>3 files)
123
+ pub async fn extract_files(
124
+ &self,
125
+ paths: &[&Path],
126
+ formats: &[&str],
127
+ ) -> Result<Vec<Result<PandocExtractionResult>>> {
128
+ if paths.is_empty() {
129
+ return Ok(vec![]);
130
+ }
131
+
132
+ if self.use_server && paths.len() > 3 {
133
+ self.extract_with_server(paths, formats).await
134
+ } else {
135
+ self.extract_with_subprocess(paths, formats).await
136
+ }
137
+ }
138
+
139
+ /// Extract using server mode (warm instance)
140
+ async fn extract_with_server(
141
+ &self,
142
+ paths: &[&Path],
143
+ formats: &[&str],
144
+ ) -> Result<Vec<Result<PandocExtractionResult>>> {
145
+ let mut server_lock = self.server.lock().await;
146
+
147
+ if server_lock.is_none() {
148
+ match PandocServer::new(None, None).await {
149
+ Ok(server) => {
150
+ if let Err(e) = server.start().await {
151
+ tracing::warn!("Failed to start pandoc-server: {}", e);
152
+ tracing::warn!("Falling back to subprocess mode");
153
+ tracing::warn!("To fix:");
154
+ tracing::warn!(" 1. Ensure pandoc 3.8+ is installed: pandoc --version");
155
+ tracing::warn!(" 2. Create symlink: ln -s $(which pandoc) /usr/local/bin/pandoc-server");
156
+ drop(server_lock);
157
+ return self.extract_with_subprocess(paths, formats).await;
158
+ }
159
+
160
+ tracing::info!("Started pandoc-server for batch processing ({} files)", paths.len());
161
+ *server_lock = Some(server);
162
+ }
163
+ Err(e) => {
164
+ tracing::warn!("Failed to create pandoc-server: {}", e);
165
+ tracing::warn!("Falling back to subprocess mode");
166
+ drop(server_lock);
167
+ return self.extract_with_subprocess(paths, formats).await;
168
+ }
169
+ }
170
+ } else {
171
+ tracing::debug!("Reusing warm pandoc-server instance for {} files", paths.len());
172
+ }
173
+
174
+ let server = server_lock.as_ref().unwrap();
175
+ let mut results = Vec::with_capacity(paths.len());
176
+
177
+ for (i, (path, format)) in paths.iter().zip(formats.iter()).enumerate() {
178
+ tracing::debug!("Extracting file {}/{} via server: {:?}", i + 1, paths.len(), path);
179
+
180
+ let content = match tokio::fs::read(path).await {
181
+ Ok(c) => c,
182
+ Err(e) => {
183
+ results.push(Err(KreuzbergError::Io(e)));
184
+ continue;
185
+ }
186
+ };
187
+
188
+ let result = match server.convert(&String::from_utf8_lossy(&content), format, "json").await {
189
+ Ok(json_output) => match serde_json::from_str::<Value>(&json_output) {
190
+ Ok(json_data) => {
191
+ let content = subprocess::extract_content_from_json(&json_data)?;
192
+ let metadata = subprocess::extract_metadata_from_json(&json_data)?;
193
+
194
+ Ok(PandocExtractionResult { content, metadata })
195
+ }
196
+ Err(e) => Err(KreuzbergError::parsing(format!(
197
+ "Failed to parse JSON from server: {}",
198
+ e
199
+ ))),
200
+ },
201
+ Err(e) => Err(e),
202
+ };
203
+
204
+ results.push(result);
205
+ }
206
+
207
+ Ok(results)
208
+ }
209
+
210
+ /// Extract using subprocess mode
211
+ async fn extract_with_subprocess(
212
+ &self,
213
+ paths: &[&Path],
214
+ formats: &[&str],
215
+ ) -> Result<Vec<Result<PandocExtractionResult>>> {
216
+ tracing::debug!("Extracting {} files via subprocess mode", paths.len());
217
+
218
+ let mut results = Vec::with_capacity(paths.len());
219
+
220
+ for (path, format) in paths.iter().zip(formats.iter()) {
221
+ let result = subprocess::extract_with_pandoc(path, format)
222
+ .await
223
+ .map(|(content, metadata)| PandocExtractionResult { content, metadata });
224
+ results.push(result);
225
+ }
226
+
227
+ Ok(results)
228
+ }
229
+
230
+ /// Stop the server if running
231
+ pub async fn shutdown(&self) -> Result<()> {
232
+ let mut server_lock = self.server.lock().await;
233
+ if let Some(server) = server_lock.take() {
234
+ tracing::info!("Shutting down pandoc-server");
235
+ server.stop().await?;
236
+ }
237
+ Ok(())
238
+ }
239
+ }
240
+
241
+ impl Drop for BatchExtractor {
242
+ fn drop(&mut self) {
243
+ if let Some(server) = self.server.try_lock().ok().and_then(|mut s| s.take()) {
244
+ tokio::spawn(async move {
245
+ let _ = server.stop().await;
246
+ });
247
+ }
248
+ }
249
+ }
250
+
251
#[cfg(test)]
mod tests {
    use super::*;

    /// A freshly constructed extractor has not started any server yet.
    #[tokio::test]
    async fn test_batch_extractor_creation() {
        let extractor = BatchExtractor::new().await;
        let slot = extractor.server.lock().await;
        assert!(slot.is_none());
    }

    /// An empty batch succeeds and yields no results.
    #[tokio::test]
    async fn test_empty_batch() {
        let extractor = BatchExtractor::new().await;
        let outcome = extractor.extract_files(&[], &[]).await;
        assert!(outcome.is_ok());
        let per_file = outcome.unwrap();
        assert!(per_file.is_empty());
    }

    /// Shutting down without a running server is a successful no-op.
    #[tokio::test]
    async fn test_shutdown() {
        let extractor = BatchExtractor::new().await;
        assert!(extractor.shutdown().await.is_ok());
    }
}