kreuzberg 4.0.0.pre.rc.29 → 4.0.0.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (321) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +0 -6
  3. data/.rubocop.yaml +534 -1
  4. data/Gemfile +2 -1
  5. data/Gemfile.lock +28 -116
  6. data/README.md +269 -629
  7. data/Rakefile +0 -9
  8. data/Steepfile +4 -8
  9. data/examples/async_patterns.rb +58 -1
  10. data/ext/kreuzberg_rb/extconf.rb +5 -35
  11. data/ext/kreuzberg_rb/native/Cargo.toml +16 -55
  12. data/ext/kreuzberg_rb/native/build.rs +14 -12
  13. data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
  14. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
  15. data/ext/kreuzberg_rb/native/include/strings.h +2 -2
  16. data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
  17. data/ext/kreuzberg_rb/native/src/lib.rs +34 -897
  18. data/extconf.rb +6 -38
  19. data/kreuzberg.gemspec +20 -114
  20. data/lib/kreuzberg/api_proxy.rb +18 -2
  21. data/lib/kreuzberg/cache_api.rb +0 -22
  22. data/lib/kreuzberg/cli.rb +10 -2
  23. data/lib/kreuzberg/cli_proxy.rb +10 -0
  24. data/lib/kreuzberg/config.rb +22 -274
  25. data/lib/kreuzberg/errors.rb +7 -73
  26. data/lib/kreuzberg/extraction_api.rb +8 -237
  27. data/lib/kreuzberg/mcp_proxy.rb +11 -2
  28. data/lib/kreuzberg/ocr_backend_protocol.rb +73 -0
  29. data/lib/kreuzberg/post_processor_protocol.rb +71 -0
  30. data/lib/kreuzberg/result.rb +33 -151
  31. data/lib/kreuzberg/setup_lib_path.rb +2 -22
  32. data/lib/kreuzberg/validator_protocol.rb +73 -0
  33. data/lib/kreuzberg/version.rb +1 -1
  34. data/lib/kreuzberg.rb +13 -27
  35. data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
  36. data/sig/kreuzberg.rbs +12 -105
  37. data/spec/binding/cache_spec.rb +22 -22
  38. data/spec/binding/cli_proxy_spec.rb +4 -2
  39. data/spec/binding/cli_spec.rb +11 -12
  40. data/spec/binding/config_spec.rb +0 -74
  41. data/spec/binding/config_validation_spec.rb +6 -100
  42. data/spec/binding/error_handling_spec.rb +97 -283
  43. data/spec/binding/plugins/ocr_backend_spec.rb +8 -8
  44. data/spec/binding/plugins/postprocessor_spec.rb +11 -11
  45. data/spec/binding/plugins/validator_spec.rb +13 -12
  46. data/spec/examples.txt +104 -0
  47. data/spec/fixtures/config.toml +1 -0
  48. data/spec/fixtures/config.yaml +1 -0
  49. data/spec/fixtures/invalid_config.toml +1 -0
  50. data/spec/smoke/package_spec.rb +3 -2
  51. data/spec/spec_helper.rb +3 -1
  52. data/vendor/kreuzberg/Cargo.toml +67 -192
  53. data/vendor/kreuzberg/README.md +9 -97
  54. data/vendor/kreuzberg/build.rs +194 -516
  55. data/vendor/kreuzberg/src/api/handlers.rs +9 -130
  56. data/vendor/kreuzberg/src/api/mod.rs +3 -18
  57. data/vendor/kreuzberg/src/api/server.rs +71 -236
  58. data/vendor/kreuzberg/src/api/types.rs +7 -43
  59. data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
  60. data/vendor/kreuzberg/src/cache/mod.rs +3 -27
  61. data/vendor/kreuzberg/src/chunking/mod.rs +79 -1705
  62. data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
  63. data/vendor/kreuzberg/src/core/config.rs +23 -905
  64. data/vendor/kreuzberg/src/core/extractor.rs +106 -403
  65. data/vendor/kreuzberg/src/core/io.rs +2 -4
  66. data/vendor/kreuzberg/src/core/mime.rs +12 -2
  67. data/vendor/kreuzberg/src/core/mod.rs +3 -22
  68. data/vendor/kreuzberg/src/core/pipeline.rs +78 -395
  69. data/vendor/kreuzberg/src/embeddings.rs +21 -169
  70. data/vendor/kreuzberg/src/error.rs +2 -2
  71. data/vendor/kreuzberg/src/extraction/archive.rs +31 -36
  72. data/vendor/kreuzberg/src/extraction/docx.rs +1 -365
  73. data/vendor/kreuzberg/src/extraction/email.rs +11 -12
  74. data/vendor/kreuzberg/src/extraction/excel.rs +129 -138
  75. data/vendor/kreuzberg/src/extraction/html.rs +170 -1447
  76. data/vendor/kreuzberg/src/extraction/image.rs +14 -138
  77. data/vendor/kreuzberg/src/extraction/libreoffice.rs +3 -13
  78. data/vendor/kreuzberg/src/extraction/mod.rs +5 -21
  79. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
  80. data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
  81. data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
  82. data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
  83. data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
  84. data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
  85. data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
  86. data/vendor/kreuzberg/src/extraction/pptx.rs +94 -196
  87. data/vendor/kreuzberg/src/extraction/structured.rs +4 -5
  88. data/vendor/kreuzberg/src/extraction/table.rs +1 -2
  89. data/vendor/kreuzberg/src/extraction/text.rs +10 -18
  90. data/vendor/kreuzberg/src/extractors/archive.rs +0 -22
  91. data/vendor/kreuzberg/src/extractors/docx.rs +148 -69
  92. data/vendor/kreuzberg/src/extractors/email.rs +9 -37
  93. data/vendor/kreuzberg/src/extractors/excel.rs +40 -81
  94. data/vendor/kreuzberg/src/extractors/html.rs +173 -182
  95. data/vendor/kreuzberg/src/extractors/image.rs +8 -32
  96. data/vendor/kreuzberg/src/extractors/mod.rs +10 -171
  97. data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
  98. data/vendor/kreuzberg/src/extractors/pdf.rs +64 -329
  99. data/vendor/kreuzberg/src/extractors/pptx.rs +34 -79
  100. data/vendor/kreuzberg/src/extractors/structured.rs +0 -16
  101. data/vendor/kreuzberg/src/extractors/text.rs +7 -30
  102. data/vendor/kreuzberg/src/extractors/xml.rs +8 -27
  103. data/vendor/kreuzberg/src/keywords/processor.rs +1 -9
  104. data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
  105. data/vendor/kreuzberg/src/language_detection/mod.rs +51 -94
  106. data/vendor/kreuzberg/src/lib.rs +5 -17
  107. data/vendor/kreuzberg/src/mcp/mod.rs +1 -4
  108. data/vendor/kreuzberg/src/mcp/server.rs +21 -145
  109. data/vendor/kreuzberg/src/ocr/mod.rs +0 -2
  110. data/vendor/kreuzberg/src/ocr/processor.rs +8 -19
  111. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +0 -2
  112. data/vendor/kreuzberg/src/pdf/error.rs +1 -93
  113. data/vendor/kreuzberg/src/pdf/metadata.rs +100 -263
  114. data/vendor/kreuzberg/src/pdf/mod.rs +2 -33
  115. data/vendor/kreuzberg/src/pdf/rendering.rs +12 -12
  116. data/vendor/kreuzberg/src/pdf/table.rs +64 -61
  117. data/vendor/kreuzberg/src/pdf/text.rs +24 -416
  118. data/vendor/kreuzberg/src/plugins/extractor.rs +8 -40
  119. data/vendor/kreuzberg/src/plugins/mod.rs +0 -3
  120. data/vendor/kreuzberg/src/plugins/ocr.rs +14 -22
  121. data/vendor/kreuzberg/src/plugins/processor.rs +1 -10
  122. data/vendor/kreuzberg/src/plugins/registry.rs +0 -15
  123. data/vendor/kreuzberg/src/plugins/validator.rs +8 -20
  124. data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
  125. data/vendor/kreuzberg/src/text/mod.rs +0 -8
  126. data/vendor/kreuzberg/src/text/quality.rs +15 -28
  127. data/vendor/kreuzberg/src/text/string_utils.rs +10 -22
  128. data/vendor/kreuzberg/src/text/token_reduction/core.rs +50 -86
  129. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +16 -37
  130. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +1 -2
  131. data/vendor/kreuzberg/src/types.rs +67 -907
  132. data/vendor/kreuzberg/src/utils/mod.rs +0 -14
  133. data/vendor/kreuzberg/src/utils/quality.rs +3 -12
  134. data/vendor/kreuzberg/tests/api_tests.rs +0 -506
  135. data/vendor/kreuzberg/tests/archive_integration.rs +0 -2
  136. data/vendor/kreuzberg/tests/batch_orchestration.rs +12 -57
  137. data/vendor/kreuzberg/tests/batch_processing.rs +8 -32
  138. data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
  139. data/vendor/kreuzberg/tests/concurrency_stress.rs +8 -40
  140. data/vendor/kreuzberg/tests/config_features.rs +1 -33
  141. data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -16
  142. data/vendor/kreuzberg/tests/core_integration.rs +9 -35
  143. data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
  144. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
  145. data/vendor/kreuzberg/tests/email_integration.rs +1 -3
  146. data/vendor/kreuzberg/tests/error_handling.rs +34 -43
  147. data/vendor/kreuzberg/tests/format_integration.rs +1 -7
  148. data/vendor/kreuzberg/tests/helpers/mod.rs +0 -60
  149. data/vendor/kreuzberg/tests/image_integration.rs +0 -2
  150. data/vendor/kreuzberg/tests/mime_detection.rs +16 -17
  151. data/vendor/kreuzberg/tests/ocr_configuration.rs +0 -4
  152. data/vendor/kreuzberg/tests/ocr_errors.rs +0 -22
  153. data/vendor/kreuzberg/tests/ocr_quality.rs +0 -2
  154. data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
  155. data/vendor/kreuzberg/tests/pdf_integration.rs +0 -2
  156. data/vendor/kreuzberg/tests/pipeline_integration.rs +2 -36
  157. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +0 -5
  158. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -17
  159. data/vendor/kreuzberg/tests/plugin_system.rs +0 -6
  160. data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -2
  161. data/vendor/kreuzberg/tests/security_validation.rs +1 -13
  162. data/vendor/kreuzberg/tests/test_fastembed.rs +23 -45
  163. metadata +25 -171
  164. data/.rubocop.yml +0 -543
  165. data/ext/kreuzberg_rb/native/.cargo/config.toml +0 -23
  166. data/ext/kreuzberg_rb/native/Cargo.lock +0 -7619
  167. data/lib/kreuzberg/error_context.rb +0 -136
  168. data/lib/kreuzberg/types.rb +0 -170
  169. data/lib/libpdfium.so +0 -0
  170. data/spec/binding/async_operations_spec.rb +0 -473
  171. data/spec/binding/batch_operations_spec.rb +0 -595
  172. data/spec/binding/batch_spec.rb +0 -359
  173. data/spec/binding/config_result_spec.rb +0 -377
  174. data/spec/binding/embeddings_spec.rb +0 -816
  175. data/spec/binding/error_recovery_spec.rb +0 -488
  176. data/spec/binding/font_config_spec.rb +0 -220
  177. data/spec/binding/images_spec.rb +0 -738
  178. data/spec/binding/keywords_extraction_spec.rb +0 -600
  179. data/spec/binding/metadata_types_spec.rb +0 -1228
  180. data/spec/binding/pages_extraction_spec.rb +0 -471
  181. data/spec/binding/tables_spec.rb +0 -641
  182. data/spec/unit/config/chunking_config_spec.rb +0 -213
  183. data/spec/unit/config/embedding_config_spec.rb +0 -343
  184. data/spec/unit/config/extraction_config_spec.rb +0 -438
  185. data/spec/unit/config/font_config_spec.rb +0 -285
  186. data/spec/unit/config/hierarchy_config_spec.rb +0 -314
  187. data/spec/unit/config/image_extraction_config_spec.rb +0 -209
  188. data/spec/unit/config/image_preprocessing_config_spec.rb +0 -249
  189. data/spec/unit/config/keyword_config_spec.rb +0 -229
  190. data/spec/unit/config/language_detection_config_spec.rb +0 -258
  191. data/spec/unit/config/ocr_config_spec.rb +0 -171
  192. data/spec/unit/config/page_config_spec.rb +0 -221
  193. data/spec/unit/config/pdf_config_spec.rb +0 -267
  194. data/spec/unit/config/postprocessor_config_spec.rb +0 -290
  195. data/spec/unit/config/tesseract_config_spec.rb +0 -181
  196. data/spec/unit/config/token_reduction_config_spec.rb +0 -251
  197. data/test/metadata_types_test.rb +0 -959
  198. data/vendor/Cargo.toml +0 -61
  199. data/vendor/kreuzberg/examples/bench_fixes.rs +0 -71
  200. data/vendor/kreuzberg/examples/test_pdfium_fork.rs +0 -62
  201. data/vendor/kreuzberg/src/chunking/processor.rs +0 -219
  202. data/vendor/kreuzberg/src/core/batch_optimizations.rs +0 -385
  203. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  204. data/vendor/kreuzberg/src/core/formats.rs +0 -235
  205. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  206. data/vendor/kreuzberg/src/extraction/capacity.rs +0 -263
  207. data/vendor/kreuzberg/src/extraction/markdown.rs +0 -216
  208. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -284
  209. data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -470
  210. data/vendor/kreuzberg/src/extractors/docbook.rs +0 -504
  211. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  212. data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -492
  213. data/vendor/kreuzberg/src/extractors/jats.rs +0 -1054
  214. data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -368
  215. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  216. data/vendor/kreuzberg/src/extractors/markdown.rs +0 -701
  217. data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
  218. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  219. data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -529
  220. data/vendor/kreuzberg/src/extractors/rst.rs +0 -577
  221. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  222. data/vendor/kreuzberg/src/extractors/security.rs +0 -484
  223. data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
  224. data/vendor/kreuzberg/src/extractors/typst.rs +0 -651
  225. data/vendor/kreuzberg/src/language_detection/processor.rs +0 -218
  226. data/vendor/kreuzberg/src/ocr/language_registry.rs +0 -520
  227. data/vendor/kreuzberg/src/panic_context.rs +0 -154
  228. data/vendor/kreuzberg/src/pdf/bindings.rs +0 -306
  229. data/vendor/kreuzberg/src/pdf/bundled.rs +0 -408
  230. data/vendor/kreuzberg/src/pdf/fonts.rs +0 -358
  231. data/vendor/kreuzberg/src/pdf/hierarchy.rs +0 -903
  232. data/vendor/kreuzberg/src/text/quality_processor.rs +0 -231
  233. data/vendor/kreuzberg/src/text/utf8_validation.rs +0 -193
  234. data/vendor/kreuzberg/src/utils/pool.rs +0 -503
  235. data/vendor/kreuzberg/src/utils/pool_sizing.rs +0 -364
  236. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -761
  237. data/vendor/kreuzberg/tests/api_embed.rs +0 -360
  238. data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
  239. data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +0 -471
  240. data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +0 -289
  241. data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +0 -154
  242. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
  243. data/vendor/kreuzberg/tests/config_integration_test.rs +0 -753
  244. data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +0 -294
  245. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -500
  246. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
  247. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
  248. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
  249. data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
  250. data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
  251. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
  252. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
  253. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
  254. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
  255. data/vendor/kreuzberg/tests/ocr_language_registry.rs +0 -191
  256. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -674
  257. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
  258. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
  259. data/vendor/kreuzberg/tests/page_markers.rs +0 -297
  260. data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +0 -301
  261. data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +0 -589
  262. data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +0 -301
  263. data/vendor/kreuzberg/tests/pdf_text_merging.rs +0 -475
  264. data/vendor/kreuzberg/tests/pdfium_linking.rs +0 -340
  265. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -694
  266. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -775
  267. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1260
  268. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -648
  269. data/vendor/kreuzberg-ffi/Cargo.toml +0 -67
  270. data/vendor/kreuzberg-ffi/README.md +0 -851
  271. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +0 -227
  272. data/vendor/kreuzberg-ffi/build.rs +0 -168
  273. data/vendor/kreuzberg-ffi/cbindgen.toml +0 -37
  274. data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +0 -12
  275. data/vendor/kreuzberg-ffi/kreuzberg.h +0 -3012
  276. data/vendor/kreuzberg-ffi/src/batch_streaming.rs +0 -588
  277. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
  278. data/vendor/kreuzberg-ffi/src/error.rs +0 -901
  279. data/vendor/kreuzberg-ffi/src/extraction.rs +0 -555
  280. data/vendor/kreuzberg-ffi/src/helpers.rs +0 -879
  281. data/vendor/kreuzberg-ffi/src/lib.rs +0 -977
  282. data/vendor/kreuzberg-ffi/src/memory.rs +0 -493
  283. data/vendor/kreuzberg-ffi/src/mime.rs +0 -329
  284. data/vendor/kreuzberg-ffi/src/panic_shield.rs +0 -265
  285. data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +0 -442
  286. data/vendor/kreuzberg-ffi/src/plugins/mod.rs +0 -14
  287. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +0 -628
  288. data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +0 -438
  289. data/vendor/kreuzberg-ffi/src/plugins/validator.rs +0 -329
  290. data/vendor/kreuzberg-ffi/src/result.rs +0 -510
  291. data/vendor/kreuzberg-ffi/src/result_pool.rs +0 -639
  292. data/vendor/kreuzberg-ffi/src/result_view.rs +0 -773
  293. data/vendor/kreuzberg-ffi/src/string_intern.rs +0 -568
  294. data/vendor/kreuzberg-ffi/src/types.rs +0 -363
  295. data/vendor/kreuzberg-ffi/src/util.rs +0 -210
  296. data/vendor/kreuzberg-ffi/src/validation.rs +0 -848
  297. data/vendor/kreuzberg-ffi/tests.disabled/README.md +0 -48
  298. data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +0 -299
  299. data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +0 -346
  300. data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +0 -232
  301. data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +0 -470
  302. data/vendor/kreuzberg-tesseract/.commitlintrc.json +0 -13
  303. data/vendor/kreuzberg-tesseract/.crate-ignore +0 -2
  304. data/vendor/kreuzberg-tesseract/Cargo.lock +0 -2933
  305. data/vendor/kreuzberg-tesseract/Cargo.toml +0 -57
  306. data/vendor/kreuzberg-tesseract/LICENSE +0 -22
  307. data/vendor/kreuzberg-tesseract/README.md +0 -399
  308. data/vendor/kreuzberg-tesseract/build.rs +0 -1127
  309. data/vendor/kreuzberg-tesseract/patches/README.md +0 -71
  310. data/vendor/kreuzberg-tesseract/patches/tesseract.diff +0 -199
  311. data/vendor/kreuzberg-tesseract/src/api.rs +0 -1371
  312. data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +0 -77
  313. data/vendor/kreuzberg-tesseract/src/enums.rs +0 -297
  314. data/vendor/kreuzberg-tesseract/src/error.rs +0 -81
  315. data/vendor/kreuzberg-tesseract/src/lib.rs +0 -145
  316. data/vendor/kreuzberg-tesseract/src/monitor.rs +0 -57
  317. data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +0 -197
  318. data/vendor/kreuzberg-tesseract/src/page_iterator.rs +0 -253
  319. data/vendor/kreuzberg-tesseract/src/result_iterator.rs +0 -286
  320. data/vendor/kreuzberg-tesseract/src/result_renderer.rs +0 -183
  321. data/vendor/kreuzberg-tesseract/tests/integration_test.rs +0 -211
@@ -124,6 +124,7 @@ ocr:
124
124
  fn test_from_file_nonexistent_path_fails() {
125
125
  let result = ExtractionConfig::from_file("/nonexistent/path/config.toml");
126
126
  assert!(result.is_err(), "Should fail for nonexistent path: {:?}", result);
127
+ // Error can be Io or other types depending on the implementation
127
128
  }
128
129
 
129
130
  /// Test from_file with malformed TOML fails.
@@ -141,6 +142,7 @@ enabled = true
141
142
 
142
143
  let result = ExtractionConfig::from_file(&config_path);
143
144
  assert!(result.is_err(), "Should fail for malformed TOML: {:?}", result);
145
+ // Error handling varies - just ensure it failed
144
146
  }
145
147
 
146
148
  /// Test from_file with malformed JSON fails.
@@ -162,6 +164,7 @@ fn test_from_file_malformed_json_fails() {
162
164
 
163
165
  let result = ExtractionConfig::from_file(&config_path);
164
166
  assert!(result.is_err(), "Should fail for malformed JSON: {:?}", result);
167
+ // Error handling varies - just ensure it failed
165
168
  }
166
169
 
167
170
  /// Test from_file with malformed YAML fails.
@@ -180,6 +183,7 @@ ocr:
180
183
 
181
184
  let result = ExtractionConfig::from_file(&config_path);
182
185
  assert!(result.is_err(), "Should fail for malformed YAML: {:?}", result);
186
+ // Error handling varies - just ensure it failed
183
187
  }
184
188
 
185
189
  /// Test from_file with empty file uses defaults.
@@ -194,6 +198,7 @@ fn test_from_file_empty_file_uses_defaults() {
194
198
  assert!(config.is_ok(), "Should load empty file successfully");
195
199
 
196
200
  let config = config.unwrap();
201
+ // Should have default values
197
202
  assert!(config.ocr.is_none(), "Default config should have no OCR");
198
203
  assert!(config.chunking.is_none(), "Default config should have no chunking");
199
204
  }
@@ -209,18 +214,22 @@ fn test_from_file_unsupported_extension_fails() {
209
214
  let result = ExtractionConfig::from_file(&config_path);
210
215
  assert!(result.is_err(), "Should fail for unsupported extension: {:?}", result);
211
216
 
212
- if let Err(KreuzbergError::Validation { message, .. }) = result {
213
- assert!(
214
- message.contains("format") || message.contains("extension") || message.contains("Unsupported"),
215
- "Error should mention format/extension: {}",
216
- message
217
- );
217
+ match result {
218
+ Err(KreuzbergError::Validation { message, .. }) => {
219
+ assert!(
220
+ message.contains("format") || message.contains("extension") || message.contains("Unsupported"),
221
+ "Error should mention format/extension: {}",
222
+ message
223
+ );
224
+ }
225
+ _ => {
226
+ // Some other error is also acceptable
227
+ }
218
228
  }
219
229
  }
220
230
 
221
231
  /// Test discover() finds config in current directory.
222
232
  #[test]
223
- #[serial_test::serial]
224
233
  fn test_discover_finds_config_in_current_dir() {
225
234
  let temp_dir = TempDir::new().unwrap();
226
235
  let config_path = temp_dir.path().join("kreuzberg.toml");
@@ -232,11 +241,13 @@ enabled = true
232
241
 
233
242
  fs::write(&config_path, toml_content).unwrap();
234
243
 
244
+ // Change to temp directory
235
245
  let original_dir = std::env::current_dir().unwrap();
236
246
  std::env::set_current_dir(temp_dir.path()).unwrap();
237
247
 
238
248
  let result = ExtractionConfig::discover();
239
249
 
250
+ // Restore original directory
240
251
  std::env::set_current_dir(original_dir).unwrap();
241
252
 
242
253
  assert!(result.is_ok(), "Discover should succeed");
@@ -247,7 +258,6 @@ enabled = true
247
258
 
248
259
  /// Test discover() finds config in parent directory.
249
260
  #[test]
250
- #[serial_test::serial]
251
261
  fn test_discover_finds_config_in_parent_dir() {
252
262
  let temp_dir = TempDir::new().unwrap();
253
263
  let config_path = temp_dir.path().join("kreuzberg.toml");
@@ -259,14 +269,17 @@ enabled = true
259
269
 
260
270
  fs::write(&config_path, toml_content).unwrap();
261
271
 
272
+ // Create subdirectory
262
273
  let sub_dir = temp_dir.path().join("subdir");
263
274
  fs::create_dir(&sub_dir).unwrap();
264
275
 
276
+ // Change to subdirectory
265
277
  let original_dir = std::env::current_dir().unwrap();
266
278
  std::env::set_current_dir(&sub_dir).unwrap();
267
279
 
268
280
  let result = ExtractionConfig::discover();
269
281
 
282
+ // Restore original directory
270
283
  std::env::set_current_dir(original_dir).unwrap();
271
284
 
272
285
  assert!(result.is_ok(), "Discover should succeed");
@@ -277,39 +290,44 @@ enabled = true
277
290
 
278
291
  /// Test discover() returns None when no config found.
279
292
  #[test]
280
- #[serial_test::serial]
281
293
  fn test_discover_returns_none_when_not_found() {
282
294
  let temp_dir = TempDir::new().unwrap();
283
295
  let sub_dir = temp_dir.path().join("subdir");
284
296
  fs::create_dir(&sub_dir).unwrap();
285
297
 
298
+ // Change to subdirectory (no config files)
286
299
  let original_dir = std::env::current_dir().unwrap();
287
300
  std::env::set_current_dir(&sub_dir).unwrap();
288
301
 
289
302
  let result = ExtractionConfig::discover();
290
303
 
304
+ // Restore original directory
291
305
  std::env::set_current_dir(original_dir).unwrap();
292
306
 
293
307
  assert!(result.is_ok(), "Discover should succeed even when no config found");
294
308
  let _config = result.unwrap();
309
+ // May return None or may find a config in parent directories (e.g., repository root)
310
+ // Just verify it doesn't error - the specific behavior depends on the directory structure
295
311
  }
296
312
 
297
313
  /// Test discover() prefers certain file names.
298
314
  #[test]
299
- #[serial_test::serial]
300
315
  fn test_discover_file_name_preference() {
301
316
  let temp_dir = TempDir::new().unwrap();
302
317
 
318
+ // Create multiple config files
303
319
  fs::write(temp_dir.path().join("kreuzberg.toml"), "[ocr]\nenabled = true").unwrap();
304
320
  fs::write(temp_dir.path().join(".kreuzberg.toml"), "[ocr]\nenabled = false").unwrap();
305
321
 
306
322
  let original_dir = std::env::current_dir().unwrap();
307
323
  if std::env::set_current_dir(temp_dir.path()).is_err() {
324
+ // Skip this test if we can't change directory
308
325
  return;
309
326
  }
310
327
 
311
328
  let result = ExtractionConfig::discover();
312
329
 
330
+ // Always restore directory even if test fails
313
331
  let _ = std::env::set_current_dir(original_dir);
314
332
 
315
333
  assert!(result.is_ok(), "Discover should succeed");
@@ -319,7 +337,6 @@ fn test_discover_file_name_preference() {
319
337
 
320
338
  /// Test discover() with nested directories.
321
339
  #[test]
322
- #[serial_test::serial]
323
340
  fn test_discover_with_nested_directories() {
324
341
  let temp_dir = TempDir::new().unwrap();
325
342
  let config_path = temp_dir.path().join("kreuzberg.toml");
@@ -331,18 +348,22 @@ enabled = true
331
348
 
332
349
  fs::write(&config_path, toml_content).unwrap();
333
350
 
351
+ // Create nested subdirectories
334
352
  let level1 = temp_dir.path().join("level1");
335
353
  let level2 = level1.join("level2");
336
354
  let level3 = level2.join("level3");
337
355
  fs::create_dir_all(&level3).unwrap();
338
356
 
357
+ // Change to deepest directory
339
358
  let original_dir = std::env::current_dir().unwrap();
340
359
  if std::env::set_current_dir(&level3).is_err() {
360
+ // Skip this test if we can't change directory
341
361
  return;
342
362
  }
343
363
 
344
364
  let result = ExtractionConfig::discover();
345
365
 
366
+ // Always restore directory even if test fails
346
367
  let _ = std::env::set_current_dir(&original_dir);
347
368
 
348
369
  assert!(result.is_ok(), "Discover should succeed");
@@ -389,7 +410,6 @@ extract_images = true
389
410
  "Should have language detection config"
390
411
  );
391
412
  assert!(config.images.is_some(), "Should have image extraction config");
392
- #[cfg(feature = "pdf")]
393
413
  assert!(config.pdf_options.is_some(), "Should have PDF config");
394
414
  }
395
415
 
@@ -399,6 +419,7 @@ fn test_from_file_with_invalid_values() {
399
419
  let temp_dir = TempDir::new().unwrap();
400
420
  let config_path = temp_dir.path().join("config.toml");
401
421
 
422
+ // Negative values should be rejected during deserialization or validation
402
423
  let toml_content = r#"
403
424
  [chunking]
404
425
  max_chars = -1000
@@ -408,9 +429,11 @@ max_overlap = -100
408
429
  fs::write(&config_path, toml_content).unwrap();
409
430
 
410
431
  let result = ExtractionConfig::from_file(&config_path);
411
- if let Ok(config) = result
412
- && let Some(chunking) = config.chunking
413
- {
414
- assert!(chunking.max_chars > 0, "max_chars should be positive");
432
+ // Should either fail parsing or have clamped values
433
+ if let Ok(config) = result {
434
+ // If it succeeds, values should be reasonable
435
+ if let Some(chunking) = config.chunking {
436
+ assert!(chunking.max_chars > 0, "max_chars should be positive");
437
+ }
415
438
  }
416
439
  }
@@ -11,18 +11,6 @@ use std::fs::{self, File};
11
11
  use std::io::Write;
12
12
  use tempfile::tempdir;
13
13
 
14
- fn trim_trailing_newlines(value: &str) -> &str {
15
- value.trim_end_matches(['\n', '\r'])
16
- }
17
-
18
- fn assert_text_content(actual: &str, expected: &str) {
19
- assert_eq!(
20
- trim_trailing_newlines(actual),
21
- expected,
22
- "Content mismatch after trimming trailing newlines"
23
- );
24
- }
25
-
26
14
  /// Test basic file extraction with MIME detection.
27
15
  #[tokio::test]
28
16
  async fn test_extract_file_basic() {
@@ -37,7 +25,7 @@ async fn test_extract_file_basic() {
37
25
  assert!(result.is_ok(), "Basic file extraction should succeed");
38
26
  let result = result.unwrap();
39
27
 
40
- assert_text_content(&result.content, "Hello, Kreuzberg!");
28
+ assert_eq!(result.content, "Hello, Kreuzberg!");
41
29
  assert_eq!(result.mime_type, "text/plain");
42
30
  assert!(result.chunks.is_none(), "Chunks should be None without chunking config");
43
31
  assert!(result.detected_languages.is_none(), "Language detection not enabled");
@@ -185,12 +173,7 @@ async fn test_batch_extract_bytes_concurrency() {
185
173
  (b"content 5".as_slice(), "text/plain"),
186
174
  ];
187
175
 
188
- let owned_contents: Vec<(Vec<u8>, String)> = contents
189
- .into_iter()
190
- .map(|(bytes, mime)| (bytes.to_vec(), mime.to_string()))
191
- .collect();
192
-
193
- let results = batch_extract_bytes(owned_contents, &config).await;
176
+ let results = batch_extract_bytes(contents, &config).await;
194
177
  assert!(results.is_ok());
195
178
 
196
179
  let results = results.unwrap();
@@ -198,12 +181,7 @@ async fn test_batch_extract_bytes_concurrency() {
198
181
 
199
182
  for (i, result) in results.iter().enumerate() {
200
183
  let expected_content = format!("content {}", i + 1);
201
- assert_eq!(
202
- trim_trailing_newlines(&result.content),
203
- expected_content,
204
- "Content mismatch for item {}",
205
- i
206
- );
184
+ assert_eq!(result.content, expected_content, "Content mismatch for item {}", i);
207
185
  assert_eq!(result.mime_type, "text/plain", "MIME type should be text/plain");
208
186
  assert!(result.chunks.is_none(), "Chunks should be None without chunking config");
209
187
  assert!(result.detected_languages.is_none(), "Language detection not enabled");
@@ -223,13 +201,13 @@ fn test_sync_wrappers() {
223
201
  let result = extract_file_sync(&file_path, None, &config);
224
202
  assert!(result.is_ok(), "Sync file extraction should succeed");
225
203
  let extraction = result.unwrap();
226
- assert_text_content(&extraction.content, "sync content");
204
+ assert_eq!(extraction.content, "sync content");
227
205
  assert!(extraction.chunks.is_none(), "Chunks should be None");
228
206
 
229
207
  let result = extract_bytes_sync(b"test bytes", "text/plain", &config);
230
208
  assert!(result.is_ok(), "Sync bytes extraction should succeed");
231
209
  let extraction = result.unwrap();
232
- assert_text_content(&extraction.content, "test bytes");
210
+ assert_eq!(extraction.content, "test bytes");
233
211
  assert!(extraction.chunks.is_none(), "Chunks should be None");
234
212
 
235
213
  let paths = vec![file_path];
@@ -237,19 +215,15 @@ fn test_sync_wrappers() {
237
215
  assert!(results.is_ok(), "Batch sync file should succeed");
238
216
  let results = results.unwrap();
239
217
  assert_eq!(results.len(), 1);
240
- assert_text_content(&results[0].content, "sync content");
218
+ assert_eq!(results[0].content, "sync content");
241
219
  assert!(results[0].chunks.is_none(), "Chunks should be None");
242
220
 
243
221
  let contents = vec![(b"test".as_slice(), "text/plain")];
244
- let owned_contents: Vec<(Vec<u8>, String)> = contents
245
- .into_iter()
246
- .map(|(bytes, mime)| (bytes.to_vec(), mime.to_string()))
247
- .collect();
248
- let results = batch_extract_bytes_sync(owned_contents, &config);
222
+ let results = batch_extract_bytes_sync(contents, &config);
249
223
  assert!(results.is_ok(), "Batch bytes sync should succeed");
250
224
  let results = results.unwrap();
251
225
  assert_eq!(results.len(), 1);
252
- assert_text_content(&results[0].content, "test");
226
+ assert_eq!(results[0].content, "test");
253
227
  assert!(results[0].chunks.is_none(), "Chunks should be None");
254
228
  }
255
229
 
@@ -441,7 +415,7 @@ async fn test_pipeline_execution() {
441
415
  assert!(result.is_ok(), "Pipeline execution should succeed");
442
416
 
443
417
  let result = result.unwrap();
444
- assert_text_content(&result.content, "pipeline content");
418
+ assert_eq!(result.content, "pipeline content");
445
419
  assert_eq!(result.mime_type, "text/plain");
446
420
  assert!(result.chunks.is_none(), "Chunks should be None without chunking config");
447
421
  assert!(result.detected_languages.is_none(), "Language detection not enabled");
@@ -1,6 +1,6 @@
1
1
  //! CSV and spreadsheet integration tests.
2
2
  //!
3
- //! Tests for CSV and TSV extraction.
3
+ //! Tests for CSV and TSV extraction via Pandoc.
4
4
  //! Validates data extraction, custom delimiters, quoted fields, and edge cases.
5
5
 
6
6
  use kreuzberg::core::config::ExtractionConfig;
@@ -15,13 +15,14 @@ async fn test_csv_basic_extraction() {
15
15
 
16
16
  let csv_content = b"Name,Age,City\nAlice,30,NYC\nBob,25,LA";
17
17
 
18
- let extraction = match extract_bytes(csv_content, "text/csv", &config).await {
19
- Ok(result) => result,
20
- Err(_) => {
21
- println!("Skipping test: CSV extraction not available");
22
- return;
23
- }
24
- };
18
+ let result = extract_bytes(csv_content, "text/csv", &config).await;
19
+
20
+ if result.is_err() {
21
+ println!("Skipping test: Pandoc may not be installed");
22
+ return;
23
+ }
24
+
25
+ let extraction = result.unwrap();
25
26
 
26
27
  assert_eq!(extraction.mime_type, "text/csv");
27
28
  assert!(
@@ -54,13 +55,14 @@ async fn test_csv_with_headers() {
54
55
 
55
56
  let csv_content = b"Product,Price,Quantity\nApple,1.50,100\nBanana,0.75,200\nOrange,2.00,150";
56
57
 
57
- let extraction = match extract_bytes(csv_content, "text/csv", &config).await {
58
- Ok(result) => result,
59
- Err(_) => {
60
- println!("Skipping test: CSV extraction not available");
61
- return;
62
- }
63
- };
58
+ let result = extract_bytes(csv_content, "text/csv", &config).await;
59
+
60
+ if result.is_err() {
61
+ println!("Skipping test: Pandoc may not be installed");
62
+ return;
63
+ }
64
+
65
+ let extraction = result.unwrap();
64
66
 
65
67
  assert!(
66
68
  extraction.chunks.is_none(),
@@ -103,13 +105,14 @@ async fn test_csv_custom_delimiter() {
103
105
 
104
106
  let csv_content = b"Name;Age;City\nAlice;30;NYC\nBob;25;LA";
105
107
 
106
- let extraction = match extract_bytes(csv_content, "text/csv", &config).await {
107
- Ok(result) => result,
108
- Err(_) => {
109
- println!("Skipping test: CSV extraction not available");
110
- return;
111
- }
112
- };
108
+ let result = extract_bytes(csv_content, "text/csv", &config).await;
109
+
110
+ if result.is_err() {
111
+ println!("Skipping test: Pandoc may not be installed");
112
+ return;
113
+ }
114
+
115
+ let extraction = result.unwrap();
113
116
 
114
117
  assert!(
115
118
  extraction.chunks.is_none(),
@@ -135,13 +138,14 @@ async fn test_tsv_file() {
135
138
 
136
139
  let tsv_content = b"Name\tAge\tCity\nAlice\t30\tNYC\nBob\t25\tLA";
137
140
 
138
- let extraction = match extract_bytes(tsv_content, "text/tab-separated-values", &config).await {
139
- Ok(result) => result,
140
- Err(_) => {
141
- println!("Skipping test: TSV extraction not available");
142
- return;
143
- }
144
- };
141
+ let result = extract_bytes(tsv_content, "text/tab-separated-values", &config).await;
142
+
143
+ if result.is_err() {
144
+ println!("Skipping test: Pandoc may not be installed");
145
+ return;
146
+ }
147
+
148
+ let extraction = result.unwrap();
145
149
 
146
150
  assert_eq!(extraction.mime_type, "text/tab-separated-values");
147
151
  assert!(
@@ -171,13 +175,14 @@ async fn test_csv_quoted_fields() {
171
175
  let csv_content =
172
176
  b"Name,Description,Price\n\"Smith, John\",\"Product A, premium\",100\n\"Doe, Jane\",\"Product B, standard\",50";
173
177
 
174
- let extraction = match extract_bytes(csv_content, "text/csv", &config).await {
175
- Ok(result) => result,
176
- Err(_) => {
177
- println!("Skipping test: CSV extraction not available");
178
- return;
179
- }
180
- };
178
+ let result = extract_bytes(csv_content, "text/csv", &config).await;
179
+
180
+ if result.is_err() {
181
+ println!("Skipping test: Pandoc may not be installed");
182
+ return;
183
+ }
184
+
185
+ let extraction = result.unwrap();
181
186
 
182
187
  assert!(
183
188
  extraction.chunks.is_none(),
@@ -207,13 +212,14 @@ async fn test_csv_special_characters() {
207
212
 
208
213
  let csv_content = "Name,City,Emoji\nAlice,Tokyo 東京,🎉\nBob,París,✅\nCarlos,Москва,🌍".as_bytes();
209
214
 
210
- let extraction = match extract_bytes(csv_content, "text/csv", &config).await {
211
- Ok(result) => result,
212
- Err(_) => {
213
- println!("Skipping test: CSV extraction not available");
214
- return;
215
- }
216
- };
215
+ let result = extract_bytes(csv_content, "text/csv", &config).await;
216
+
217
+ if result.is_err() {
218
+ println!("Skipping test: Pandoc may not be installed");
219
+ return;
220
+ }
221
+
222
+ let extraction = result.unwrap();
217
223
 
218
224
  assert!(
219
225
  extraction.chunks.is_none(),
@@ -245,13 +251,14 @@ async fn test_csv_large_file() {
245
251
  csv_content.push_str(&format!("{},Item{},{}.00\n", i, i, i * 10));
246
252
  }
247
253
 
248
- let extraction = match extract_bytes(csv_content.as_bytes(), "text/csv", &config).await {
249
- Ok(result) => result,
250
- Err(_) => {
251
- println!("Skipping test: CSV extraction not available");
252
- return;
253
- }
254
- };
254
+ let result = extract_bytes(csv_content.as_bytes(), "text/csv", &config).await;
255
+
256
+ if result.is_err() {
257
+ println!("Skipping test: Pandoc may not be installed");
258
+ return;
259
+ }
260
+
261
+ let extraction = result.unwrap();
255
262
 
256
263
  assert!(
257
264
  extraction.chunks.is_none(),
@@ -315,13 +322,14 @@ async fn test_csv_headers_only() {
315
322
 
316
323
  let csv_content = b"Name,Age,City";
317
324
 
318
- let extraction = match extract_bytes(csv_content, "text/csv", &config).await {
319
- Ok(result) => result,
320
- Err(_) => {
321
- println!("Skipping test: CSV extraction not available");
322
- return;
323
- }
324
- };
325
+ let result = extract_bytes(csv_content, "text/csv", &config).await;
326
+
327
+ if result.is_err() {
328
+ println!("Skipping test: Pandoc may not be installed");
329
+ return;
330
+ }
331
+
332
+ let extraction = result.unwrap();
325
333
 
326
334
  assert!(
327
335
  extraction.chunks.is_none(),
@@ -346,13 +354,14 @@ async fn test_csv_blank_lines() {
346
354
 
347
355
  let csv_content = b"Name,Age\nAlice,30\n\nBob,25\n\nCarlos,35";
348
356
 
349
- let extraction = match extract_bytes(csv_content, "text/csv", &config).await {
350
- Ok(result) => result,
351
- Err(_) => {
352
- println!("Skipping test: CSV extraction not available");
353
- return;
354
- }
355
- };
357
+ let result = extract_bytes(csv_content, "text/csv", &config).await;
358
+
359
+ if result.is_err() {
360
+ println!("Skipping test: Pandoc may not be installed");
361
+ return;
362
+ }
363
+
364
+ let extraction = result.unwrap();
356
365
 
357
366
  assert!(
358
367
  extraction.chunks.is_none(),
@@ -374,13 +383,14 @@ async fn test_csv_numeric_data() {
374
383
 
375
384
  let csv_content = b"ID,Price,Quantity,Discount\n1,19.99,100,0.15\n2,29.99,50,0.20\n3,9.99,200,0.10";
376
385
 
377
- let extraction = match extract_bytes(csv_content, "text/csv", &config).await {
378
- Ok(result) => result,
379
- Err(_) => {
380
- println!("Skipping test: CSV extraction not available");
381
- return;
382
- }
383
- };
386
+ let result = extract_bytes(csv_content, "text/csv", &config).await;
387
+
388
+ if result.is_err() {
389
+ println!("Skipping test: Pandoc may not be installed");
390
+ return;
391
+ }
392
+
393
+ let extraction = result.unwrap();
384
394
 
385
395
  assert!(
386
396
  extraction.chunks.is_none(),
@@ -2,10 +2,15 @@
2
2
 
3
3
  #![cfg(feature = "office")]
4
4
 
5
- use kreuzberg::{ExtractionConfig, extract_file};
5
+ use kreuzberg::extraction::pandoc::extract_file;
6
6
 
7
7
  #[tokio::test]
8
8
  async fn test_docx_full_metadata_extraction() {
9
+ if kreuzberg::extraction::pandoc::validate_pandoc_version().await.is_err() {
10
+ println!("Skipping test: Pandoc not available");
11
+ return;
12
+ }
13
+
9
14
  let workspace_root = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
10
15
  .parent()
11
16
  .unwrap()
@@ -18,7 +23,7 @@ async fn test_docx_full_metadata_extraction() {
18
23
  return;
19
24
  }
20
25
 
21
- let result = extract_file(&test_file, None, &ExtractionConfig::default())
26
+ let result = extract_file(&test_file, "docx")
22
27
  .await
23
28
  .expect("Should extract DOCX successfully");
24
29
 
@@ -29,66 +34,63 @@ async fn test_docx_full_metadata_extraction() {
29
34
  );
30
35
 
31
36
  assert_eq!(
32
- result.metadata.additional.get("created_by").and_then(|v| v.as_str()),
37
+ result.metadata.get("created_by").and_then(|v| v.as_str()),
33
38
  Some("Christoph Auer"),
34
39
  "Should have correct creator"
35
40
  );
36
41
  assert_eq!(
37
- result.metadata.additional.get("modified_by").and_then(|v| v.as_str()),
42
+ result.metadata.get("modified_by").and_then(|v| v.as_str()),
38
43
  Some("Maxim Lysak"),
39
44
  "Should have correct last modified by"
40
45
  );
41
46
  assert_eq!(
42
- result.metadata.additional.get("created_at").and_then(|v| v.as_str()),
47
+ result.metadata.get("created_at").and_then(|v| v.as_str()),
43
48
  Some("2024-10-09T12:43:00Z"),
44
49
  "Should have correct creation date"
45
50
  );
46
51
  assert_eq!(
47
- result.metadata.additional.get("revision").and_then(|v| v.as_str()),
52
+ result.metadata.get("revision").and_then(|v| v.as_str()),
48
53
  Some("7"),
49
54
  "Should have revision number"
50
55
  );
51
56
 
52
57
  assert_eq!(
53
- result.metadata.additional.get("page_count").and_then(|v| v.as_i64()),
58
+ result.metadata.get("page_count").and_then(|v| v.as_i64()),
54
59
  Some(2),
55
60
  "Should have 2 pages"
56
61
  );
57
62
  assert_eq!(
58
- result.metadata.additional.get("word_count").and_then(|v| v.as_i64()),
63
+ result.metadata.get("word_count").and_then(|v| v.as_i64()),
59
64
  Some(108),
60
65
  "Should have 108 words"
61
66
  );
62
67
  assert_eq!(
63
- result
64
- .metadata
65
- .additional
66
- .get("character_count")
67
- .and_then(|v| v.as_i64()),
68
+ result.metadata.get("character_count").and_then(|v| v.as_i64()),
68
69
  Some(620),
69
70
  "Should have 620 characters"
70
71
  );
71
72
  assert_eq!(
72
- result.metadata.additional.get("line_count").and_then(|v| v.as_i64()),
73
+ result.metadata.get("line_count").and_then(|v| v.as_i64()),
73
74
  Some(5),
74
75
  "Should have 5 lines"
75
76
  );
76
77
  assert_eq!(
77
- result
78
- .metadata
79
- .additional
80
- .get("paragraph_count")
81
- .and_then(|v| v.as_i64()),
78
+ result.metadata.get("paragraph_count").and_then(|v| v.as_i64()),
82
79
  Some(1),
83
80
  "Should have 1 paragraph"
84
81
  );
85
82
 
86
83
  println!("✅ DOCX metadata extraction test passed!");
87
- println!(" Found {} metadata fields", result.metadata.additional.len());
84
+ println!(" Found {} metadata fields", result.metadata.len());
88
85
  }
89
86
 
90
87
  #[tokio::test]
91
88
  async fn test_docx_minimal_metadata_extraction() {
89
+ if kreuzberg::extraction::pandoc::validate_pandoc_version().await.is_err() {
90
+ println!("Skipping test: Pandoc not available");
91
+ return;
92
+ }
93
+
92
94
  let workspace_root = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
93
95
  .parent()
94
96
  .unwrap()
@@ -101,19 +103,19 @@ async fn test_docx_minimal_metadata_extraction() {
101
103
  return;
102
104
  }
103
105
 
104
- let result = extract_file(&test_file, None, &ExtractionConfig::default())
106
+ let result = extract_file(&test_file, "docx")
105
107
  .await
106
108
  .expect("Should extract DOCX successfully");
107
109
 
108
110
  assert!(!result.content.is_empty(), "Content should not be empty");
109
111
 
110
112
  assert_eq!(
111
- result.metadata.additional.get("page_count").and_then(|v| v.as_i64()),
113
+ result.metadata.get("page_count").and_then(|v| v.as_i64()),
112
114
  Some(1),
113
115
  "Should have 1 page"
114
116
  );
115
117
  assert_eq!(
116
- result.metadata.additional.get("word_count").and_then(|v| v.as_i64()),
118
+ result.metadata.get("word_count").and_then(|v| v.as_i64()),
117
119
  Some(520),
118
120
  "Should have 520 words"
119
121
  );