kreuzberg 4.0.0.pre.rc.29 → 4.0.0.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (321) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +0 -6
  3. data/.rubocop.yaml +534 -1
  4. data/Gemfile +2 -1
  5. data/Gemfile.lock +28 -116
  6. data/README.md +269 -629
  7. data/Rakefile +0 -9
  8. data/Steepfile +4 -8
  9. data/examples/async_patterns.rb +58 -1
  10. data/ext/kreuzberg_rb/extconf.rb +5 -35
  11. data/ext/kreuzberg_rb/native/Cargo.toml +16 -55
  12. data/ext/kreuzberg_rb/native/build.rs +14 -12
  13. data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
  14. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
  15. data/ext/kreuzberg_rb/native/include/strings.h +2 -2
  16. data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
  17. data/ext/kreuzberg_rb/native/src/lib.rs +34 -897
  18. data/extconf.rb +6 -38
  19. data/kreuzberg.gemspec +20 -114
  20. data/lib/kreuzberg/api_proxy.rb +18 -2
  21. data/lib/kreuzberg/cache_api.rb +0 -22
  22. data/lib/kreuzberg/cli.rb +10 -2
  23. data/lib/kreuzberg/cli_proxy.rb +10 -0
  24. data/lib/kreuzberg/config.rb +22 -274
  25. data/lib/kreuzberg/errors.rb +7 -73
  26. data/lib/kreuzberg/extraction_api.rb +8 -237
  27. data/lib/kreuzberg/mcp_proxy.rb +11 -2
  28. data/lib/kreuzberg/ocr_backend_protocol.rb +73 -0
  29. data/lib/kreuzberg/post_processor_protocol.rb +71 -0
  30. data/lib/kreuzberg/result.rb +33 -151
  31. data/lib/kreuzberg/setup_lib_path.rb +2 -22
  32. data/lib/kreuzberg/validator_protocol.rb +73 -0
  33. data/lib/kreuzberg/version.rb +1 -1
  34. data/lib/kreuzberg.rb +13 -27
  35. data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
  36. data/sig/kreuzberg.rbs +12 -105
  37. data/spec/binding/cache_spec.rb +22 -22
  38. data/spec/binding/cli_proxy_spec.rb +4 -2
  39. data/spec/binding/cli_spec.rb +11 -12
  40. data/spec/binding/config_spec.rb +0 -74
  41. data/spec/binding/config_validation_spec.rb +6 -100
  42. data/spec/binding/error_handling_spec.rb +97 -283
  43. data/spec/binding/plugins/ocr_backend_spec.rb +8 -8
  44. data/spec/binding/plugins/postprocessor_spec.rb +11 -11
  45. data/spec/binding/plugins/validator_spec.rb +13 -12
  46. data/spec/examples.txt +104 -0
  47. data/spec/fixtures/config.toml +1 -0
  48. data/spec/fixtures/config.yaml +1 -0
  49. data/spec/fixtures/invalid_config.toml +1 -0
  50. data/spec/smoke/package_spec.rb +3 -2
  51. data/spec/spec_helper.rb +3 -1
  52. data/vendor/kreuzberg/Cargo.toml +67 -192
  53. data/vendor/kreuzberg/README.md +9 -97
  54. data/vendor/kreuzberg/build.rs +194 -516
  55. data/vendor/kreuzberg/src/api/handlers.rs +9 -130
  56. data/vendor/kreuzberg/src/api/mod.rs +3 -18
  57. data/vendor/kreuzberg/src/api/server.rs +71 -236
  58. data/vendor/kreuzberg/src/api/types.rs +7 -43
  59. data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
  60. data/vendor/kreuzberg/src/cache/mod.rs +3 -27
  61. data/vendor/kreuzberg/src/chunking/mod.rs +79 -1705
  62. data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
  63. data/vendor/kreuzberg/src/core/config.rs +23 -905
  64. data/vendor/kreuzberg/src/core/extractor.rs +106 -403
  65. data/vendor/kreuzberg/src/core/io.rs +2 -4
  66. data/vendor/kreuzberg/src/core/mime.rs +12 -2
  67. data/vendor/kreuzberg/src/core/mod.rs +3 -22
  68. data/vendor/kreuzberg/src/core/pipeline.rs +78 -395
  69. data/vendor/kreuzberg/src/embeddings.rs +21 -169
  70. data/vendor/kreuzberg/src/error.rs +2 -2
  71. data/vendor/kreuzberg/src/extraction/archive.rs +31 -36
  72. data/vendor/kreuzberg/src/extraction/docx.rs +1 -365
  73. data/vendor/kreuzberg/src/extraction/email.rs +11 -12
  74. data/vendor/kreuzberg/src/extraction/excel.rs +129 -138
  75. data/vendor/kreuzberg/src/extraction/html.rs +170 -1447
  76. data/vendor/kreuzberg/src/extraction/image.rs +14 -138
  77. data/vendor/kreuzberg/src/extraction/libreoffice.rs +3 -13
  78. data/vendor/kreuzberg/src/extraction/mod.rs +5 -21
  79. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
  80. data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
  81. data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
  82. data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
  83. data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
  84. data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
  85. data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
  86. data/vendor/kreuzberg/src/extraction/pptx.rs +94 -196
  87. data/vendor/kreuzberg/src/extraction/structured.rs +4 -5
  88. data/vendor/kreuzberg/src/extraction/table.rs +1 -2
  89. data/vendor/kreuzberg/src/extraction/text.rs +10 -18
  90. data/vendor/kreuzberg/src/extractors/archive.rs +0 -22
  91. data/vendor/kreuzberg/src/extractors/docx.rs +148 -69
  92. data/vendor/kreuzberg/src/extractors/email.rs +9 -37
  93. data/vendor/kreuzberg/src/extractors/excel.rs +40 -81
  94. data/vendor/kreuzberg/src/extractors/html.rs +173 -182
  95. data/vendor/kreuzberg/src/extractors/image.rs +8 -32
  96. data/vendor/kreuzberg/src/extractors/mod.rs +10 -171
  97. data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
  98. data/vendor/kreuzberg/src/extractors/pdf.rs +64 -329
  99. data/vendor/kreuzberg/src/extractors/pptx.rs +34 -79
  100. data/vendor/kreuzberg/src/extractors/structured.rs +0 -16
  101. data/vendor/kreuzberg/src/extractors/text.rs +7 -30
  102. data/vendor/kreuzberg/src/extractors/xml.rs +8 -27
  103. data/vendor/kreuzberg/src/keywords/processor.rs +1 -9
  104. data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
  105. data/vendor/kreuzberg/src/language_detection/mod.rs +51 -94
  106. data/vendor/kreuzberg/src/lib.rs +5 -17
  107. data/vendor/kreuzberg/src/mcp/mod.rs +1 -4
  108. data/vendor/kreuzberg/src/mcp/server.rs +21 -145
  109. data/vendor/kreuzberg/src/ocr/mod.rs +0 -2
  110. data/vendor/kreuzberg/src/ocr/processor.rs +8 -19
  111. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +0 -2
  112. data/vendor/kreuzberg/src/pdf/error.rs +1 -93
  113. data/vendor/kreuzberg/src/pdf/metadata.rs +100 -263
  114. data/vendor/kreuzberg/src/pdf/mod.rs +2 -33
  115. data/vendor/kreuzberg/src/pdf/rendering.rs +12 -12
  116. data/vendor/kreuzberg/src/pdf/table.rs +64 -61
  117. data/vendor/kreuzberg/src/pdf/text.rs +24 -416
  118. data/vendor/kreuzberg/src/plugins/extractor.rs +8 -40
  119. data/vendor/kreuzberg/src/plugins/mod.rs +0 -3
  120. data/vendor/kreuzberg/src/plugins/ocr.rs +14 -22
  121. data/vendor/kreuzberg/src/plugins/processor.rs +1 -10
  122. data/vendor/kreuzberg/src/plugins/registry.rs +0 -15
  123. data/vendor/kreuzberg/src/plugins/validator.rs +8 -20
  124. data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
  125. data/vendor/kreuzberg/src/text/mod.rs +0 -8
  126. data/vendor/kreuzberg/src/text/quality.rs +15 -28
  127. data/vendor/kreuzberg/src/text/string_utils.rs +10 -22
  128. data/vendor/kreuzberg/src/text/token_reduction/core.rs +50 -86
  129. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +16 -37
  130. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +1 -2
  131. data/vendor/kreuzberg/src/types.rs +67 -907
  132. data/vendor/kreuzberg/src/utils/mod.rs +0 -14
  133. data/vendor/kreuzberg/src/utils/quality.rs +3 -12
  134. data/vendor/kreuzberg/tests/api_tests.rs +0 -506
  135. data/vendor/kreuzberg/tests/archive_integration.rs +0 -2
  136. data/vendor/kreuzberg/tests/batch_orchestration.rs +12 -57
  137. data/vendor/kreuzberg/tests/batch_processing.rs +8 -32
  138. data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
  139. data/vendor/kreuzberg/tests/concurrency_stress.rs +8 -40
  140. data/vendor/kreuzberg/tests/config_features.rs +1 -33
  141. data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -16
  142. data/vendor/kreuzberg/tests/core_integration.rs +9 -35
  143. data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
  144. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
  145. data/vendor/kreuzberg/tests/email_integration.rs +1 -3
  146. data/vendor/kreuzberg/tests/error_handling.rs +34 -43
  147. data/vendor/kreuzberg/tests/format_integration.rs +1 -7
  148. data/vendor/kreuzberg/tests/helpers/mod.rs +0 -60
  149. data/vendor/kreuzberg/tests/image_integration.rs +0 -2
  150. data/vendor/kreuzberg/tests/mime_detection.rs +16 -17
  151. data/vendor/kreuzberg/tests/ocr_configuration.rs +0 -4
  152. data/vendor/kreuzberg/tests/ocr_errors.rs +0 -22
  153. data/vendor/kreuzberg/tests/ocr_quality.rs +0 -2
  154. data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
  155. data/vendor/kreuzberg/tests/pdf_integration.rs +0 -2
  156. data/vendor/kreuzberg/tests/pipeline_integration.rs +2 -36
  157. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +0 -5
  158. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -17
  159. data/vendor/kreuzberg/tests/plugin_system.rs +0 -6
  160. data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -2
  161. data/vendor/kreuzberg/tests/security_validation.rs +1 -13
  162. data/vendor/kreuzberg/tests/test_fastembed.rs +23 -45
  163. metadata +25 -171
  164. data/.rubocop.yml +0 -543
  165. data/ext/kreuzberg_rb/native/.cargo/config.toml +0 -23
  166. data/ext/kreuzberg_rb/native/Cargo.lock +0 -7619
  167. data/lib/kreuzberg/error_context.rb +0 -136
  168. data/lib/kreuzberg/types.rb +0 -170
  169. data/lib/libpdfium.so +0 -0
  170. data/spec/binding/async_operations_spec.rb +0 -473
  171. data/spec/binding/batch_operations_spec.rb +0 -595
  172. data/spec/binding/batch_spec.rb +0 -359
  173. data/spec/binding/config_result_spec.rb +0 -377
  174. data/spec/binding/embeddings_spec.rb +0 -816
  175. data/spec/binding/error_recovery_spec.rb +0 -488
  176. data/spec/binding/font_config_spec.rb +0 -220
  177. data/spec/binding/images_spec.rb +0 -738
  178. data/spec/binding/keywords_extraction_spec.rb +0 -600
  179. data/spec/binding/metadata_types_spec.rb +0 -1228
  180. data/spec/binding/pages_extraction_spec.rb +0 -471
  181. data/spec/binding/tables_spec.rb +0 -641
  182. data/spec/unit/config/chunking_config_spec.rb +0 -213
  183. data/spec/unit/config/embedding_config_spec.rb +0 -343
  184. data/spec/unit/config/extraction_config_spec.rb +0 -438
  185. data/spec/unit/config/font_config_spec.rb +0 -285
  186. data/spec/unit/config/hierarchy_config_spec.rb +0 -314
  187. data/spec/unit/config/image_extraction_config_spec.rb +0 -209
  188. data/spec/unit/config/image_preprocessing_config_spec.rb +0 -249
  189. data/spec/unit/config/keyword_config_spec.rb +0 -229
  190. data/spec/unit/config/language_detection_config_spec.rb +0 -258
  191. data/spec/unit/config/ocr_config_spec.rb +0 -171
  192. data/spec/unit/config/page_config_spec.rb +0 -221
  193. data/spec/unit/config/pdf_config_spec.rb +0 -267
  194. data/spec/unit/config/postprocessor_config_spec.rb +0 -290
  195. data/spec/unit/config/tesseract_config_spec.rb +0 -181
  196. data/spec/unit/config/token_reduction_config_spec.rb +0 -251
  197. data/test/metadata_types_test.rb +0 -959
  198. data/vendor/Cargo.toml +0 -61
  199. data/vendor/kreuzberg/examples/bench_fixes.rs +0 -71
  200. data/vendor/kreuzberg/examples/test_pdfium_fork.rs +0 -62
  201. data/vendor/kreuzberg/src/chunking/processor.rs +0 -219
  202. data/vendor/kreuzberg/src/core/batch_optimizations.rs +0 -385
  203. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  204. data/vendor/kreuzberg/src/core/formats.rs +0 -235
  205. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  206. data/vendor/kreuzberg/src/extraction/capacity.rs +0 -263
  207. data/vendor/kreuzberg/src/extraction/markdown.rs +0 -216
  208. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -284
  209. data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -470
  210. data/vendor/kreuzberg/src/extractors/docbook.rs +0 -504
  211. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  212. data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -492
  213. data/vendor/kreuzberg/src/extractors/jats.rs +0 -1054
  214. data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -368
  215. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  216. data/vendor/kreuzberg/src/extractors/markdown.rs +0 -701
  217. data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
  218. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  219. data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -529
  220. data/vendor/kreuzberg/src/extractors/rst.rs +0 -577
  221. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  222. data/vendor/kreuzberg/src/extractors/security.rs +0 -484
  223. data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
  224. data/vendor/kreuzberg/src/extractors/typst.rs +0 -651
  225. data/vendor/kreuzberg/src/language_detection/processor.rs +0 -218
  226. data/vendor/kreuzberg/src/ocr/language_registry.rs +0 -520
  227. data/vendor/kreuzberg/src/panic_context.rs +0 -154
  228. data/vendor/kreuzberg/src/pdf/bindings.rs +0 -306
  229. data/vendor/kreuzberg/src/pdf/bundled.rs +0 -408
  230. data/vendor/kreuzberg/src/pdf/fonts.rs +0 -358
  231. data/vendor/kreuzberg/src/pdf/hierarchy.rs +0 -903
  232. data/vendor/kreuzberg/src/text/quality_processor.rs +0 -231
  233. data/vendor/kreuzberg/src/text/utf8_validation.rs +0 -193
  234. data/vendor/kreuzberg/src/utils/pool.rs +0 -503
  235. data/vendor/kreuzberg/src/utils/pool_sizing.rs +0 -364
  236. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -761
  237. data/vendor/kreuzberg/tests/api_embed.rs +0 -360
  238. data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
  239. data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +0 -471
  240. data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +0 -289
  241. data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +0 -154
  242. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
  243. data/vendor/kreuzberg/tests/config_integration_test.rs +0 -753
  244. data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +0 -294
  245. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -500
  246. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
  247. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
  248. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
  249. data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
  250. data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
  251. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
  252. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
  253. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
  254. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
  255. data/vendor/kreuzberg/tests/ocr_language_registry.rs +0 -191
  256. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -674
  257. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
  258. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
  259. data/vendor/kreuzberg/tests/page_markers.rs +0 -297
  260. data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +0 -301
  261. data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +0 -589
  262. data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +0 -301
  263. data/vendor/kreuzberg/tests/pdf_text_merging.rs +0 -475
  264. data/vendor/kreuzberg/tests/pdfium_linking.rs +0 -340
  265. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -694
  266. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -775
  267. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1260
  268. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -648
  269. data/vendor/kreuzberg-ffi/Cargo.toml +0 -67
  270. data/vendor/kreuzberg-ffi/README.md +0 -851
  271. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +0 -227
  272. data/vendor/kreuzberg-ffi/build.rs +0 -168
  273. data/vendor/kreuzberg-ffi/cbindgen.toml +0 -37
  274. data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +0 -12
  275. data/vendor/kreuzberg-ffi/kreuzberg.h +0 -3012
  276. data/vendor/kreuzberg-ffi/src/batch_streaming.rs +0 -588
  277. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
  278. data/vendor/kreuzberg-ffi/src/error.rs +0 -901
  279. data/vendor/kreuzberg-ffi/src/extraction.rs +0 -555
  280. data/vendor/kreuzberg-ffi/src/helpers.rs +0 -879
  281. data/vendor/kreuzberg-ffi/src/lib.rs +0 -977
  282. data/vendor/kreuzberg-ffi/src/memory.rs +0 -493
  283. data/vendor/kreuzberg-ffi/src/mime.rs +0 -329
  284. data/vendor/kreuzberg-ffi/src/panic_shield.rs +0 -265
  285. data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +0 -442
  286. data/vendor/kreuzberg-ffi/src/plugins/mod.rs +0 -14
  287. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +0 -628
  288. data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +0 -438
  289. data/vendor/kreuzberg-ffi/src/plugins/validator.rs +0 -329
  290. data/vendor/kreuzberg-ffi/src/result.rs +0 -510
  291. data/vendor/kreuzberg-ffi/src/result_pool.rs +0 -639
  292. data/vendor/kreuzberg-ffi/src/result_view.rs +0 -773
  293. data/vendor/kreuzberg-ffi/src/string_intern.rs +0 -568
  294. data/vendor/kreuzberg-ffi/src/types.rs +0 -363
  295. data/vendor/kreuzberg-ffi/src/util.rs +0 -210
  296. data/vendor/kreuzberg-ffi/src/validation.rs +0 -848
  297. data/vendor/kreuzberg-ffi/tests.disabled/README.md +0 -48
  298. data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +0 -299
  299. data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +0 -346
  300. data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +0 -232
  301. data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +0 -470
  302. data/vendor/kreuzberg-tesseract/.commitlintrc.json +0 -13
  303. data/vendor/kreuzberg-tesseract/.crate-ignore +0 -2
  304. data/vendor/kreuzberg-tesseract/Cargo.lock +0 -2933
  305. data/vendor/kreuzberg-tesseract/Cargo.toml +0 -57
  306. data/vendor/kreuzberg-tesseract/LICENSE +0 -22
  307. data/vendor/kreuzberg-tesseract/README.md +0 -399
  308. data/vendor/kreuzberg-tesseract/build.rs +0 -1127
  309. data/vendor/kreuzberg-tesseract/patches/README.md +0 -71
  310. data/vendor/kreuzberg-tesseract/patches/tesseract.diff +0 -199
  311. data/vendor/kreuzberg-tesseract/src/api.rs +0 -1371
  312. data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +0 -77
  313. data/vendor/kreuzberg-tesseract/src/enums.rs +0 -297
  314. data/vendor/kreuzberg-tesseract/src/error.rs +0 -81
  315. data/vendor/kreuzberg-tesseract/src/lib.rs +0 -145
  316. data/vendor/kreuzberg-tesseract/src/monitor.rs +0 -57
  317. data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +0 -197
  318. data/vendor/kreuzberg-tesseract/src/page_iterator.rs +0 -253
  319. data/vendor/kreuzberg-tesseract/src/result_iterator.rs +0 -286
  320. data/vendor/kreuzberg-tesseract/src/result_renderer.rs +0 -183
  321. data/vendor/kreuzberg-tesseract/tests/integration_test.rs +0 -211
@@ -4,62 +4,9 @@
4
4
  //! quality processing, chunking, and custom hooks in the correct order.
5
5
 
6
6
  use crate::core::config::ExtractionConfig;
7
- use crate::plugins::{PostProcessor, ProcessingStage};
7
+ use crate::plugins::ProcessingStage;
8
8
  use crate::types::ExtractionResult;
9
9
  use crate::{KreuzbergError, Result};
10
- use once_cell::sync::Lazy;
11
- use std::sync::Arc;
12
- use std::sync::RwLock as StdRwLock;
13
-
14
- /// Cached post-processors for each stage to reduce lock contention.
15
- ///
16
- /// This cache is populated once during the first pipeline run and reused
17
- /// for all subsequent extractions, eliminating 3 of 4 registry lock acquisitions
18
- /// per extraction.
19
- struct ProcessorCache {
20
- early: Arc<Vec<Arc<dyn PostProcessor>>>,
21
- middle: Arc<Vec<Arc<dyn PostProcessor>>>,
22
- late: Arc<Vec<Arc<dyn PostProcessor>>>,
23
- }
24
-
25
- impl ProcessorCache {
26
- /// Create a new processor cache by fetching from the registry.
27
- fn new() -> Result<Self> {
28
- let processor_registry = crate::plugins::registry::get_post_processor_registry();
29
- let registry = processor_registry
30
- .read()
31
- .map_err(|e| crate::KreuzbergError::Other(format!("Post-processor registry lock poisoned: {}", e)))?;
32
-
33
- Ok(Self {
34
- early: Arc::new(registry.get_for_stage(ProcessingStage::Early)),
35
- middle: Arc::new(registry.get_for_stage(ProcessingStage::Middle)),
36
- late: Arc::new(registry.get_for_stage(ProcessingStage::Late)),
37
- })
38
- }
39
-
40
- /// Get processors for a specific stage from cache.
41
- #[allow(dead_code)]
42
- fn get_for_stage(&self, stage: ProcessingStage) -> Arc<Vec<Arc<dyn PostProcessor>>> {
43
- match stage {
44
- ProcessingStage::Early => Arc::clone(&self.early),
45
- ProcessingStage::Middle => Arc::clone(&self.middle),
46
- ProcessingStage::Late => Arc::clone(&self.late),
47
- }
48
- }
49
- }
50
-
51
- /// Lazy processor cache - initialized on first use, then cached.
52
- static PROCESSOR_CACHE: Lazy<StdRwLock<Option<ProcessorCache>>> = Lazy::new(|| StdRwLock::new(None));
53
-
54
- /// Clear the processor cache (primarily for testing when registry changes).
55
- #[allow(dead_code)]
56
- pub fn clear_processor_cache() -> Result<()> {
57
- let mut cache = PROCESSOR_CACHE
58
- .write()
59
- .map_err(|e| crate::KreuzbergError::Other(format!("Processor cache lock poisoned: {}", e)))?;
60
- *cache = None;
61
- Ok(())
62
- }
63
10
 
64
11
  /// Run the post-processing pipeline on an extraction result.
65
12
  ///
@@ -83,13 +30,6 @@ pub fn clear_processor_cache() -> Result<()> {
83
30
  /// - Validator errors bubble up immediately
84
31
  /// - Post-processor errors are caught and recorded in metadata
85
32
  /// - System errors (IO, RuntimeError equivalents) always bubble up
86
- #[cfg_attr(feature = "otel", tracing::instrument(
87
- skip(result, config),
88
- fields(
89
- pipeline.stage = "post_processing",
90
- content.length = result.content.len(),
91
- )
92
- ))]
93
33
  pub async fn run_pipeline(mut result: ExtractionResult, config: &ExtractionConfig) -> Result<ExtractionResult> {
94
34
  let pp_config = config.postprocessor.as_ref();
95
35
  let postprocessing_enabled = pp_config.is_none_or(|c| c.enabled);
@@ -100,61 +40,21 @@ pub async fn run_pipeline(mut result: ExtractionResult, config: &ExtractionConfi
100
40
  let _ = crate::keywords::ensure_initialized();
101
41
  }
102
42
 
103
- #[cfg(feature = "language-detection")]
104
- {
105
- let _ = crate::language_detection::ensure_initialized();
106
- }
107
-
108
- #[cfg(feature = "chunking")]
109
- {
110
- let _ = crate::chunking::ensure_initialized();
111
- }
112
-
113
- #[cfg(feature = "quality")]
114
- {
115
- let registry = crate::plugins::registry::get_post_processor_registry();
116
- if let Ok(mut reg) = registry.write() {
117
- let _ = reg.register(std::sync::Arc::new(crate::text::QualityProcessor), 30);
118
- }
119
- }
120
-
121
- {
122
- let mut cache_lock = PROCESSOR_CACHE
123
- .write()
124
- .map_err(|e| crate::KreuzbergError::Other(format!("Processor cache lock poisoned: {}", e)))?;
125
- if cache_lock.is_none() {
126
- *cache_lock = Some(ProcessorCache::new()?);
127
- }
128
- }
43
+ let processor_registry = crate::plugins::registry::get_post_processor_registry();
129
44
 
130
- let (early_processors, middle_processors, late_processors) = {
131
- let cache_lock = PROCESSOR_CACHE
132
- .read()
133
- .map_err(|e| crate::KreuzbergError::Other(format!("Processor cache lock poisoned: {}", e)))?;
134
- let cache = cache_lock
135
- .as_ref()
136
- .ok_or_else(|| crate::KreuzbergError::Other("Processor cache not initialized".to_string()))?;
137
- (
138
- Arc::clone(&cache.early),
139
- Arc::clone(&cache.middle),
140
- Arc::clone(&cache.late),
141
- )
142
- };
45
+ for stage in [ProcessingStage::Early, ProcessingStage::Middle, ProcessingStage::Late] {
46
+ let processors = {
47
+ let registry = processor_registry.read().map_err(|e| {
48
+ crate::KreuzbergError::Other(format!("Post-processor registry lock poisoned: {}", e))
49
+ })?;
50
+ registry.get_for_stage(stage)
51
+ };
143
52
 
144
- for (_stage, processors_arc) in [
145
- (ProcessingStage::Early, early_processors),
146
- (ProcessingStage::Middle, middle_processors),
147
- (ProcessingStage::Late, late_processors),
148
- ] {
149
- for processor in processors_arc.iter() {
53
+ for processor in processors {
150
54
  let processor_name = processor.name();
151
55
 
152
56
  let should_run = if let Some(config) = pp_config {
153
- if let Some(ref enabled_set) = config.enabled_set {
154
- enabled_set.contains(processor_name)
155
- } else if let Some(ref disabled_set) = config.disabled_set {
156
- !disabled_set.contains(processor_name)
157
- } else if let Some(ref enabled) = config.enabled_processors {
57
+ if let Some(ref enabled) = config.enabled_processors {
158
58
  enabled.iter().any(|name| name == processor_name)
159
59
  } else if let Some(ref disabled) = config.disabled_processors {
160
60
  !disabled.iter().any(|name| name == processor_name)
@@ -185,6 +85,35 @@ pub async fn run_pipeline(mut result: ExtractionResult, config: &ExtractionConfi
185
85
  }
186
86
  }
187
87
 
88
+ #[cfg(feature = "quality")]
89
+ if config.enable_quality_processing {
90
+ let quality_score = crate::text::quality::calculate_quality_score(
91
+ &result.content,
92
+ Some(
93
+ &result
94
+ .metadata
95
+ .additional
96
+ .iter()
97
+ .map(|(k, v)| (k.clone(), v.to_string()))
98
+ .collect(),
99
+ ),
100
+ );
101
+ result.metadata.additional.insert(
102
+ "quality_score".to_string(),
103
+ serde_json::Value::Number(
104
+ serde_json::Number::from_f64(quality_score).unwrap_or(serde_json::Number::from(0)),
105
+ ),
106
+ );
107
+ }
108
+
109
+ #[cfg(not(feature = "quality"))]
110
+ if config.enable_quality_processing {
111
+ result.metadata.additional.insert(
112
+ "quality_processing_error".to_string(),
113
+ serde_json::Value::String("Quality processing feature not enabled".to_string()),
114
+ );
115
+ }
116
+
188
117
  #[cfg(feature = "chunking")]
189
118
  if let Some(ref chunking_config) = config.chunking {
190
119
  let chunk_config = crate::chunking::ChunkingConfig {
@@ -194,9 +123,7 @@ pub async fn run_pipeline(mut result: ExtractionResult, config: &ExtractionConfi
194
123
  chunker_type: crate::chunking::ChunkerType::Text,
195
124
  };
196
125
 
197
- let page_boundaries = result.metadata.pages.as_ref().and_then(|ps| ps.boundaries.as_deref());
198
-
199
- match crate::chunking::chunk_text(&result.content, &chunk_config, page_boundaries) {
126
+ match crate::chunking::chunk_text(&result.content, &chunk_config) {
200
127
  Ok(chunking_result) => {
201
128
  result.chunks = Some(chunking_result.chunks);
202
129
 
@@ -284,11 +211,9 @@ pub async fn run_pipeline(mut result: ExtractionResult, config: &ExtractionConfi
284
211
  registry.get_all()
285
212
  };
286
213
 
287
- if !validators.is_empty() {
288
- for validator in validators {
289
- if validator.should_validate(&result, config) {
290
- validator.validate(&result, config).await?;
291
- }
214
+ for validator in validators {
215
+ if validator.should_validate(&result, config) {
216
+ validator.validate(&result, config).await?;
292
217
  }
293
218
  }
294
219
  }
@@ -296,144 +221,19 @@ pub async fn run_pipeline(mut result: ExtractionResult, config: &ExtractionConfi
296
221
  Ok(result)
297
222
  }
298
223
 
299
- /// Run the post-processing pipeline synchronously (WASM-compatible version).
300
- ///
301
- /// This is a synchronous implementation for WASM and non-async contexts.
302
- /// It performs a subset of the full async pipeline, excluding async post-processors
303
- /// and validators.
304
- ///
305
- /// # Arguments
306
- ///
307
- /// * `result` - The extraction result to process
308
- /// * `config` - Extraction configuration
309
- ///
310
- /// # Returns
311
- ///
312
- /// The processed extraction result.
313
- ///
314
- /// # Notes
315
- ///
316
- /// This function is only available when the `tokio-runtime` feature is disabled.
317
- /// It handles:
318
- /// - Quality processing (if enabled)
319
- /// - Chunking (if enabled)
320
- /// - Language detection (if enabled)
321
- ///
322
- /// It does NOT handle:
323
- /// - Async post-processors
324
- /// - Async validators
325
- #[cfg(not(feature = "tokio-runtime"))]
326
- pub fn run_pipeline_sync(mut result: ExtractionResult, config: &ExtractionConfig) -> Result<ExtractionResult> {
327
- #[cfg(feature = "chunking")]
328
- if let Some(ref chunking_config) = config.chunking {
329
- let chunk_config = crate::chunking::ChunkingConfig {
330
- max_characters: chunking_config.max_chars,
331
- overlap: chunking_config.max_overlap,
332
- trim: true,
333
- chunker_type: crate::chunking::ChunkerType::Text,
334
- };
335
-
336
- match crate::chunking::chunk_text(&result.content, &chunk_config, None) {
337
- Ok(chunking_result) => {
338
- result.chunks = Some(chunking_result.chunks);
339
-
340
- if let Some(ref chunks) = result.chunks {
341
- result.metadata.additional.insert(
342
- "chunk_count".to_string(),
343
- serde_json::Value::Number(serde_json::Number::from(chunks.len())),
344
- );
345
- }
346
-
347
- #[cfg(feature = "embeddings")]
348
- if let Some(ref embedding_config) = chunking_config.embedding
349
- && let Some(ref mut chunks) = result.chunks
350
- {
351
- match crate::embeddings::generate_embeddings_for_chunks(chunks, embedding_config) {
352
- Ok(()) => {
353
- result
354
- .metadata
355
- .additional
356
- .insert("embeddings_generated".to_string(), serde_json::Value::Bool(true));
357
- }
358
- Err(e) => {
359
- result
360
- .metadata
361
- .additional
362
- .insert("embedding_error".to_string(), serde_json::Value::String(e.to_string()));
363
- }
364
- }
365
- }
366
-
367
- #[cfg(not(feature = "embeddings"))]
368
- if chunking_config.embedding.is_some() {
369
- result.metadata.additional.insert(
370
- "embedding_error".to_string(),
371
- serde_json::Value::String("Embeddings feature not enabled".to_string()),
372
- );
373
- }
374
- }
375
- Err(e) => {
376
- result
377
- .metadata
378
- .additional
379
- .insert("chunking_error".to_string(), serde_json::Value::String(e.to_string()));
380
- }
381
- }
382
- }
383
-
384
- #[cfg(not(feature = "chunking"))]
385
- if config.chunking.is_some() {
386
- result.metadata.additional.insert(
387
- "chunking_error".to_string(),
388
- serde_json::Value::String("Chunking feature not enabled".to_string()),
389
- );
390
- }
391
-
392
- #[cfg(feature = "language-detection")]
393
- if let Some(ref lang_config) = config.language_detection {
394
- match crate::language_detection::detect_languages(&result.content, lang_config) {
395
- Ok(detected) => {
396
- result.detected_languages = detected;
397
- }
398
- Err(e) => {
399
- result.metadata.additional.insert(
400
- "language_detection_error".to_string(),
401
- serde_json::Value::String(e.to_string()),
402
- );
403
- }
404
- }
405
- }
406
-
407
- #[cfg(not(feature = "language-detection"))]
408
- if config.language_detection.is_some() {
409
- result.metadata.additional.insert(
410
- "language_detection_error".to_string(),
411
- serde_json::Value::String("Language detection feature not enabled".to_string()),
412
- );
413
- }
414
-
415
- Ok(result)
416
- }
417
-
418
224
  #[cfg(test)]
419
225
  mod tests {
420
226
  use super::*;
421
227
  use crate::types::Metadata;
422
228
  use lazy_static::lazy_static;
423
229
 
424
- const VALIDATION_MARKER_KEY: &str = "registry_validation_marker";
425
- #[cfg(feature = "quality")]
426
- const QUALITY_VALIDATION_MARKER: &str = "quality_validation_test";
427
- const POSTPROCESSOR_VALIDATION_MARKER: &str = "postprocessor_validation_test";
428
- const ORDER_VALIDATION_MARKER: &str = "order_validation_test";
429
-
430
230
  lazy_static! {
431
231
  static ref REGISTRY_TEST_GUARD: std::sync::Mutex<()> = std::sync::Mutex::new(());
432
232
  }
433
233
 
434
234
  #[tokio::test]
435
235
  async fn test_run_pipeline_basic() {
436
- let mut result = ExtractionResult {
236
+ let result = ExtractionResult {
437
237
  content: "test".to_string(),
438
238
  mime_type: "text/plain".to_string(),
439
239
  metadata: Metadata::default(),
@@ -441,12 +241,7 @@ mod tests {
441
241
  detected_languages: None,
442
242
  chunks: None,
443
243
  images: None,
444
- pages: None,
445
244
  };
446
- result.metadata.additional.insert(
447
- VALIDATION_MARKER_KEY.to_string(),
448
- serde_json::json!(ORDER_VALIDATION_MARKER),
449
- );
450
245
  let config = ExtractionConfig::default();
451
246
 
452
247
  let processed = run_pipeline(result, &config).await.unwrap();
@@ -464,7 +259,6 @@ mod tests {
464
259
  detected_languages: None,
465
260
  chunks: None,
466
261
  images: None,
467
- pages: None,
468
262
  };
469
263
  let config = ExtractionConfig {
470
264
  enable_quality_processing: true,
@@ -485,7 +279,6 @@ mod tests {
485
279
  detected_languages: None,
486
280
  chunks: None,
487
281
  images: None,
488
- pages: None,
489
282
  };
490
283
  let config = ExtractionConfig {
491
284
  enable_quality_processing: false,
@@ -507,7 +300,6 @@ mod tests {
507
300
  detected_languages: None,
508
301
  chunks: None,
509
302
  images: None,
510
- pages: None,
511
303
  };
512
304
  let config = ExtractionConfig {
513
305
  chunking: Some(crate::ChunkingConfig {
@@ -535,7 +327,6 @@ mod tests {
535
327
  detected_languages: None,
536
328
  chunks: None,
537
329
  images: None,
538
- pages: None,
539
330
  };
540
331
  let config = ExtractionConfig {
541
332
  chunking: None,
@@ -560,7 +351,6 @@ mod tests {
560
351
  additional,
561
352
  ..Default::default()
562
353
  },
563
- pages: None,
564
354
  tables: vec![],
565
355
  detected_languages: None,
566
356
  chunks: None,
@@ -597,7 +387,6 @@ mod tests {
597
387
  detected_languages: None,
598
388
  chunks: None,
599
389
  images: None,
600
- pages: None,
601
390
  };
602
391
  let config = ExtractionConfig::default();
603
392
 
@@ -608,17 +397,9 @@ mod tests {
608
397
 
609
398
  #[tokio::test]
610
399
  async fn test_pipeline_empty_content() {
611
- let _guard = REGISTRY_TEST_GUARD.lock().unwrap();
612
-
613
400
  {
614
- let registry = crate::plugins::registry::get_post_processor_registry();
615
- registry.write().unwrap().shutdown_all().unwrap();
616
- }
617
- {
618
- let registry = crate::plugins::registry::get_validator_registry();
619
- registry.write().unwrap().shutdown_all().unwrap();
620
- }
621
-
401
+ let _guard = REGISTRY_TEST_GUARD.lock().unwrap();
402
+ } // Drop guard before async operations
622
403
  let result = ExtractionResult {
623
404
  content: String::new(),
624
405
  mime_type: "text/plain".to_string(),
@@ -627,12 +408,9 @@ mod tests {
627
408
  detected_languages: None,
628
409
  chunks: None,
629
410
  images: None,
630
- pages: None,
631
411
  };
632
412
  let config = ExtractionConfig::default();
633
413
 
634
- drop(_guard);
635
-
636
414
  let processed = run_pipeline(result, &config).await.unwrap();
637
415
  assert_eq!(processed.content, "");
638
416
  }
@@ -648,7 +426,6 @@ mod tests {
648
426
  detected_languages: None,
649
427
  chunks: None,
650
428
  images: None,
651
- pages: None,
652
429
  };
653
430
  let config = ExtractionConfig {
654
431
  enable_quality_processing: true,
@@ -669,22 +446,6 @@ mod tests {
669
446
  #[tokio::test]
670
447
  #[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
671
448
  async fn test_pipeline_with_keyword_extraction() {
672
- {
673
- let _guard = REGISTRY_TEST_GUARD.lock().unwrap();
674
- crate::plugins::registry::get_validator_registry()
675
- .write()
676
- .unwrap()
677
- .shutdown_all()
678
- .unwrap();
679
- crate::plugins::registry::get_post_processor_registry()
680
- .write()
681
- .unwrap()
682
- .shutdown_all()
683
- .unwrap();
684
-
685
- let _ = crate::keywords::register_keyword_processor();
686
- }
687
-
688
449
  let result = ExtractionResult {
689
450
  content: r#"
690
451
  Machine learning is a branch of artificial intelligence that focuses on
@@ -699,7 +460,6 @@ Natural language processing enables computers to understand human language.
699
460
  detected_languages: None,
700
461
  chunks: None,
701
462
  images: None,
702
- pages: None,
703
463
  };
704
464
 
705
465
  #[cfg(feature = "keywords-yake")]
@@ -733,9 +493,6 @@ Natural language processing enables computers to understand human language.
733
493
  #[tokio::test]
734
494
  #[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
735
495
  async fn test_pipeline_without_keyword_config() {
736
- {
737
- let _guard = REGISTRY_TEST_GUARD.lock().unwrap();
738
- }
739
496
  let result = ExtractionResult {
740
497
  content: "Machine learning and artificial intelligence.".to_string(),
741
498
  mime_type: "text/plain".to_string(),
@@ -744,7 +501,6 @@ Natural language processing enables computers to understand human language.
744
501
  detected_languages: None,
745
502
  chunks: None,
746
503
  images: None,
747
- pages: None,
748
504
  };
749
505
 
750
506
  let config = ExtractionConfig {
@@ -760,18 +516,6 @@ Natural language processing enables computers to understand human language.
760
516
  #[tokio::test]
761
517
  #[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
762
518
  async fn test_pipeline_keyword_extraction_short_content() {
763
- let _guard = REGISTRY_TEST_GUARD.lock().unwrap();
764
- crate::plugins::registry::get_validator_registry()
765
- .write()
766
- .unwrap()
767
- .shutdown_all()
768
- .unwrap();
769
- crate::plugins::registry::get_post_processor_registry()
770
- .write()
771
- .unwrap()
772
- .shutdown_all()
773
- .unwrap();
774
-
775
519
  let result = ExtractionResult {
776
520
  content: "Short text".to_string(),
777
521
  mime_type: "text/plain".to_string(),
@@ -780,7 +524,6 @@ Natural language processing enables computers to understand human language.
780
524
  detected_languages: None,
781
525
  chunks: None,
782
526
  images: None,
783
- pages: None,
784
527
  };
785
528
 
786
529
  #[cfg(feature = "keywords-yake")]
@@ -794,8 +537,6 @@ Natural language processing enables computers to understand human language.
794
537
  ..Default::default()
795
538
  };
796
539
 
797
- drop(_guard);
798
-
799
540
  let processed = run_pipeline(result, &config).await.unwrap();
800
541
 
801
542
  assert!(!processed.metadata.additional.contains_key("keywords"));
@@ -803,6 +544,9 @@ Natural language processing enables computers to understand human language.
803
544
 
804
545
  #[tokio::test]
805
546
  async fn test_postprocessor_runs_before_validator() {
547
+ {
548
+ let _guard = REGISTRY_TEST_GUARD.lock().unwrap();
549
+ } // Drop guard before async operations
806
550
  use crate::plugins::{Plugin, PostProcessor, ProcessingStage, Validator};
807
551
  use async_trait::async_trait;
808
552
  use std::sync::Arc;
@@ -857,17 +601,6 @@ Natural language processing enables computers to understand human language.
857
601
  #[async_trait]
858
602
  impl Validator for TestValidator {
859
603
  async fn validate(&self, result: &ExtractionResult, _config: &ExtractionConfig) -> Result<()> {
860
- let should_validate = result
861
- .metadata
862
- .additional
863
- .get(VALIDATION_MARKER_KEY)
864
- .and_then(|v| v.as_str())
865
- == Some(POSTPROCESSOR_VALIDATION_MARKER);
866
-
867
- if !should_validate {
868
- return Ok(());
869
- }
870
-
871
604
  let processed = result
872
605
  .metadata
873
606
  .additional
@@ -886,28 +619,18 @@ Natural language processing enables computers to understand human language.
886
619
  }
887
620
 
888
621
  let pp_registry = crate::plugins::registry::get_post_processor_registry();
889
- let val_registry = crate::plugins::registry::get_validator_registry();
890
-
891
- let _guard = REGISTRY_TEST_GUARD.lock().unwrap();
892
- clear_processor_cache().unwrap();
893
- pp_registry.write().unwrap().shutdown_all().unwrap();
894
- val_registry.write().unwrap().shutdown_all().unwrap();
895
- clear_processor_cache().unwrap();
896
-
897
622
  {
898
623
  let mut registry = pp_registry.write().unwrap();
899
624
  registry.register(Arc::new(TestPostProcessor), 0).unwrap();
900
625
  }
901
626
 
627
+ let val_registry = crate::plugins::registry::get_validator_registry();
902
628
  {
903
629
  let mut registry = val_registry.write().unwrap();
904
630
  registry.register(Arc::new(TestValidator)).unwrap();
905
631
  }
906
632
 
907
- // Clear the cache after registering new processors so it rebuilds with the test processors
908
- clear_processor_cache().unwrap();
909
-
910
- let mut result = ExtractionResult {
633
+ let result = ExtractionResult {
911
634
  content: "test".to_string(),
912
635
  mime_type: "text/plain".to_string(),
913
636
  metadata: Metadata::default(),
@@ -915,29 +638,19 @@ Natural language processing enables computers to understand human language.
915
638
  detected_languages: None,
916
639
  chunks: None,
917
640
  images: None,
918
- pages: None,
919
- };
920
- result.metadata.additional.insert(
921
- VALIDATION_MARKER_KEY.to_string(),
922
- serde_json::json!(POSTPROCESSOR_VALIDATION_MARKER),
923
- );
924
-
925
- let config = ExtractionConfig {
926
- postprocessor: Some(crate::core::config::PostProcessorConfig {
927
- enabled: true,
928
- enabled_set: None,
929
- disabled_set: None,
930
- enabled_processors: None,
931
- disabled_processors: None,
932
- }),
933
- ..Default::default()
934
641
  };
935
- drop(_guard);
936
642
 
643
+ let config = ExtractionConfig::default();
937
644
  let processed = run_pipeline(result, &config).await;
938
645
 
939
- pp_registry.write().unwrap().shutdown_all().unwrap();
940
- val_registry.write().unwrap().shutdown_all().unwrap();
646
+ {
647
+ let mut registry = pp_registry.write().unwrap();
648
+ registry.remove("test-processor").unwrap();
649
+ }
650
+ {
651
+ let mut registry = val_registry.write().unwrap();
652
+ registry.remove("test-validator").unwrap();
653
+ }
941
654
 
942
655
  assert!(processed.is_ok(), "Validator should have seen post-processor metadata");
943
656
  let processed = processed.unwrap();
@@ -951,7 +664,9 @@ Natural language processing enables computers to understand human language.
951
664
  #[tokio::test]
952
665
  #[cfg(feature = "quality")]
953
666
  async fn test_quality_processing_runs_before_validator() {
954
- let _guard = REGISTRY_TEST_GUARD.lock().unwrap();
667
+ {
668
+ let _guard = REGISTRY_TEST_GUARD.lock().unwrap();
669
+ } // Drop guard before async operations
955
670
  use crate::plugins::{Plugin, Validator};
956
671
  use async_trait::async_trait;
957
672
  use std::sync::Arc;
@@ -975,17 +690,6 @@ Natural language processing enables computers to understand human language.
975
690
  #[async_trait]
976
691
  impl Validator for QualityValidator {
977
692
  async fn validate(&self, result: &ExtractionResult, _config: &ExtractionConfig) -> Result<()> {
978
- let should_validate = result
979
- .metadata
980
- .additional
981
- .get(VALIDATION_MARKER_KEY)
982
- .and_then(|v| v.as_str())
983
- == Some(QUALITY_VALIDATION_MARKER);
984
-
985
- if !should_validate {
986
- return Ok(());
987
- }
988
-
989
693
  if !result.metadata.additional.contains_key("quality_score") {
990
694
  return Err(crate::KreuzbergError::Validation {
991
695
  message: "Quality processing did not run before validator".to_string(),
@@ -1002,7 +706,7 @@ Natural language processing enables computers to understand human language.
1002
706
  registry.register(Arc::new(QualityValidator)).unwrap();
1003
707
  }
1004
708
 
1005
- let mut result = ExtractionResult {
709
+ let result = ExtractionResult {
1006
710
  content: "This is meaningful test content for quality scoring.".to_string(),
1007
711
  mime_type: "text/plain".to_string(),
1008
712
  metadata: Metadata::default(),
@@ -1010,20 +714,13 @@ Natural language processing enables computers to understand human language.
1010
714
  detected_languages: None,
1011
715
  chunks: None,
1012
716
  images: None,
1013
- pages: None,
1014
717
  };
1015
- result.metadata.additional.insert(
1016
- VALIDATION_MARKER_KEY.to_string(),
1017
- serde_json::json!(QUALITY_VALIDATION_MARKER),
1018
- );
1019
718
 
1020
719
  let config = ExtractionConfig {
1021
720
  enable_quality_processing: true,
1022
721
  ..Default::default()
1023
722
  };
1024
723
 
1025
- drop(_guard);
1026
-
1027
724
  let processed = run_pipeline(result, &config).await;
1028
725
 
1029
726
  {
@@ -1036,6 +733,9 @@ Natural language processing enables computers to understand human language.
1036
733
 
1037
734
  #[tokio::test]
1038
735
  async fn test_multiple_postprocessors_run_before_validator() {
736
+ {
737
+ let _guard = REGISTRY_TEST_GUARD.lock().unwrap();
738
+ } // Drop guard before async operations
1039
739
  use crate::plugins::{Plugin, PostProcessor, ProcessingStage, Validator};
1040
740
  use async_trait::async_trait;
1041
741
  use std::sync::Arc;
@@ -1137,17 +837,6 @@ Natural language processing enables computers to understand human language.
1137
837
  #[async_trait]
1138
838
  impl Validator for OrderValidator {
1139
839
  async fn validate(&self, result: &ExtractionResult, _config: &ExtractionConfig) -> Result<()> {
1140
- let should_validate = result
1141
- .metadata
1142
- .additional
1143
- .get(VALIDATION_MARKER_KEY)
1144
- .and_then(|v| v.as_str())
1145
- == Some(ORDER_VALIDATION_MARKER);
1146
-
1147
- if !should_validate {
1148
- return Ok(());
1149
- }
1150
-
1151
840
  let order = result
1152
841
  .metadata
1153
842
  .additional
@@ -1177,27 +866,18 @@ Natural language processing enables computers to understand human language.
1177
866
  }
1178
867
 
1179
868
  let pp_registry = crate::plugins::registry::get_post_processor_registry();
1180
- let val_registry = crate::plugins::registry::get_validator_registry();
1181
- let _guard = REGISTRY_TEST_GUARD.lock().unwrap();
1182
-
1183
- pp_registry.write().unwrap().shutdown_all().unwrap();
1184
- val_registry.write().unwrap().shutdown_all().unwrap();
1185
- clear_processor_cache().unwrap();
1186
-
1187
869
  {
1188
870
  let mut registry = pp_registry.write().unwrap();
1189
871
  registry.register(Arc::new(EarlyProcessor), 0).unwrap();
1190
872
  registry.register(Arc::new(LateProcessor), 0).unwrap();
1191
873
  }
1192
874
 
875
+ let val_registry = crate::plugins::registry::get_validator_registry();
1193
876
  {
1194
877
  let mut registry = val_registry.write().unwrap();
1195
878
  registry.register(Arc::new(OrderValidator)).unwrap();
1196
879
  }
1197
880
 
1198
- // Clear the cache after registering new processors so it rebuilds with the test processors
1199
- clear_processor_cache().unwrap();
1200
-
1201
881
  let result = ExtractionResult {
1202
882
  content: "test".to_string(),
1203
883
  mime_type: "text/plain".to_string(),
@@ -1206,17 +886,20 @@ Natural language processing enables computers to understand human language.
1206
886
  detected_languages: None,
1207
887
  chunks: None,
1208
888
  images: None,
1209
- pages: None,
1210
889
  };
1211
890
 
1212
891
  let config = ExtractionConfig::default();
1213
- drop(_guard);
1214
-
1215
892
  let processed = run_pipeline(result, &config).await;
1216
893
 
1217
- pp_registry.write().unwrap().shutdown_all().unwrap();
1218
- val_registry.write().unwrap().shutdown_all().unwrap();
1219
- clear_processor_cache().unwrap();
894
+ {
895
+ let mut registry = pp_registry.write().unwrap();
896
+ registry.remove("early-proc").unwrap();
897
+ registry.remove("late-proc").unwrap();
898
+ }
899
+ {
900
+ let mut registry = val_registry.write().unwrap();
901
+ registry.remove("order-validator").unwrap();
902
+ }
1220
903
 
1221
904
  assert!(processed.is_ok(), "All processors should run before validator");
1222
905
  }