kreuzberg 4.0.0.pre.rc.29 → 4.0.0.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (321) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +0 -6
  3. data/.rubocop.yaml +534 -1
  4. data/Gemfile +2 -1
  5. data/Gemfile.lock +28 -116
  6. data/README.md +269 -629
  7. data/Rakefile +0 -9
  8. data/Steepfile +4 -8
  9. data/examples/async_patterns.rb +58 -1
  10. data/ext/kreuzberg_rb/extconf.rb +5 -35
  11. data/ext/kreuzberg_rb/native/Cargo.toml +16 -55
  12. data/ext/kreuzberg_rb/native/build.rs +14 -12
  13. data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
  14. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
  15. data/ext/kreuzberg_rb/native/include/strings.h +2 -2
  16. data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
  17. data/ext/kreuzberg_rb/native/src/lib.rs +34 -897
  18. data/extconf.rb +6 -38
  19. data/kreuzberg.gemspec +20 -114
  20. data/lib/kreuzberg/api_proxy.rb +18 -2
  21. data/lib/kreuzberg/cache_api.rb +0 -22
  22. data/lib/kreuzberg/cli.rb +10 -2
  23. data/lib/kreuzberg/cli_proxy.rb +10 -0
  24. data/lib/kreuzberg/config.rb +22 -274
  25. data/lib/kreuzberg/errors.rb +7 -73
  26. data/lib/kreuzberg/extraction_api.rb +8 -237
  27. data/lib/kreuzberg/mcp_proxy.rb +11 -2
  28. data/lib/kreuzberg/ocr_backend_protocol.rb +73 -0
  29. data/lib/kreuzberg/post_processor_protocol.rb +71 -0
  30. data/lib/kreuzberg/result.rb +33 -151
  31. data/lib/kreuzberg/setup_lib_path.rb +2 -22
  32. data/lib/kreuzberg/validator_protocol.rb +73 -0
  33. data/lib/kreuzberg/version.rb +1 -1
  34. data/lib/kreuzberg.rb +13 -27
  35. data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
  36. data/sig/kreuzberg.rbs +12 -105
  37. data/spec/binding/cache_spec.rb +22 -22
  38. data/spec/binding/cli_proxy_spec.rb +4 -2
  39. data/spec/binding/cli_spec.rb +11 -12
  40. data/spec/binding/config_spec.rb +0 -74
  41. data/spec/binding/config_validation_spec.rb +6 -100
  42. data/spec/binding/error_handling_spec.rb +97 -283
  43. data/spec/binding/plugins/ocr_backend_spec.rb +8 -8
  44. data/spec/binding/plugins/postprocessor_spec.rb +11 -11
  45. data/spec/binding/plugins/validator_spec.rb +13 -12
  46. data/spec/examples.txt +104 -0
  47. data/spec/fixtures/config.toml +1 -0
  48. data/spec/fixtures/config.yaml +1 -0
  49. data/spec/fixtures/invalid_config.toml +1 -0
  50. data/spec/smoke/package_spec.rb +3 -2
  51. data/spec/spec_helper.rb +3 -1
  52. data/vendor/kreuzberg/Cargo.toml +67 -192
  53. data/vendor/kreuzberg/README.md +9 -97
  54. data/vendor/kreuzberg/build.rs +194 -516
  55. data/vendor/kreuzberg/src/api/handlers.rs +9 -130
  56. data/vendor/kreuzberg/src/api/mod.rs +3 -18
  57. data/vendor/kreuzberg/src/api/server.rs +71 -236
  58. data/vendor/kreuzberg/src/api/types.rs +7 -43
  59. data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
  60. data/vendor/kreuzberg/src/cache/mod.rs +3 -27
  61. data/vendor/kreuzberg/src/chunking/mod.rs +79 -1705
  62. data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
  63. data/vendor/kreuzberg/src/core/config.rs +23 -905
  64. data/vendor/kreuzberg/src/core/extractor.rs +106 -403
  65. data/vendor/kreuzberg/src/core/io.rs +2 -4
  66. data/vendor/kreuzberg/src/core/mime.rs +12 -2
  67. data/vendor/kreuzberg/src/core/mod.rs +3 -22
  68. data/vendor/kreuzberg/src/core/pipeline.rs +78 -395
  69. data/vendor/kreuzberg/src/embeddings.rs +21 -169
  70. data/vendor/kreuzberg/src/error.rs +2 -2
  71. data/vendor/kreuzberg/src/extraction/archive.rs +31 -36
  72. data/vendor/kreuzberg/src/extraction/docx.rs +1 -365
  73. data/vendor/kreuzberg/src/extraction/email.rs +11 -12
  74. data/vendor/kreuzberg/src/extraction/excel.rs +129 -138
  75. data/vendor/kreuzberg/src/extraction/html.rs +170 -1447
  76. data/vendor/kreuzberg/src/extraction/image.rs +14 -138
  77. data/vendor/kreuzberg/src/extraction/libreoffice.rs +3 -13
  78. data/vendor/kreuzberg/src/extraction/mod.rs +5 -21
  79. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
  80. data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
  81. data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
  82. data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
  83. data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
  84. data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
  85. data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
  86. data/vendor/kreuzberg/src/extraction/pptx.rs +94 -196
  87. data/vendor/kreuzberg/src/extraction/structured.rs +4 -5
  88. data/vendor/kreuzberg/src/extraction/table.rs +1 -2
  89. data/vendor/kreuzberg/src/extraction/text.rs +10 -18
  90. data/vendor/kreuzberg/src/extractors/archive.rs +0 -22
  91. data/vendor/kreuzberg/src/extractors/docx.rs +148 -69
  92. data/vendor/kreuzberg/src/extractors/email.rs +9 -37
  93. data/vendor/kreuzberg/src/extractors/excel.rs +40 -81
  94. data/vendor/kreuzberg/src/extractors/html.rs +173 -182
  95. data/vendor/kreuzberg/src/extractors/image.rs +8 -32
  96. data/vendor/kreuzberg/src/extractors/mod.rs +10 -171
  97. data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
  98. data/vendor/kreuzberg/src/extractors/pdf.rs +64 -329
  99. data/vendor/kreuzberg/src/extractors/pptx.rs +34 -79
  100. data/vendor/kreuzberg/src/extractors/structured.rs +0 -16
  101. data/vendor/kreuzberg/src/extractors/text.rs +7 -30
  102. data/vendor/kreuzberg/src/extractors/xml.rs +8 -27
  103. data/vendor/kreuzberg/src/keywords/processor.rs +1 -9
  104. data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
  105. data/vendor/kreuzberg/src/language_detection/mod.rs +51 -94
  106. data/vendor/kreuzberg/src/lib.rs +5 -17
  107. data/vendor/kreuzberg/src/mcp/mod.rs +1 -4
  108. data/vendor/kreuzberg/src/mcp/server.rs +21 -145
  109. data/vendor/kreuzberg/src/ocr/mod.rs +0 -2
  110. data/vendor/kreuzberg/src/ocr/processor.rs +8 -19
  111. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +0 -2
  112. data/vendor/kreuzberg/src/pdf/error.rs +1 -93
  113. data/vendor/kreuzberg/src/pdf/metadata.rs +100 -263
  114. data/vendor/kreuzberg/src/pdf/mod.rs +2 -33
  115. data/vendor/kreuzberg/src/pdf/rendering.rs +12 -12
  116. data/vendor/kreuzberg/src/pdf/table.rs +64 -61
  117. data/vendor/kreuzberg/src/pdf/text.rs +24 -416
  118. data/vendor/kreuzberg/src/plugins/extractor.rs +8 -40
  119. data/vendor/kreuzberg/src/plugins/mod.rs +0 -3
  120. data/vendor/kreuzberg/src/plugins/ocr.rs +14 -22
  121. data/vendor/kreuzberg/src/plugins/processor.rs +1 -10
  122. data/vendor/kreuzberg/src/plugins/registry.rs +0 -15
  123. data/vendor/kreuzberg/src/plugins/validator.rs +8 -20
  124. data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
  125. data/vendor/kreuzberg/src/text/mod.rs +0 -8
  126. data/vendor/kreuzberg/src/text/quality.rs +15 -28
  127. data/vendor/kreuzberg/src/text/string_utils.rs +10 -22
  128. data/vendor/kreuzberg/src/text/token_reduction/core.rs +50 -86
  129. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +16 -37
  130. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +1 -2
  131. data/vendor/kreuzberg/src/types.rs +67 -907
  132. data/vendor/kreuzberg/src/utils/mod.rs +0 -14
  133. data/vendor/kreuzberg/src/utils/quality.rs +3 -12
  134. data/vendor/kreuzberg/tests/api_tests.rs +0 -506
  135. data/vendor/kreuzberg/tests/archive_integration.rs +0 -2
  136. data/vendor/kreuzberg/tests/batch_orchestration.rs +12 -57
  137. data/vendor/kreuzberg/tests/batch_processing.rs +8 -32
  138. data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
  139. data/vendor/kreuzberg/tests/concurrency_stress.rs +8 -40
  140. data/vendor/kreuzberg/tests/config_features.rs +1 -33
  141. data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -16
  142. data/vendor/kreuzberg/tests/core_integration.rs +9 -35
  143. data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
  144. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
  145. data/vendor/kreuzberg/tests/email_integration.rs +1 -3
  146. data/vendor/kreuzberg/tests/error_handling.rs +34 -43
  147. data/vendor/kreuzberg/tests/format_integration.rs +1 -7
  148. data/vendor/kreuzberg/tests/helpers/mod.rs +0 -60
  149. data/vendor/kreuzberg/tests/image_integration.rs +0 -2
  150. data/vendor/kreuzberg/tests/mime_detection.rs +16 -17
  151. data/vendor/kreuzberg/tests/ocr_configuration.rs +0 -4
  152. data/vendor/kreuzberg/tests/ocr_errors.rs +0 -22
  153. data/vendor/kreuzberg/tests/ocr_quality.rs +0 -2
  154. data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
  155. data/vendor/kreuzberg/tests/pdf_integration.rs +0 -2
  156. data/vendor/kreuzberg/tests/pipeline_integration.rs +2 -36
  157. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +0 -5
  158. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -17
  159. data/vendor/kreuzberg/tests/plugin_system.rs +0 -6
  160. data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -2
  161. data/vendor/kreuzberg/tests/security_validation.rs +1 -13
  162. data/vendor/kreuzberg/tests/test_fastembed.rs +23 -45
  163. metadata +25 -171
  164. data/.rubocop.yml +0 -543
  165. data/ext/kreuzberg_rb/native/.cargo/config.toml +0 -23
  166. data/ext/kreuzberg_rb/native/Cargo.lock +0 -7619
  167. data/lib/kreuzberg/error_context.rb +0 -136
  168. data/lib/kreuzberg/types.rb +0 -170
  169. data/lib/libpdfium.so +0 -0
  170. data/spec/binding/async_operations_spec.rb +0 -473
  171. data/spec/binding/batch_operations_spec.rb +0 -595
  172. data/spec/binding/batch_spec.rb +0 -359
  173. data/spec/binding/config_result_spec.rb +0 -377
  174. data/spec/binding/embeddings_spec.rb +0 -816
  175. data/spec/binding/error_recovery_spec.rb +0 -488
  176. data/spec/binding/font_config_spec.rb +0 -220
  177. data/spec/binding/images_spec.rb +0 -738
  178. data/spec/binding/keywords_extraction_spec.rb +0 -600
  179. data/spec/binding/metadata_types_spec.rb +0 -1228
  180. data/spec/binding/pages_extraction_spec.rb +0 -471
  181. data/spec/binding/tables_spec.rb +0 -641
  182. data/spec/unit/config/chunking_config_spec.rb +0 -213
  183. data/spec/unit/config/embedding_config_spec.rb +0 -343
  184. data/spec/unit/config/extraction_config_spec.rb +0 -438
  185. data/spec/unit/config/font_config_spec.rb +0 -285
  186. data/spec/unit/config/hierarchy_config_spec.rb +0 -314
  187. data/spec/unit/config/image_extraction_config_spec.rb +0 -209
  188. data/spec/unit/config/image_preprocessing_config_spec.rb +0 -249
  189. data/spec/unit/config/keyword_config_spec.rb +0 -229
  190. data/spec/unit/config/language_detection_config_spec.rb +0 -258
  191. data/spec/unit/config/ocr_config_spec.rb +0 -171
  192. data/spec/unit/config/page_config_spec.rb +0 -221
  193. data/spec/unit/config/pdf_config_spec.rb +0 -267
  194. data/spec/unit/config/postprocessor_config_spec.rb +0 -290
  195. data/spec/unit/config/tesseract_config_spec.rb +0 -181
  196. data/spec/unit/config/token_reduction_config_spec.rb +0 -251
  197. data/test/metadata_types_test.rb +0 -959
  198. data/vendor/Cargo.toml +0 -61
  199. data/vendor/kreuzberg/examples/bench_fixes.rs +0 -71
  200. data/vendor/kreuzberg/examples/test_pdfium_fork.rs +0 -62
  201. data/vendor/kreuzberg/src/chunking/processor.rs +0 -219
  202. data/vendor/kreuzberg/src/core/batch_optimizations.rs +0 -385
  203. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  204. data/vendor/kreuzberg/src/core/formats.rs +0 -235
  205. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  206. data/vendor/kreuzberg/src/extraction/capacity.rs +0 -263
  207. data/vendor/kreuzberg/src/extraction/markdown.rs +0 -216
  208. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -284
  209. data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -470
  210. data/vendor/kreuzberg/src/extractors/docbook.rs +0 -504
  211. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  212. data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -492
  213. data/vendor/kreuzberg/src/extractors/jats.rs +0 -1054
  214. data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -368
  215. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  216. data/vendor/kreuzberg/src/extractors/markdown.rs +0 -701
  217. data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
  218. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  219. data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -529
  220. data/vendor/kreuzberg/src/extractors/rst.rs +0 -577
  221. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  222. data/vendor/kreuzberg/src/extractors/security.rs +0 -484
  223. data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
  224. data/vendor/kreuzberg/src/extractors/typst.rs +0 -651
  225. data/vendor/kreuzberg/src/language_detection/processor.rs +0 -218
  226. data/vendor/kreuzberg/src/ocr/language_registry.rs +0 -520
  227. data/vendor/kreuzberg/src/panic_context.rs +0 -154
  228. data/vendor/kreuzberg/src/pdf/bindings.rs +0 -306
  229. data/vendor/kreuzberg/src/pdf/bundled.rs +0 -408
  230. data/vendor/kreuzberg/src/pdf/fonts.rs +0 -358
  231. data/vendor/kreuzberg/src/pdf/hierarchy.rs +0 -903
  232. data/vendor/kreuzberg/src/text/quality_processor.rs +0 -231
  233. data/vendor/kreuzberg/src/text/utf8_validation.rs +0 -193
  234. data/vendor/kreuzberg/src/utils/pool.rs +0 -503
  235. data/vendor/kreuzberg/src/utils/pool_sizing.rs +0 -364
  236. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -761
  237. data/vendor/kreuzberg/tests/api_embed.rs +0 -360
  238. data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
  239. data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +0 -471
  240. data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +0 -289
  241. data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +0 -154
  242. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
  243. data/vendor/kreuzberg/tests/config_integration_test.rs +0 -753
  244. data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +0 -294
  245. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -500
  246. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
  247. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
  248. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
  249. data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
  250. data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
  251. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
  252. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
  253. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
  254. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
  255. data/vendor/kreuzberg/tests/ocr_language_registry.rs +0 -191
  256. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -674
  257. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
  258. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
  259. data/vendor/kreuzberg/tests/page_markers.rs +0 -297
  260. data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +0 -301
  261. data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +0 -589
  262. data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +0 -301
  263. data/vendor/kreuzberg/tests/pdf_text_merging.rs +0 -475
  264. data/vendor/kreuzberg/tests/pdfium_linking.rs +0 -340
  265. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -694
  266. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -775
  267. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1260
  268. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -648
  269. data/vendor/kreuzberg-ffi/Cargo.toml +0 -67
  270. data/vendor/kreuzberg-ffi/README.md +0 -851
  271. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +0 -227
  272. data/vendor/kreuzberg-ffi/build.rs +0 -168
  273. data/vendor/kreuzberg-ffi/cbindgen.toml +0 -37
  274. data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +0 -12
  275. data/vendor/kreuzberg-ffi/kreuzberg.h +0 -3012
  276. data/vendor/kreuzberg-ffi/src/batch_streaming.rs +0 -588
  277. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
  278. data/vendor/kreuzberg-ffi/src/error.rs +0 -901
  279. data/vendor/kreuzberg-ffi/src/extraction.rs +0 -555
  280. data/vendor/kreuzberg-ffi/src/helpers.rs +0 -879
  281. data/vendor/kreuzberg-ffi/src/lib.rs +0 -977
  282. data/vendor/kreuzberg-ffi/src/memory.rs +0 -493
  283. data/vendor/kreuzberg-ffi/src/mime.rs +0 -329
  284. data/vendor/kreuzberg-ffi/src/panic_shield.rs +0 -265
  285. data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +0 -442
  286. data/vendor/kreuzberg-ffi/src/plugins/mod.rs +0 -14
  287. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +0 -628
  288. data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +0 -438
  289. data/vendor/kreuzberg-ffi/src/plugins/validator.rs +0 -329
  290. data/vendor/kreuzberg-ffi/src/result.rs +0 -510
  291. data/vendor/kreuzberg-ffi/src/result_pool.rs +0 -639
  292. data/vendor/kreuzberg-ffi/src/result_view.rs +0 -773
  293. data/vendor/kreuzberg-ffi/src/string_intern.rs +0 -568
  294. data/vendor/kreuzberg-ffi/src/types.rs +0 -363
  295. data/vendor/kreuzberg-ffi/src/util.rs +0 -210
  296. data/vendor/kreuzberg-ffi/src/validation.rs +0 -848
  297. data/vendor/kreuzberg-ffi/tests.disabled/README.md +0 -48
  298. data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +0 -299
  299. data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +0 -346
  300. data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +0 -232
  301. data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +0 -470
  302. data/vendor/kreuzberg-tesseract/.commitlintrc.json +0 -13
  303. data/vendor/kreuzberg-tesseract/.crate-ignore +0 -2
  304. data/vendor/kreuzberg-tesseract/Cargo.lock +0 -2933
  305. data/vendor/kreuzberg-tesseract/Cargo.toml +0 -57
  306. data/vendor/kreuzberg-tesseract/LICENSE +0 -22
  307. data/vendor/kreuzberg-tesseract/README.md +0 -399
  308. data/vendor/kreuzberg-tesseract/build.rs +0 -1127
  309. data/vendor/kreuzberg-tesseract/patches/README.md +0 -71
  310. data/vendor/kreuzberg-tesseract/patches/tesseract.diff +0 -199
  311. data/vendor/kreuzberg-tesseract/src/api.rs +0 -1371
  312. data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +0 -77
  313. data/vendor/kreuzberg-tesseract/src/enums.rs +0 -297
  314. data/vendor/kreuzberg-tesseract/src/error.rs +0 -81
  315. data/vendor/kreuzberg-tesseract/src/lib.rs +0 -145
  316. data/vendor/kreuzberg-tesseract/src/monitor.rs +0 -57
  317. data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +0 -197
  318. data/vendor/kreuzberg-tesseract/src/page_iterator.rs +0 -253
  319. data/vendor/kreuzberg-tesseract/src/result_iterator.rs +0 -286
  320. data/vendor/kreuzberg-tesseract/src/result_renderer.rs +0 -183
  321. data/vendor/kreuzberg-tesseract/tests/integration_test.rs +0 -211
@@ -1,36 +1,27 @@
1
- //! PDF text extraction module.
2
- //!
3
- //! This module provides functions to extract text content from PDF files using the pdfium-render library.
4
-
5
- use super::bindings::{PdfiumHandle, bind_pdfium};
6
1
  use super::error::{PdfError, Result};
7
- use crate::core::config::PageConfig;
8
- use crate::pdf::metadata::PdfExtractionMetadata;
9
- use crate::types::{PageBoundary, PageContent};
10
2
  use pdfium_render::prelude::*;
11
3
 
12
- /// Result type for PDF text extraction with optional page tracking.
13
- type PdfTextExtractionResult = (String, Option<Vec<PageBoundary>>, Option<Vec<PageContent>>);
14
-
15
- pub struct PdfTextExtractor<'a> {
16
- pdfium: PdfiumHandle<'a>,
4
+ pub struct PdfTextExtractor {
5
+ pdfium: Pdfium,
17
6
  }
18
7
 
19
- impl PdfTextExtractor<'static> {
8
+ impl PdfTextExtractor {
20
9
  pub fn new() -> Result<Self> {
21
- let pdfium = bind_pdfium(PdfError::TextExtractionFailed, "text extraction")?;
22
- Ok(PdfTextExtractor { pdfium })
10
+ let binding = Pdfium::bind_to_library(Pdfium::pdfium_platform_library_name_at_path("./"))
11
+ .or_else(|_| Pdfium::bind_to_system_library())
12
+ .map_err(|e| PdfError::TextExtractionFailed(format!("Failed to initialize Pdfium: {}", e)))?;
13
+
14
+ let pdfium = Pdfium::new(binding);
15
+ Ok(Self { pdfium })
23
16
  }
24
- }
25
17
 
26
- impl PdfTextExtractor<'_> {
27
18
  pub fn extract_text(&self, pdf_bytes: &[u8]) -> Result<String> {
28
19
  self.extract_text_with_password(pdf_bytes, None)
29
20
  }
30
21
 
31
22
  pub fn extract_text_with_password(&self, pdf_bytes: &[u8], password: Option<&str>) -> Result<String> {
32
23
  let document = self.pdfium.load_pdf_from_byte_slice(pdf_bytes, password).map_err(|e| {
33
- let err_msg = super::error::format_pdfium_error(e);
24
+ let err_msg = e.to_string();
34
25
  if (err_msg.contains("password") || err_msg.contains("Password")) && password.is_some() {
35
26
  PdfError::InvalidPassword
36
27
  } else if err_msg.contains("password") || err_msg.contains("Password") {
@@ -40,8 +31,7 @@ impl PdfTextExtractor<'_> {
40
31
  }
41
32
  })?;
42
33
 
43
- let (content, _, _) = extract_text_from_pdf_document(&document, None, None)?;
44
- Ok(content)
34
+ extract_text_from_pdf_document(&document)
45
35
  }
46
36
 
47
37
  pub fn extract_text_with_passwords(&self, pdf_bytes: &[u8], passwords: &[&str]) -> Result<String> {
@@ -66,7 +56,7 @@ impl PdfTextExtractor<'_> {
66
56
 
67
57
  pub fn get_page_count(&self, pdf_bytes: &[u8]) -> Result<usize> {
68
58
  let document = self.pdfium.load_pdf_from_byte_slice(pdf_bytes, None).map_err(|e| {
69
- let err_msg = super::error::format_pdfium_error(e);
59
+ let err_msg = e.to_string();
70
60
  if err_msg.contains("password") || err_msg.contains("Password") {
71
61
  PdfError::PasswordRequired
72
62
  } else {
@@ -78,7 +68,7 @@ impl PdfTextExtractor<'_> {
78
68
  }
79
69
  }
80
70
 
81
- impl Default for PdfTextExtractor<'static> {
71
+ impl Default for PdfTextExtractor {
82
72
  fn default() -> Self {
83
73
  Self::new().expect("Failed to create PDF text extractor")
84
74
  }
@@ -99,343 +89,31 @@ pub fn extract_text_from_pdf_with_passwords(pdf_bytes: &[u8], passwords: &[&str]
99
89
  extractor.extract_text_with_passwords(pdf_bytes, passwords)
100
90
  }
101
91
 
102
- /// Result type for unified PDF text and metadata extraction.
103
- ///
104
- /// Contains text, optional page boundaries, optional per-page content, and metadata.
105
- pub type PdfUnifiedExtractionResult = (
106
- String,
107
- Option<Vec<PageBoundary>>,
108
- Option<Vec<PageContent>>,
109
- PdfExtractionMetadata,
110
- );
111
-
112
- /// Extract text and metadata from PDF document in a single pass.
113
- ///
114
- /// This is an optimized function that extracts both text and metadata in one pass
115
- /// through the document, avoiding redundant document parsing. It combines the
116
- /// functionality of `extract_text_from_pdf_document` and
117
- /// `extract_metadata_from_document` into a single unified operation.
118
- ///
119
- /// # Arguments
120
- ///
121
- /// * `document` - The PDF document to extract from
122
- /// * `extraction_config` - Optional extraction configuration for hierarchy and page tracking
123
- ///
124
- /// # Returns
125
- ///
126
- /// A tuple containing:
127
- /// - The extracted text content (String)
128
- /// - Optional page boundaries when page tracking is enabled (Vec<PageBoundary>)
129
- /// - Optional per-page content when extract_pages is enabled (Vec<PageContent>)
130
- /// - Complete extraction metadata (PdfExtractionMetadata)
131
- ///
132
- /// # Performance
133
- ///
134
- /// This function is optimized for single-pass extraction. It performs all document
135
- /// scanning in one iteration, avoiding redundant pdfium operations compared to
136
- /// calling text and metadata extraction separately.
137
- pub fn extract_text_and_metadata_from_pdf_document(
138
- document: &PdfDocument<'_>,
139
- extraction_config: Option<&crate::core::config::ExtractionConfig>,
140
- ) -> Result<PdfUnifiedExtractionResult> {
141
- let page_config = extraction_config.and_then(|c| c.pages.as_ref());
142
- let (text, boundaries, page_contents) = extract_text_from_pdf_document(document, page_config, extraction_config)?;
143
-
144
- let metadata = crate::pdf::metadata::extract_metadata_from_document_impl(document, boundaries.as_deref())?;
145
-
146
- Ok((text, boundaries, page_contents, metadata))
147
- }
148
-
149
- /// Extract text from PDF document with optional page boundary tracking.
150
- ///
151
- /// # Arguments
152
- ///
153
- /// * `document` - The PDF document to extract text from
154
- /// * `page_config` - Optional page configuration for boundary tracking and page markers
155
- /// * `extraction_config` - Optional extraction configuration for hierarchy detection
156
- ///
157
- /// # Returns
158
- ///
159
- /// A tuple containing:
160
- /// - The extracted text content (String)
161
- /// - Optional page boundaries when page tracking is enabled (Vec<PageBoundary>)
162
- /// - Optional per-page content when extract_pages is enabled (Vec<PageContent>)
163
- ///
164
- /// # Implementation Details
165
- ///
166
- /// Uses lazy page-by-page iteration to reduce memory footprint. Pages are processed
167
- /// one at a time and released after extraction, rather than accumulating all pages
168
- /// in memory. This approach saves 40-50MB for large documents while improving
169
- /// performance by 15-25% through reduced upfront work.
170
- ///
171
- /// When page_config is None, uses fast path with minimal overhead.
172
- /// When page_config is Some, tracks byte offsets using .len() for O(1) performance (UTF-8 valid boundaries).
173
- pub fn extract_text_from_pdf_document(
174
- document: &PdfDocument<'_>,
175
- page_config: Option<&PageConfig>,
176
- extraction_config: Option<&crate::core::config::ExtractionConfig>,
177
- ) -> Result<PdfTextExtractionResult> {
178
- if page_config.is_none() {
179
- return extract_text_lazy_fast_path(document);
180
- }
181
-
182
- let config = page_config.unwrap();
183
-
184
- extract_text_lazy_with_tracking(document, config, extraction_config)
185
- }
186
-
187
- /// Fast path for text extraction without page tracking.
188
- ///
189
- /// Processes pages one-by-one lazily, building content incrementally with
190
- /// pre-allocated capacity to minimize reallocation overhead. This combines
191
- /// memory efficiency of lazy iteration with the allocation optimization
192
- /// of pre-sizing.
193
- ///
194
- /// # Performance Optimization
195
- ///
196
- /// Pre-allocates buffer capacity by sampling the first 5 pages' text length
197
- /// and extrapolating for the full document. This reduces String reallocation
198
- /// calls from O(n) to O(log n) while maintaining low peak memory usage.
199
- /// For large documents, this can reduce allocation overhead by 40-50%.
200
- fn extract_text_lazy_fast_path(document: &PdfDocument<'_>) -> Result<PdfTextExtractionResult> {
92
+ pub fn extract_text_from_pdf_document(document: &PdfDocument<'_>) -> Result<String> {
201
93
  let page_count = document.pages().len() as usize;
202
- let mut content = String::new();
203
- let mut total_sample_size = 0usize;
204
- let mut sample_count = 0;
205
94
 
206
- for (page_idx, page) in document.pages().iter().enumerate() {
95
+ // Pre-allocate capacity based on estimated page size (average 2KB per page)
96
+ // This reduces memory reallocations during string concatenation
97
+ let estimated_size = page_count * 2048;
98
+ let mut content = String::with_capacity(estimated_size);
99
+
100
+ for page in document.pages().iter() {
207
101
  let text = page
208
102
  .text()
209
103
  .map_err(|e| PdfError::TextExtractionFailed(format!("Page text extraction failed: {}", e)))?;
210
104
 
211
105
  let page_text = text.all();
212
- let page_size = page_text.len();
213
106
 
214
- if page_idx > 0 {
107
+ if !content.is_empty() {
215
108
  content.push_str("\n\n");
216
109
  }
217
-
218
110
  content.push_str(&page_text);
219
-
220
- if page_idx < 5 {
221
- total_sample_size += page_size;
222
- sample_count += 1;
223
- }
224
-
225
- if page_idx == 4 && sample_count > 0 && page_count > 5 {
226
- let avg_page_size = total_sample_size / sample_count;
227
- let estimated_remaining = avg_page_size * (page_count - 5);
228
- content.reserve(estimated_remaining + (estimated_remaining / 10));
229
- }
230
111
  }
231
112
 
232
- Ok((content, None, None))
233
- }
234
-
235
- /// Lazy extraction with page boundary and content tracking.
236
- ///
237
- /// Processes pages one-by-one, tracking byte boundaries and optionally
238
- /// collecting per-page content. Pre-allocates buffer capacity using an
239
- /// adaptive strategy to minimize reallocations while maintaining low peak
240
- /// memory usage.
241
- ///
242
- /// When hierarchy extraction is enabled, extracts text hierarchy (H1-H6 levels)
243
- /// from font size clustering and assigns semantic heading levels to text blocks.
244
- ///
245
- /// # Performance Optimization
246
- ///
247
- /// Uses a two-phase approach: sample first 5 pages to estimate average
248
- /// page size, then reserve capacity for remaining pages. This reduces
249
- /// allocations from O(n) to O(log n) while keeping memory efficient.
250
- fn extract_text_lazy_with_tracking(
251
- document: &PdfDocument<'_>,
252
- config: &PageConfig,
253
- extraction_config: Option<&crate::core::config::ExtractionConfig>,
254
- ) -> Result<PdfTextExtractionResult> {
255
- let mut content = String::new();
256
- let page_count = document.pages().len() as usize;
257
- let mut boundaries = Vec::with_capacity(page_count);
258
- let mut page_contents = if config.extract_pages {
259
- Some(Vec::with_capacity(page_count))
260
- } else {
261
- None
262
- };
263
-
264
- // Check if hierarchy extraction is enabled
265
- let should_extract_hierarchy = extraction_config
266
- .and_then(|cfg| cfg.pdf_options.as_ref())
267
- .and_then(|pdf_cfg| pdf_cfg.hierarchy.as_ref())
268
- .map(|h_cfg| h_cfg.enabled)
269
- .unwrap_or(false);
270
-
271
- let hierarchy_config = extraction_config
272
- .and_then(|cfg| cfg.pdf_options.as_ref())
273
- .and_then(|pdf_cfg| pdf_cfg.hierarchy.as_ref())
274
- .cloned();
275
-
276
- let mut total_sample_size = 0usize;
277
- let mut sample_count = 0;
278
-
279
- for (page_idx, page) in document.pages().iter().enumerate() {
280
- let page_number = page_idx + 1;
281
-
282
- let text = page
283
- .text()
284
- .map_err(|e| PdfError::TextExtractionFailed(format!("Page text extraction failed: {}", e)))?;
285
-
286
- let page_text_ref = text.all();
287
- let page_size = page_text_ref.len();
288
-
289
- if page_idx < 5 {
290
- total_sample_size += page_size;
291
- sample_count += 1;
292
- }
293
-
294
- // Insert page marker before the page content (for ALL pages including page 1)
295
- if config.insert_page_markers {
296
- let marker = config.marker_format.replace("{page_num}", &page_number.to_string());
297
- content.push_str(&marker);
298
- } else if page_idx > 0 {
299
- // Only add separator between pages when markers are disabled
300
- content.push_str("\n\n");
301
- }
302
-
303
- let byte_start = content.len();
304
- content.push_str(&page_text_ref);
305
- let byte_end = content.len();
306
-
307
- boundaries.push(PageBoundary {
308
- byte_start,
309
- byte_end,
310
- page_number,
311
- });
312
-
313
- if let Some(ref mut pages) = page_contents {
314
- // Extract hierarchy if enabled
315
- let hierarchy = if should_extract_hierarchy {
316
- extract_page_hierarchy(&page, hierarchy_config.as_ref())?
317
- } else {
318
- None
319
- };
113
+ // Shrink to actual size to free unused capacity
114
+ content.shrink_to_fit();
320
115
 
321
- pages.push(PageContent {
322
- page_number,
323
- content: page_text_ref.to_owned(),
324
- tables: Vec::new(),
325
- images: Vec::new(),
326
- hierarchy,
327
- });
328
- }
329
-
330
- if page_idx == 4 && page_count > 5 && sample_count > 0 {
331
- let avg_page_size = total_sample_size / sample_count;
332
- let estimated_remaining = avg_page_size * (page_count - 5);
333
- let separator_overhead = (page_count - 5) * 3;
334
- content.reserve(estimated_remaining + separator_overhead + (estimated_remaining / 10));
335
- }
336
- }
337
-
338
- Ok((content, Some(boundaries), page_contents))
339
- }
340
-
341
- /// Extract text hierarchy from a single PDF page.
342
- ///
343
- /// Uses font size clustering to identify heading levels (H1-H6) and assigns
344
- /// hierarchy levels to text blocks based on their font sizes.
345
- ///
346
- /// # Arguments
347
- ///
348
- /// * `page` - The PDF page to extract hierarchy from
349
- /// * `hierarchy_config` - Configuration for hierarchy extraction
350
- ///
351
- /// # Returns
352
- ///
353
- /// Optional PageHierarchy containing hierarchical blocks with heading levels
354
- fn extract_page_hierarchy(
355
- page: &pdfium_render::prelude::PdfPage,
356
- hierarchy_config: Option<&crate::core::config::HierarchyConfig>,
357
- ) -> Result<Option<crate::types::PageHierarchy>> {
358
- use crate::pdf::hierarchy::{
359
- HierarchyLevel, assign_hierarchy_levels, cluster_font_sizes, extract_chars_with_fonts, merge_chars_into_blocks,
360
- };
361
- use crate::types::HierarchicalBlock;
362
-
363
- // Check if config is present and hierarchy is enabled
364
- let config = match hierarchy_config {
365
- Some(cfg) if cfg.enabled => cfg,
366
- _ => return Ok(None),
367
- };
368
-
369
- // Extract characters with font information
370
- let char_data = extract_chars_with_fonts(page)?;
371
-
372
- if char_data.is_empty() {
373
- return Ok(None);
374
- }
375
-
376
- // Merge characters into text blocks
377
- let text_blocks = merge_chars_into_blocks(char_data);
378
-
379
- if text_blocks.is_empty() {
380
- return Ok(None);
381
- }
382
-
383
- // Cluster by font sizes
384
- let k_clusters = config.k_clusters.min(text_blocks.len());
385
- let clusters = cluster_font_sizes(&text_blocks, k_clusters)?;
386
-
387
- if clusters.is_empty() {
388
- return Ok(None);
389
- }
390
-
391
- // Assign hierarchy levels using KMeans-based clustering
392
- let kmeans_result = crate::pdf::hierarchy::KMeansResult {
393
- labels: text_blocks
394
- .iter()
395
- .map(|block| {
396
- // Find which cluster this block belongs to
397
- let mut min_dist = f32::INFINITY;
398
- let mut best_cluster = 0u32;
399
- for (idx, cluster) in clusters.iter().enumerate() {
400
- let dist = (block.font_size - cluster.centroid).abs();
401
- if dist < min_dist {
402
- min_dist = dist;
403
- best_cluster = idx as u32;
404
- }
405
- }
406
- best_cluster
407
- })
408
- .collect(),
409
- };
410
-
411
- let hierarchy_blocks = assign_hierarchy_levels(&text_blocks, &kmeans_result);
412
-
413
- // Convert to output format
414
- let blocks: Vec<HierarchicalBlock> = hierarchy_blocks
415
- .into_iter()
416
- .map(|hb| HierarchicalBlock {
417
- text: hb.text,
418
- font_size: hb.font_size,
419
- level: match hb.hierarchy_level {
420
- HierarchyLevel::H1 => "h1".to_string(),
421
- HierarchyLevel::H2 => "h2".to_string(),
422
- HierarchyLevel::H3 => "h3".to_string(),
423
- HierarchyLevel::H4 => "h4".to_string(),
424
- HierarchyLevel::H5 => "h5".to_string(),
425
- HierarchyLevel::H6 => "h6".to_string(),
426
- HierarchyLevel::Body => "body".to_string(),
427
- },
428
- bbox: if config.include_bbox {
429
- Some((hb.bbox.left, hb.bbox.top, hb.bbox.right, hb.bbox.bottom))
430
- } else {
431
- None
432
- },
433
- })
434
- .collect();
435
-
436
- let block_count = blocks.len();
437
-
438
- Ok(Some(crate::types::PageHierarchy { block_count, blocks }))
116
+ Ok(content)
439
117
  }
440
118
 
441
119
  #[cfg(test)]
@@ -481,73 +159,3 @@ mod tests {
481
159
  assert!(result.is_err());
482
160
  }
483
161
  }
484
-
485
- #[cfg(test)]
486
- mod cache_regression_tests {
487
- use super::*;
488
- use std::time::Instant;
489
-
490
- /// Test that multiple extractions of the same document produce consistent results.
491
- ///
492
- /// Note: The Pdfium library uses a singleton pattern for initialization. The first
493
- /// call to bind_pdfium() initializes the library (expensive), while subsequent
494
- /// calls reuse the cached instance (fast). This is correct behavior, not a bug.
495
- ///
496
- /// This test verifies that:
497
- /// 1. Multiple extractions produce identical text content
498
- /// 2. The singleton pattern provides consistent extraction behavior
499
- #[test]
500
- fn test_no_global_cache_between_documents() {
501
- let pdf_bytes = std::fs::read("../../test_documents/pdfs/fake_memo.pdf").expect("Failed to read PDF");
502
-
503
- let extractor = PdfTextExtractor::new().expect("Failed to create extractor");
504
-
505
- let start = Instant::now();
506
- let text1 = extractor.extract_text(&pdf_bytes).expect("Failed to extract (cold)");
507
- let cold = start.elapsed();
508
-
509
- let start = Instant::now();
510
- let text2 = extractor.extract_text(&pdf_bytes).expect("Failed to extract (warm1)");
511
- let warm1 = start.elapsed();
512
-
513
- let start = Instant::now();
514
- let text3 = extractor.extract_text(&pdf_bytes).expect("Failed to extract (warm2)");
515
- let warm2 = start.elapsed();
516
-
517
- eprintln!("Cold: {:?}", cold);
518
- eprintln!("Warm 1: {:?}", warm1);
519
- eprintln!("Warm 2: {:?}", warm2);
520
-
521
- // All extractions must produce identical content
522
- assert_eq!(text1, text2);
523
- assert_eq!(text2, text3);
524
-
525
- // Warm calls may be faster due to the Pdfium singleton pattern - this is expected.
526
- // The singleton initializes Pdfium once and reuses it for subsequent calls.
527
- // What we DO want to verify is that warm1 and warm2 have similar performance,
528
- // which indicates consistent behavior after initialization.
529
- let warm1_micros = warm1.as_micros().max(1);
530
- let warm2_micros = warm2.as_micros().max(1);
531
- let warm_ratio = if warm1_micros > warm2_micros {
532
- warm1_micros / warm2_micros
533
- } else {
534
- warm2_micros / warm1_micros
535
- };
536
-
537
- // After initialization, subsequent calls should have similar performance (within 5x)
538
- assert!(
539
- warm_ratio < 5,
540
- "Warm calls have inconsistent performance ({}x difference) - warm1: {:?}, warm2: {:?}",
541
- warm_ratio,
542
- warm1,
543
- warm2
544
- );
545
-
546
- // Log the cold/warm ratio for informational purposes
547
- let cold_warm_ratio = cold.as_micros() / warm1_micros;
548
- eprintln!(
549
- "Cold/Warm ratio: {}x (expected due to singleton initialization)",
550
- cold_warm_ratio
551
- );
552
- }
553
- }
@@ -10,9 +10,6 @@ use async_trait::async_trait;
10
10
  use std::path::Path;
11
11
  use std::sync::Arc;
12
12
 
13
- #[cfg(not(feature = "tokio-runtime"))]
14
- use crate::KreuzbergError;
15
-
16
13
  /// Trait for document extractor plugins.
17
14
  ///
18
15
  /// Implement this trait to add support for new document formats or to override
@@ -64,7 +61,6 @@ use crate::KreuzbergError;
64
61
  /// detected_languages: None,
65
62
  /// chunks: None,
66
63
  /// images: None,
67
- /// pages: None,
68
64
  /// })
69
65
  /// }
70
66
  ///
@@ -143,7 +139,6 @@ pub trait DocumentExtractor: Plugin {
143
139
  /// detected_languages: None,
144
140
  /// chunks: None,
145
141
  /// images: None,
146
- /// pages: None,
147
142
  /// })
148
143
  /// }
149
144
  /// # }
@@ -214,25 +209,14 @@ pub trait DocumentExtractor: Plugin {
214
209
  /// detected_languages: None,
215
210
  /// chunks: None,
216
211
  /// images: None,
217
- /// pages: None,
218
212
  /// })
219
213
  /// }
220
214
  /// # }
221
215
  /// ```
222
216
  async fn extract_file(&self, path: &Path, mime_type: &str, config: &ExtractionConfig) -> Result<ExtractionResult> {
223
- #[cfg(feature = "tokio-runtime")]
224
- {
225
- use crate::core::io;
226
- let bytes = io::read_file_async(path).await?;
227
- self.extract_bytes(&bytes, mime_type, config).await
228
- }
229
- #[cfg(not(feature = "tokio-runtime"))]
230
- {
231
- let _ = (path, mime_type, config);
232
- Err(KreuzbergError::Other(
233
- "File-based extraction requires the tokio-runtime feature".to_string(),
234
- ))
235
- }
217
+ use crate::core::io;
218
+ let bytes = io::read_file_async(path).await?;
219
+ self.extract_bytes(&bytes, mime_type, config).await
236
220
  }
237
221
 
238
222
  /// Get the list of MIME types supported by this extractor.
@@ -375,16 +359,10 @@ pub trait DocumentExtractor: Plugin {
375
359
  fn can_handle(&self, _path: &Path, _mime_type: &str) -> bool {
376
360
  true
377
361
  }
378
-
379
- /// Attempt to get a reference to this extractor as a SyncExtractor.
380
- ///
381
- /// Returns None if the extractor doesn't support synchronous extraction.
382
- /// This is used for WASM and other sync-only environments.
383
- fn as_sync_extractor(&self) -> Option<&dyn crate::extractors::SyncExtractor> {
384
- None
385
- }
386
362
  }
387
363
 
364
+ // Public registration APIs
365
+
388
366
  /// Register a document extractor with the global registry.
389
367
  ///
390
368
  /// The extractor will be registered for all MIME types it supports and will be
@@ -436,7 +414,6 @@ pub trait DocumentExtractor: Plugin {
436
414
  /// detected_languages: None,
437
415
  /// chunks: None,
438
416
  /// images: None,
439
- /// pages: None,
440
417
  /// })
441
418
  /// }
442
419
  ///
@@ -561,7 +538,6 @@ pub fn clear_extractors() -> crate::Result<()> {
561
538
  #[cfg(test)]
562
539
  mod tests {
563
540
  use super::*;
564
- use serial_test::serial;
565
541
 
566
542
  struct MockExtractor {
567
543
  mime_types: Vec<&'static str>,
@@ -602,7 +578,6 @@ mod tests {
602
578
  detected_languages: None,
603
579
  chunks: None,
604
580
  images: None,
605
- pages: None,
606
581
  })
607
582
  }
608
583
 
@@ -775,7 +750,6 @@ mod tests {
775
750
  detected_languages: None,
776
751
  chunks: None,
777
752
  images: None,
778
- pages: None,
779
753
  })
780
754
  }
781
755
 
@@ -856,8 +830,9 @@ mod tests {
856
830
  assert_eq!(result.mime_type, "application/json");
857
831
  }
858
832
 
833
+ // Tests for public registration APIs
834
+
859
835
  #[test]
860
- #[serial]
861
836
  fn test_register_extractor() {
862
837
  use std::sync::Arc;
863
838
 
@@ -872,7 +847,6 @@ mod tests {
872
847
  }
873
848
 
874
849
  #[test]
875
- #[serial]
876
850
  fn test_unregister_extractor() {
877
851
  use std::sync::Arc;
878
852
 
@@ -887,14 +861,12 @@ mod tests {
887
861
  }
888
862
 
889
863
  #[test]
890
- #[serial]
891
864
  fn test_unregister_nonexistent_extractor() {
892
865
  let result = super::unregister_extractor("nonexistent-extractor-xyz");
893
866
  assert!(result.is_ok());
894
867
  }
895
868
 
896
869
  #[test]
897
- #[serial]
898
870
  fn test_list_extractors() {
899
871
  use std::sync::Arc;
900
872
 
@@ -916,6 +888,7 @@ mod tests {
916
888
  super::register_extractor(extractor2).unwrap();
917
889
 
918
890
  let list = super::list_extractors().unwrap();
891
+ // Both extractors have the same name, so only one will be registered
919
892
  assert_eq!(list.len(), 1);
920
893
  assert!(list.contains(&"mock-extractor".to_string()));
921
894
 
@@ -923,7 +896,6 @@ mod tests {
923
896
  }
924
897
 
925
898
  #[test]
926
- #[serial]
927
899
  fn test_clear_extractors() {
928
900
  use std::sync::Arc;
929
901
 
@@ -949,7 +921,6 @@ mod tests {
949
921
  }
950
922
 
951
923
  #[test]
952
- #[serial]
953
924
  fn test_register_extractor_with_invalid_name() {
954
925
  use std::sync::Arc;
955
926
 
@@ -980,7 +951,6 @@ mod tests {
980
951
  detected_languages: None,
981
952
  chunks: None,
982
953
  images: None,
983
- pages: None,
984
954
  })
985
955
  }
986
956
 
@@ -995,7 +965,6 @@ mod tests {
995
965
  }
996
966
 
997
967
  #[test]
998
- #[serial]
999
968
  fn test_register_extractor_with_empty_name() {
1000
969
  use std::sync::Arc;
1001
970
 
@@ -1026,7 +995,6 @@ mod tests {
1026
995
  detected_languages: None,
1027
996
  chunks: None,
1028
997
  images: None,
1029
- pages: None,
1030
998
  })
1031
999
  }
1032
1000