kreuzberg 4.0.0.pre.rc.29 → 4.0.0.rc1

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (321)
  1. checksums.yaml +4 -4
  2. data/.gitignore +0 -6
  3. data/.rubocop.yaml +534 -1
  4. data/Gemfile +2 -1
  5. data/Gemfile.lock +28 -116
  6. data/README.md +269 -629
  7. data/Rakefile +0 -9
  8. data/Steepfile +4 -8
  9. data/examples/async_patterns.rb +58 -1
  10. data/ext/kreuzberg_rb/extconf.rb +5 -35
  11. data/ext/kreuzberg_rb/native/Cargo.toml +16 -55
  12. data/ext/kreuzberg_rb/native/build.rs +14 -12
  13. data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
  14. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
  15. data/ext/kreuzberg_rb/native/include/strings.h +2 -2
  16. data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
  17. data/ext/kreuzberg_rb/native/src/lib.rs +34 -897
  18. data/extconf.rb +6 -38
  19. data/kreuzberg.gemspec +20 -114
  20. data/lib/kreuzberg/api_proxy.rb +18 -2
  21. data/lib/kreuzberg/cache_api.rb +0 -22
  22. data/lib/kreuzberg/cli.rb +10 -2
  23. data/lib/kreuzberg/cli_proxy.rb +10 -0
  24. data/lib/kreuzberg/config.rb +22 -274
  25. data/lib/kreuzberg/errors.rb +7 -73
  26. data/lib/kreuzberg/extraction_api.rb +8 -237
  27. data/lib/kreuzberg/mcp_proxy.rb +11 -2
  28. data/lib/kreuzberg/ocr_backend_protocol.rb +73 -0
  29. data/lib/kreuzberg/post_processor_protocol.rb +71 -0
  30. data/lib/kreuzberg/result.rb +33 -151
  31. data/lib/kreuzberg/setup_lib_path.rb +2 -22
  32. data/lib/kreuzberg/validator_protocol.rb +73 -0
  33. data/lib/kreuzberg/version.rb +1 -1
  34. data/lib/kreuzberg.rb +13 -27
  35. data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
  36. data/sig/kreuzberg.rbs +12 -105
  37. data/spec/binding/cache_spec.rb +22 -22
  38. data/spec/binding/cli_proxy_spec.rb +4 -2
  39. data/spec/binding/cli_spec.rb +11 -12
  40. data/spec/binding/config_spec.rb +0 -74
  41. data/spec/binding/config_validation_spec.rb +6 -100
  42. data/spec/binding/error_handling_spec.rb +97 -283
  43. data/spec/binding/plugins/ocr_backend_spec.rb +8 -8
  44. data/spec/binding/plugins/postprocessor_spec.rb +11 -11
  45. data/spec/binding/plugins/validator_spec.rb +13 -12
  46. data/spec/examples.txt +104 -0
  47. data/spec/fixtures/config.toml +1 -0
  48. data/spec/fixtures/config.yaml +1 -0
  49. data/spec/fixtures/invalid_config.toml +1 -0
  50. data/spec/smoke/package_spec.rb +3 -2
  51. data/spec/spec_helper.rb +3 -1
  52. data/vendor/kreuzberg/Cargo.toml +67 -192
  53. data/vendor/kreuzberg/README.md +9 -97
  54. data/vendor/kreuzberg/build.rs +194 -516
  55. data/vendor/kreuzberg/src/api/handlers.rs +9 -130
  56. data/vendor/kreuzberg/src/api/mod.rs +3 -18
  57. data/vendor/kreuzberg/src/api/server.rs +71 -236
  58. data/vendor/kreuzberg/src/api/types.rs +7 -43
  59. data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
  60. data/vendor/kreuzberg/src/cache/mod.rs +3 -27
  61. data/vendor/kreuzberg/src/chunking/mod.rs +79 -1705
  62. data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
  63. data/vendor/kreuzberg/src/core/config.rs +23 -905
  64. data/vendor/kreuzberg/src/core/extractor.rs +106 -403
  65. data/vendor/kreuzberg/src/core/io.rs +2 -4
  66. data/vendor/kreuzberg/src/core/mime.rs +12 -2
  67. data/vendor/kreuzberg/src/core/mod.rs +3 -22
  68. data/vendor/kreuzberg/src/core/pipeline.rs +78 -395
  69. data/vendor/kreuzberg/src/embeddings.rs +21 -169
  70. data/vendor/kreuzberg/src/error.rs +2 -2
  71. data/vendor/kreuzberg/src/extraction/archive.rs +31 -36
  72. data/vendor/kreuzberg/src/extraction/docx.rs +1 -365
  73. data/vendor/kreuzberg/src/extraction/email.rs +11 -12
  74. data/vendor/kreuzberg/src/extraction/excel.rs +129 -138
  75. data/vendor/kreuzberg/src/extraction/html.rs +170 -1447
  76. data/vendor/kreuzberg/src/extraction/image.rs +14 -138
  77. data/vendor/kreuzberg/src/extraction/libreoffice.rs +3 -13
  78. data/vendor/kreuzberg/src/extraction/mod.rs +5 -21
  79. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
  80. data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
  81. data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
  82. data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
  83. data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
  84. data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
  85. data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
  86. data/vendor/kreuzberg/src/extraction/pptx.rs +94 -196
  87. data/vendor/kreuzberg/src/extraction/structured.rs +4 -5
  88. data/vendor/kreuzberg/src/extraction/table.rs +1 -2
  89. data/vendor/kreuzberg/src/extraction/text.rs +10 -18
  90. data/vendor/kreuzberg/src/extractors/archive.rs +0 -22
  91. data/vendor/kreuzberg/src/extractors/docx.rs +148 -69
  92. data/vendor/kreuzberg/src/extractors/email.rs +9 -37
  93. data/vendor/kreuzberg/src/extractors/excel.rs +40 -81
  94. data/vendor/kreuzberg/src/extractors/html.rs +173 -182
  95. data/vendor/kreuzberg/src/extractors/image.rs +8 -32
  96. data/vendor/kreuzberg/src/extractors/mod.rs +10 -171
  97. data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
  98. data/vendor/kreuzberg/src/extractors/pdf.rs +64 -329
  99. data/vendor/kreuzberg/src/extractors/pptx.rs +34 -79
  100. data/vendor/kreuzberg/src/extractors/structured.rs +0 -16
  101. data/vendor/kreuzberg/src/extractors/text.rs +7 -30
  102. data/vendor/kreuzberg/src/extractors/xml.rs +8 -27
  103. data/vendor/kreuzberg/src/keywords/processor.rs +1 -9
  104. data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
  105. data/vendor/kreuzberg/src/language_detection/mod.rs +51 -94
  106. data/vendor/kreuzberg/src/lib.rs +5 -17
  107. data/vendor/kreuzberg/src/mcp/mod.rs +1 -4
  108. data/vendor/kreuzberg/src/mcp/server.rs +21 -145
  109. data/vendor/kreuzberg/src/ocr/mod.rs +0 -2
  110. data/vendor/kreuzberg/src/ocr/processor.rs +8 -19
  111. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +0 -2
  112. data/vendor/kreuzberg/src/pdf/error.rs +1 -93
  113. data/vendor/kreuzberg/src/pdf/metadata.rs +100 -263
  114. data/vendor/kreuzberg/src/pdf/mod.rs +2 -33
  115. data/vendor/kreuzberg/src/pdf/rendering.rs +12 -12
  116. data/vendor/kreuzberg/src/pdf/table.rs +64 -61
  117. data/vendor/kreuzberg/src/pdf/text.rs +24 -416
  118. data/vendor/kreuzberg/src/plugins/extractor.rs +8 -40
  119. data/vendor/kreuzberg/src/plugins/mod.rs +0 -3
  120. data/vendor/kreuzberg/src/plugins/ocr.rs +14 -22
  121. data/vendor/kreuzberg/src/plugins/processor.rs +1 -10
  122. data/vendor/kreuzberg/src/plugins/registry.rs +0 -15
  123. data/vendor/kreuzberg/src/plugins/validator.rs +8 -20
  124. data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
  125. data/vendor/kreuzberg/src/text/mod.rs +0 -8
  126. data/vendor/kreuzberg/src/text/quality.rs +15 -28
  127. data/vendor/kreuzberg/src/text/string_utils.rs +10 -22
  128. data/vendor/kreuzberg/src/text/token_reduction/core.rs +50 -86
  129. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +16 -37
  130. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +1 -2
  131. data/vendor/kreuzberg/src/types.rs +67 -907
  132. data/vendor/kreuzberg/src/utils/mod.rs +0 -14
  133. data/vendor/kreuzberg/src/utils/quality.rs +3 -12
  134. data/vendor/kreuzberg/tests/api_tests.rs +0 -506
  135. data/vendor/kreuzberg/tests/archive_integration.rs +0 -2
  136. data/vendor/kreuzberg/tests/batch_orchestration.rs +12 -57
  137. data/vendor/kreuzberg/tests/batch_processing.rs +8 -32
  138. data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
  139. data/vendor/kreuzberg/tests/concurrency_stress.rs +8 -40
  140. data/vendor/kreuzberg/tests/config_features.rs +1 -33
  141. data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -16
  142. data/vendor/kreuzberg/tests/core_integration.rs +9 -35
  143. data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
  144. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
  145. data/vendor/kreuzberg/tests/email_integration.rs +1 -3
  146. data/vendor/kreuzberg/tests/error_handling.rs +34 -43
  147. data/vendor/kreuzberg/tests/format_integration.rs +1 -7
  148. data/vendor/kreuzberg/tests/helpers/mod.rs +0 -60
  149. data/vendor/kreuzberg/tests/image_integration.rs +0 -2
  150. data/vendor/kreuzberg/tests/mime_detection.rs +16 -17
  151. data/vendor/kreuzberg/tests/ocr_configuration.rs +0 -4
  152. data/vendor/kreuzberg/tests/ocr_errors.rs +0 -22
  153. data/vendor/kreuzberg/tests/ocr_quality.rs +0 -2
  154. data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
  155. data/vendor/kreuzberg/tests/pdf_integration.rs +0 -2
  156. data/vendor/kreuzberg/tests/pipeline_integration.rs +2 -36
  157. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +0 -5
  158. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -17
  159. data/vendor/kreuzberg/tests/plugin_system.rs +0 -6
  160. data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -2
  161. data/vendor/kreuzberg/tests/security_validation.rs +1 -13
  162. data/vendor/kreuzberg/tests/test_fastembed.rs +23 -45
  163. metadata +25 -171
  164. data/.rubocop.yml +0 -543
  165. data/ext/kreuzberg_rb/native/.cargo/config.toml +0 -23
  166. data/ext/kreuzberg_rb/native/Cargo.lock +0 -7619
  167. data/lib/kreuzberg/error_context.rb +0 -136
  168. data/lib/kreuzberg/types.rb +0 -170
  169. data/lib/libpdfium.so +0 -0
  170. data/spec/binding/async_operations_spec.rb +0 -473
  171. data/spec/binding/batch_operations_spec.rb +0 -595
  172. data/spec/binding/batch_spec.rb +0 -359
  173. data/spec/binding/config_result_spec.rb +0 -377
  174. data/spec/binding/embeddings_spec.rb +0 -816
  175. data/spec/binding/error_recovery_spec.rb +0 -488
  176. data/spec/binding/font_config_spec.rb +0 -220
  177. data/spec/binding/images_spec.rb +0 -738
  178. data/spec/binding/keywords_extraction_spec.rb +0 -600
  179. data/spec/binding/metadata_types_spec.rb +0 -1228
  180. data/spec/binding/pages_extraction_spec.rb +0 -471
  181. data/spec/binding/tables_spec.rb +0 -641
  182. data/spec/unit/config/chunking_config_spec.rb +0 -213
  183. data/spec/unit/config/embedding_config_spec.rb +0 -343
  184. data/spec/unit/config/extraction_config_spec.rb +0 -438
  185. data/spec/unit/config/font_config_spec.rb +0 -285
  186. data/spec/unit/config/hierarchy_config_spec.rb +0 -314
  187. data/spec/unit/config/image_extraction_config_spec.rb +0 -209
  188. data/spec/unit/config/image_preprocessing_config_spec.rb +0 -249
  189. data/spec/unit/config/keyword_config_spec.rb +0 -229
  190. data/spec/unit/config/language_detection_config_spec.rb +0 -258
  191. data/spec/unit/config/ocr_config_spec.rb +0 -171
  192. data/spec/unit/config/page_config_spec.rb +0 -221
  193. data/spec/unit/config/pdf_config_spec.rb +0 -267
  194. data/spec/unit/config/postprocessor_config_spec.rb +0 -290
  195. data/spec/unit/config/tesseract_config_spec.rb +0 -181
  196. data/spec/unit/config/token_reduction_config_spec.rb +0 -251
  197. data/test/metadata_types_test.rb +0 -959
  198. data/vendor/Cargo.toml +0 -61
  199. data/vendor/kreuzberg/examples/bench_fixes.rs +0 -71
  200. data/vendor/kreuzberg/examples/test_pdfium_fork.rs +0 -62
  201. data/vendor/kreuzberg/src/chunking/processor.rs +0 -219
  202. data/vendor/kreuzberg/src/core/batch_optimizations.rs +0 -385
  203. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  204. data/vendor/kreuzberg/src/core/formats.rs +0 -235
  205. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  206. data/vendor/kreuzberg/src/extraction/capacity.rs +0 -263
  207. data/vendor/kreuzberg/src/extraction/markdown.rs +0 -216
  208. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -284
  209. data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -470
  210. data/vendor/kreuzberg/src/extractors/docbook.rs +0 -504
  211. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  212. data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -492
  213. data/vendor/kreuzberg/src/extractors/jats.rs +0 -1054
  214. data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -368
  215. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  216. data/vendor/kreuzberg/src/extractors/markdown.rs +0 -701
  217. data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
  218. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  219. data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -529
  220. data/vendor/kreuzberg/src/extractors/rst.rs +0 -577
  221. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  222. data/vendor/kreuzberg/src/extractors/security.rs +0 -484
  223. data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
  224. data/vendor/kreuzberg/src/extractors/typst.rs +0 -651
  225. data/vendor/kreuzberg/src/language_detection/processor.rs +0 -218
  226. data/vendor/kreuzberg/src/ocr/language_registry.rs +0 -520
  227. data/vendor/kreuzberg/src/panic_context.rs +0 -154
  228. data/vendor/kreuzberg/src/pdf/bindings.rs +0 -306
  229. data/vendor/kreuzberg/src/pdf/bundled.rs +0 -408
  230. data/vendor/kreuzberg/src/pdf/fonts.rs +0 -358
  231. data/vendor/kreuzberg/src/pdf/hierarchy.rs +0 -903
  232. data/vendor/kreuzberg/src/text/quality_processor.rs +0 -231
  233. data/vendor/kreuzberg/src/text/utf8_validation.rs +0 -193
  234. data/vendor/kreuzberg/src/utils/pool.rs +0 -503
  235. data/vendor/kreuzberg/src/utils/pool_sizing.rs +0 -364
  236. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -761
  237. data/vendor/kreuzberg/tests/api_embed.rs +0 -360
  238. data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
  239. data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +0 -471
  240. data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +0 -289
  241. data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +0 -154
  242. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
  243. data/vendor/kreuzberg/tests/config_integration_test.rs +0 -753
  244. data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +0 -294
  245. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -500
  246. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
  247. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
  248. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
  249. data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
  250. data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
  251. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
  252. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
  253. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
  254. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
  255. data/vendor/kreuzberg/tests/ocr_language_registry.rs +0 -191
  256. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -674
  257. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
  258. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
  259. data/vendor/kreuzberg/tests/page_markers.rs +0 -297
  260. data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +0 -301
  261. data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +0 -589
  262. data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +0 -301
  263. data/vendor/kreuzberg/tests/pdf_text_merging.rs +0 -475
  264. data/vendor/kreuzberg/tests/pdfium_linking.rs +0 -340
  265. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -694
  266. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -775
  267. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1260
  268. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -648
  269. data/vendor/kreuzberg-ffi/Cargo.toml +0 -67
  270. data/vendor/kreuzberg-ffi/README.md +0 -851
  271. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +0 -227
  272. data/vendor/kreuzberg-ffi/build.rs +0 -168
  273. data/vendor/kreuzberg-ffi/cbindgen.toml +0 -37
  274. data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +0 -12
  275. data/vendor/kreuzberg-ffi/kreuzberg.h +0 -3012
  276. data/vendor/kreuzberg-ffi/src/batch_streaming.rs +0 -588
  277. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
  278. data/vendor/kreuzberg-ffi/src/error.rs +0 -901
  279. data/vendor/kreuzberg-ffi/src/extraction.rs +0 -555
  280. data/vendor/kreuzberg-ffi/src/helpers.rs +0 -879
  281. data/vendor/kreuzberg-ffi/src/lib.rs +0 -977
  282. data/vendor/kreuzberg-ffi/src/memory.rs +0 -493
  283. data/vendor/kreuzberg-ffi/src/mime.rs +0 -329
  284. data/vendor/kreuzberg-ffi/src/panic_shield.rs +0 -265
  285. data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +0 -442
  286. data/vendor/kreuzberg-ffi/src/plugins/mod.rs +0 -14
  287. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +0 -628
  288. data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +0 -438
  289. data/vendor/kreuzberg-ffi/src/plugins/validator.rs +0 -329
  290. data/vendor/kreuzberg-ffi/src/result.rs +0 -510
  291. data/vendor/kreuzberg-ffi/src/result_pool.rs +0 -639
  292. data/vendor/kreuzberg-ffi/src/result_view.rs +0 -773
  293. data/vendor/kreuzberg-ffi/src/string_intern.rs +0 -568
  294. data/vendor/kreuzberg-ffi/src/types.rs +0 -363
  295. data/vendor/kreuzberg-ffi/src/util.rs +0 -210
  296. data/vendor/kreuzberg-ffi/src/validation.rs +0 -848
  297. data/vendor/kreuzberg-ffi/tests.disabled/README.md +0 -48
  298. data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +0 -299
  299. data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +0 -346
  300. data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +0 -232
  301. data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +0 -470
  302. data/vendor/kreuzberg-tesseract/.commitlintrc.json +0 -13
  303. data/vendor/kreuzberg-tesseract/.crate-ignore +0 -2
  304. data/vendor/kreuzberg-tesseract/Cargo.lock +0 -2933
  305. data/vendor/kreuzberg-tesseract/Cargo.toml +0 -57
  306. data/vendor/kreuzberg-tesseract/LICENSE +0 -22
  307. data/vendor/kreuzberg-tesseract/README.md +0 -399
  308. data/vendor/kreuzberg-tesseract/build.rs +0 -1127
  309. data/vendor/kreuzberg-tesseract/patches/README.md +0 -71
  310. data/vendor/kreuzberg-tesseract/patches/tesseract.diff +0 -199
  311. data/vendor/kreuzberg-tesseract/src/api.rs +0 -1371
  312. data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +0 -77
  313. data/vendor/kreuzberg-tesseract/src/enums.rs +0 -297
  314. data/vendor/kreuzberg-tesseract/src/error.rs +0 -81
  315. data/vendor/kreuzberg-tesseract/src/lib.rs +0 -145
  316. data/vendor/kreuzberg-tesseract/src/monitor.rs +0 -57
  317. data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +0 -197
  318. data/vendor/kreuzberg-tesseract/src/page_iterator.rs +0 -253
  319. data/vendor/kreuzberg-tesseract/src/result_iterator.rs +0 -286
  320. data/vendor/kreuzberg-tesseract/src/result_renderer.rs +0 -183
  321. data/vendor/kreuzberg-tesseract/tests/integration_test.rs +0 -211
@@ -1,5 +1,3 @@
1
- #![cfg(all(feature = "tokio-runtime", feature = "office"))]
2
-
3
1
  //! PowerPoint presentation extractor.
4
2
 
5
3
  use crate::Result;
@@ -45,10 +43,8 @@ impl PptxExtractor {
45
43
  for image in &mut images {
46
44
  let image_data = image.data.clone();
47
45
  let tess_config_clone = tess_config.clone();
48
- let span = tracing::Span::current();
49
46
 
50
47
  let ocr_result = tokio::task::spawn_blocking(move || {
51
- let _guard = span.entered();
52
48
  let cache_dir = std::env::var("KREUZBERG_CACHE_DIR").ok().map(std::path::PathBuf::from);
53
49
 
54
50
  let proc = OcrProcessor::new(cache_dir)?;
@@ -71,7 +67,6 @@ impl PptxExtractor {
71
67
  detected_languages: None,
72
68
  chunks: None,
73
69
  images: None,
74
- pages: None,
75
70
  };
76
71
  image.ocr_result = Some(Box::new(extraction_result));
77
72
  }
@@ -105,13 +100,6 @@ impl Plugin for PptxExtractor {
105
100
 
106
101
  #[async_trait]
107
102
  impl DocumentExtractor for PptxExtractor {
108
- #[cfg_attr(feature = "otel", tracing::instrument(
109
- skip(self, content, config),
110
- fields(
111
- extractor.name = self.name(),
112
- content.size_bytes = content.len(),
113
- )
114
- ))]
115
103
  async fn extract_bytes(
116
104
  &self,
117
105
  content: &[u8],
@@ -120,18 +108,18 @@ impl DocumentExtractor for PptxExtractor {
120
108
  ) -> Result<ExtractionResult> {
121
109
  let extract_images = config.images.as_ref().is_some_and(|img| img.extract_images);
122
110
 
123
- let pages_config = config.pages.clone();
111
+ // Extract PPTX content
124
112
  let pptx_result = if crate::core::batch_mode::is_batch_mode() {
113
+ // Batch mode: Use spawn_blocking for parallelism
125
114
  let content_owned = content.to_vec();
126
- let span = tracing::Span::current();
127
115
  tokio::task::spawn_blocking(move || {
128
- let _guard = span.entered();
129
- crate::extraction::pptx::extract_pptx_from_bytes(&content_owned, extract_images, pages_config.as_ref())
116
+ crate::extraction::pptx::extract_pptx_from_bytes(&content_owned, extract_images)
130
117
  })
131
118
  .await
132
119
  .map_err(|e| crate::error::KreuzbergError::parsing(format!("PPTX extraction task failed: {}", e)))??
133
120
  } else {
134
- crate::extraction::pptx::extract_pptx_from_bytes(content, extract_images, config.pages.as_ref())?
121
+ // Single-file mode: Direct extraction (no spawn overhead)
122
+ crate::extraction::pptx::extract_pptx_from_bytes(content, extract_images)?
135
123
  };
136
124
 
137
125
  let mut additional = std::collections::HashMap::new();
@@ -139,41 +127,28 @@ impl DocumentExtractor for PptxExtractor {
139
127
  additional.insert("image_count".to_string(), serde_json::json!(pptx_result.image_count));
140
128
  additional.insert("table_count".to_string(), serde_json::json!(pptx_result.table_count));
141
129
 
142
- let images = if extract_images {
143
- // Image extraction is enabled, return images or empty vector
144
- if !pptx_result.images.is_empty() {
145
- #[cfg(feature = "ocr")]
146
- {
147
- let processed_images = self.process_images_with_ocr(pptx_result.images, config).await?;
148
- Some(processed_images)
149
- }
150
- #[cfg(not(feature = "ocr"))]
151
- {
152
- Some(pptx_result.images)
153
- }
154
- } else {
155
- Some(vec![])
130
+ let images = if !pptx_result.images.is_empty() {
131
+ #[cfg(feature = "ocr")]
132
+ {
133
+ let processed_images = self.process_images_with_ocr(pptx_result.images, config).await?;
134
+ Some(processed_images)
135
+ }
136
+ #[cfg(not(feature = "ocr"))]
137
+ {
138
+ Some(pptx_result.images)
156
139
  }
157
140
  } else {
158
- // Image extraction is disabled
159
141
  None
160
142
  };
161
143
 
162
- let mut metadata = Metadata {
163
- format: Some(crate::types::FormatMetadata::Pptx(pptx_result.metadata)),
164
- additional,
165
- ..Default::default()
166
- };
167
-
168
- if let Some(page_structure) = pptx_result.page_structure {
169
- metadata.pages = Some(page_structure);
170
- }
171
-
172
144
  Ok(ExtractionResult {
173
145
  content: pptx_result.content,
174
146
  mime_type: mime_type.to_string(),
175
- metadata,
176
- pages: pptx_result.page_contents,
147
+ metadata: Metadata {
148
+ format: Some(crate::types::FormatMetadata::Pptx(pptx_result.metadata)),
149
+ additional,
150
+ ..Default::default()
151
+ },
177
152
  tables: vec![],
178
153
  detected_languages: None,
179
154
  chunks: None,
@@ -181,12 +156,6 @@ impl DocumentExtractor for PptxExtractor {
181
156
  })
182
157
  }
183
158
 
184
- #[cfg_attr(feature = "otel", tracing::instrument(
185
- skip(self, path, config),
186
- fields(
187
- extractor.name = self.name(),
188
- )
189
- ))]
190
159
  async fn extract_file(&self, path: &Path, mime_type: &str, config: &ExtractionConfig) -> Result<ExtractionResult> {
191
160
  let path_str = path
192
161
  .to_str()
@@ -194,49 +163,35 @@ impl DocumentExtractor for PptxExtractor {
194
163
 
195
164
  let extract_images = config.images.as_ref().is_some_and(|img| img.extract_images);
196
165
 
197
- let pptx_result =
198
- crate::extraction::pptx::extract_pptx_from_path(path_str, extract_images, config.pages.as_ref())?;
166
+ let pptx_result = crate::extraction::pptx::extract_pptx_from_path(path_str, extract_images)?;
199
167
 
200
168
  let mut additional = std::collections::HashMap::new();
201
169
  additional.insert("slide_count".to_string(), serde_json::json!(pptx_result.slide_count));
202
170
  additional.insert("image_count".to_string(), serde_json::json!(pptx_result.image_count));
203
171
  additional.insert("table_count".to_string(), serde_json::json!(pptx_result.table_count));
204
172
 
205
- let images = if extract_images {
206
- // Image extraction is enabled, return images or empty vector
207
- if !pptx_result.images.is_empty() {
208
- #[cfg(feature = "ocr")]
209
- {
210
- let processed_images = self.process_images_with_ocr(pptx_result.images, config).await?;
211
- Some(processed_images)
212
- }
213
- #[cfg(not(feature = "ocr"))]
214
- {
215
- Some(pptx_result.images)
216
- }
217
- } else {
218
- Some(vec![])
173
+ let images = if !pptx_result.images.is_empty() {
174
+ #[cfg(feature = "ocr")]
175
+ {
176
+ let processed_images = self.process_images_with_ocr(pptx_result.images, config).await?;
177
+ Some(processed_images)
178
+ }
179
+ #[cfg(not(feature = "ocr"))]
180
+ {
181
+ Some(pptx_result.images)
219
182
  }
220
183
  } else {
221
- // Image extraction is disabled
222
184
  None
223
185
  };
224
186
 
225
- let mut metadata = Metadata {
226
- format: Some(crate::types::FormatMetadata::Pptx(pptx_result.metadata)),
227
- additional,
228
- ..Default::default()
229
- };
230
-
231
- if let Some(page_structure) = pptx_result.page_structure {
232
- metadata.pages = Some(page_structure);
233
- }
234
-
235
187
  Ok(ExtractionResult {
236
188
  content: pptx_result.content,
237
189
  mime_type: mime_type.to_string(),
238
- metadata,
239
- pages: pptx_result.page_contents,
190
+ metadata: Metadata {
191
+ format: Some(crate::types::FormatMetadata::Pptx(pptx_result.metadata)),
192
+ additional,
193
+ ..Default::default()
194
+ },
240
195
  tables: vec![],
241
196
  detected_languages: None,
242
197
  chunks: None,
@@ -5,7 +5,6 @@ use crate::core::config::ExtractionConfig;
5
5
  use crate::plugins::{DocumentExtractor, Plugin};
6
6
  use crate::types::{ExtractionResult, Metadata};
7
7
  use async_trait::async_trait;
8
- #[cfg(feature = "tokio-runtime")]
9
8
  use std::path::Path;
10
9
 
11
10
  /// Structured data extractor supporting JSON, YAML, and TOML.
@@ -43,13 +42,6 @@ impl Plugin for StructuredExtractor {
43
42
 
44
43
  #[async_trait]
45
44
  impl DocumentExtractor for StructuredExtractor {
46
- #[cfg_attr(feature = "otel", tracing::instrument(
47
- skip(self, content, _config),
48
- fields(
49
- extractor.name = self.name(),
50
- content.size_bytes = content.len(),
51
- )
52
- ))]
53
45
  async fn extract_bytes(
54
46
  &self,
55
47
  content: &[u8],
@@ -81,7 +73,6 @@ impl DocumentExtractor for StructuredExtractor {
81
73
  additional,
82
74
  ..Default::default()
83
75
  },
84
- pages: None,
85
76
  tables: vec![],
86
77
  detected_languages: None,
87
78
  chunks: None,
@@ -89,13 +80,6 @@ impl DocumentExtractor for StructuredExtractor {
89
80
  })
90
81
  }
91
82
 
92
- #[cfg(feature = "tokio-runtime")]
93
- #[cfg_attr(feature = "otel", tracing::instrument(
94
- skip(self, path, config),
95
- fields(
96
- extractor.name = self.name(),
97
- )
98
- ))]
99
83
  async fn extract_file(&self, path: &Path, mime_type: &str, config: &ExtractionConfig) -> Result<ExtractionResult> {
100
84
  let bytes = tokio::fs::read(path).await?;
101
85
  self.extract_bytes(&bytes, mime_type, config).await
@@ -53,40 +53,28 @@ impl Plugin for PlainTextExtractor {
53
53
 
54
54
  #[async_trait]
55
55
  impl DocumentExtractor for PlainTextExtractor {
56
- #[cfg_attr(feature = "otel", tracing::instrument(
57
- skip(self, content, _config),
58
- fields(
59
- extractor.name = self.name(),
60
- content.size_bytes = content.len(),
61
- )
62
- ))]
63
56
  async fn extract_bytes(
64
57
  &self,
65
58
  content: &[u8],
66
59
  mime_type: &str,
67
60
  _config: &ExtractionConfig,
68
61
  ) -> Result<ExtractionResult> {
69
- let text = String::from_utf8_lossy(content).into_owned();
70
- let text = text.trim_end_matches('\n').trim_end_matches('\r').to_string();
71
- let line_count = text.lines().count();
72
- let word_count = text.split_whitespace().count();
73
- let character_count = text.len();
62
+ let text_result = parse_text(content, false)?;
74
63
 
75
64
  Ok(ExtractionResult {
76
- content: text,
65
+ content: text_result.content,
77
66
  mime_type: mime_type.to_string(),
78
67
  metadata: crate::types::Metadata {
79
68
  format: Some(crate::types::FormatMetadata::Text(crate::types::TextMetadata {
80
- line_count,
81
- word_count,
82
- character_count,
69
+ line_count: text_result.line_count,
70
+ word_count: text_result.word_count,
71
+ character_count: text_result.character_count,
83
72
  headers: None,
84
73
  links: None,
85
74
  code_blocks: None,
86
75
  })),
87
76
  ..Default::default()
88
77
  },
89
- pages: None,
90
78
  tables: vec![],
91
79
  detected_languages: None,
92
80
  chunks: None,
@@ -95,7 +83,7 @@ impl DocumentExtractor for PlainTextExtractor {
95
83
  }
96
84
 
97
85
  fn supported_mime_types(&self) -> &[&str] {
98
- &["text/plain", "text/csv", "text/tab-separated-values"]
86
+ &["text/plain"]
99
87
  }
100
88
 
101
89
  fn priority(&self) -> i32 {
@@ -150,13 +138,6 @@ impl Plugin for MarkdownExtractor {
150
138
 
151
139
  #[async_trait]
152
140
  impl DocumentExtractor for MarkdownExtractor {
153
- #[cfg_attr(feature = "otel", tracing::instrument(
154
- skip(self, content, _config),
155
- fields(
156
- extractor.name = self.name(),
157
- content.size_bytes = content.len(),
158
- )
159
- ))]
160
141
  async fn extract_bytes(
161
142
  &self,
162
143
  content: &[u8],
@@ -179,7 +160,6 @@ impl DocumentExtractor for MarkdownExtractor {
179
160
  })),
180
161
  ..Default::default()
181
162
  },
182
- pages: None,
183
163
  tables: vec![],
184
164
  detected_languages: None,
185
165
  chunks: None,
@@ -247,10 +227,7 @@ mod tests {
247
227
  let extractor = PlainTextExtractor::new();
248
228
  assert_eq!(extractor.name(), "plain-text-extractor");
249
229
  assert_eq!(extractor.version(), env!("CARGO_PKG_VERSION"));
250
- assert_eq!(
251
- extractor.supported_mime_types(),
252
- &["text/plain", "text/csv", "text/tab-separated-values"]
253
- );
230
+ assert_eq!(extractor.supported_mime_types(), &["text/plain"]);
254
231
  assert_eq!(extractor.priority(), 50);
255
232
  }
256
233
 
@@ -3,7 +3,6 @@
3
3
  use crate::Result;
4
4
  use crate::core::config::ExtractionConfig;
5
5
  use crate::extraction::xml::parse_xml;
6
- use crate::extractors::SyncExtractor;
7
6
  use crate::plugins::{DocumentExtractor, Plugin};
8
7
  use crate::types::ExtractionResult;
9
8
  use async_trait::async_trait;
@@ -52,8 +51,14 @@ impl Plugin for XmlExtractor {
52
51
  }
53
52
  }
54
53
 
55
- impl SyncExtractor for XmlExtractor {
56
- fn extract_sync(&self, content: &[u8], mime_type: &str, _config: &ExtractionConfig) -> Result<ExtractionResult> {
54
+ #[async_trait]
55
+ impl DocumentExtractor for XmlExtractor {
56
+ async fn extract_bytes(
57
+ &self,
58
+ content: &[u8],
59
+ mime_type: &str,
60
+ _config: &ExtractionConfig,
61
+ ) -> Result<ExtractionResult> {
57
62
  let xml_result = parse_xml(content, false)?;
58
63
 
59
64
  Ok(ExtractionResult {
@@ -70,28 +75,8 @@ impl SyncExtractor for XmlExtractor {
70
75
  detected_languages: None,
71
76
  chunks: None,
72
77
  images: None,
73
- pages: None,
74
78
  })
75
79
  }
76
- }
77
-
78
- #[async_trait]
79
- impl DocumentExtractor for XmlExtractor {
80
- #[cfg_attr(feature = "otel", tracing::instrument(
81
- skip(self, content, config),
82
- fields(
83
- extractor.name = self.name(),
84
- content.size_bytes = content.len(),
85
- )
86
- ))]
87
- async fn extract_bytes(
88
- &self,
89
- content: &[u8],
90
- mime_type: &str,
91
- config: &ExtractionConfig,
92
- ) -> Result<ExtractionResult> {
93
- self.extract_sync(content, mime_type, config)
94
- }
95
80
 
96
81
  fn supported_mime_types(&self) -> &[&str] {
97
82
  &["application/xml", "text/xml", "image/svg+xml"]
@@ -100,10 +85,6 @@ impl DocumentExtractor for XmlExtractor {
100
85
  fn priority(&self) -> i32 {
101
86
  50
102
87
  }
103
-
104
- fn as_sync_extractor(&self) -> Option<&dyn crate::extractors::SyncExtractor> {
105
- Some(self)
106
- }
107
88
  }
108
89
 
109
90
  #[cfg(test)]
@@ -45,8 +45,7 @@ impl Plugin for KeywordExtractor {
45
45
  }
46
46
  }
47
47
 
48
- #[cfg_attr(not(target_arch = "wasm32"), async_trait)]
49
- #[cfg_attr(target_arch = "wasm32", async_trait(?Send))]
48
+ #[async_trait]
50
49
  impl PostProcessor for KeywordExtractor {
51
50
  async fn process(&self, result: &mut ExtractionResult, config: &ExtractionConfig) -> Result<()> {
52
51
  let keyword_config = match &config.keywords {
@@ -113,7 +112,6 @@ machine learning that uses neural networks with multiple layers.
113
112
  detected_languages: None,
114
113
  chunks: None,
115
114
  images: None,
116
- pages: None,
117
115
  };
118
116
 
119
117
  processor.process(&mut result, &config).await.unwrap();
@@ -142,7 +140,6 @@ machine learning that uses neural networks with multiple layers.
142
140
  detected_languages: None,
143
141
  chunks: None,
144
142
  images: None,
145
- pages: None,
146
143
  };
147
144
 
148
145
  processor.process(&mut result, &config).await.unwrap();
@@ -167,7 +164,6 @@ machine learning that uses neural networks with multiple layers.
167
164
  detected_languages: None,
168
165
  chunks: None,
169
166
  images: None,
170
- pages: None,
171
167
  };
172
168
 
173
169
  processor.process(&mut result, &config).await.unwrap();
@@ -192,7 +188,6 @@ machine learning that uses neural networks with multiple layers.
192
188
  detected_languages: None,
193
189
  chunks: None,
194
190
  images: None,
195
- pages: None,
196
191
  };
197
192
 
198
193
  processor.process(&mut result, &config).await.unwrap();
@@ -228,7 +223,6 @@ machine learning that uses neural networks with multiple layers.
228
223
  detected_languages: None,
229
224
  chunks: None,
230
225
  images: None,
231
- pages: None,
232
226
  };
233
227
 
234
228
  let config_with_keywords = ExtractionConfig {
@@ -253,7 +247,6 @@ machine learning that uses neural networks with multiple layers.
253
247
  detected_languages: None,
254
248
  chunks: None,
255
249
  images: None,
256
- pages: None,
257
250
  };
258
251
 
259
252
  let long_result = ExtractionResult {
@@ -264,7 +257,6 @@ machine learning that uses neural networks with multiple layers.
264
257
  detected_languages: None,
265
258
  chunks: None,
266
259
  images: None,
267
- pages: None,
268
260
  };
269
261
 
270
262
  let short_duration = processor.estimated_duration_ms(&short_result);
@@ -248,6 +248,7 @@ mod tests {
248
248
  let english_text = "Natural language processing is a subfield of artificial intelligence.";
249
249
  let config = KeywordConfig::rake().with_language("fr");
250
250
  let keywords = extract_keywords_rake(english_text, &config).unwrap();
251
+ dbg!(&keywords);
251
252
  assert!(
252
253
  !keywords.is_empty(),
253
254
  "Should fall back to English stopwords and extract keywords"
@@ -4,13 +4,8 @@
4
4
 
5
5
  use crate::Result;
6
6
  use crate::core::config::LanguageDetectionConfig;
7
- use once_cell::sync::Lazy;
8
- use std::sync::Arc;
9
7
  use whatlang::{Lang, detect};
10
8
 
11
- pub mod processor;
12
- pub use processor::LanguageDetector;
13
-
14
9
  /// Detect languages in text using whatlang.
15
10
  ///
16
11
  /// Returns a list of detected language codes (ISO 639-3 format).
@@ -185,44 +180,6 @@ fn lang_to_iso639_3(lang: Lang) -> String {
185
180
  .to_string()
186
181
  }
187
182
 
188
- /// Register the language detection processor with the global registry.
189
- ///
190
- /// This function should be called once at application startup to register
191
- /// the language detection post-processor.
192
- ///
193
- /// **Note:** This is called automatically on first use.
194
- /// Explicit calling is optional.
195
- pub fn register_language_detection_processor() -> Result<()> {
196
- let registry = crate::plugins::registry::get_post_processor_registry();
197
- let mut registry = registry
198
- .write()
199
- .map_err(|e| crate::KreuzbergError::Other(format!("Post-processor registry lock poisoned: {}", e)))?;
200
-
201
- registry.register(Arc::new(LanguageDetector), 40)?;
202
-
203
- Ok(())
204
- }
205
-
206
- /// Lazy-initialized flag that ensures language detection processor is registered exactly once.
207
- ///
208
- /// This static is accessed on first use to automatically register the
209
- /// language detection processor with the plugin registry.
210
- static PROCESSOR_INITIALIZED: Lazy<Result<()>> = Lazy::new(register_language_detection_processor);
211
-
212
- /// Ensure the language detection processor is registered.
213
- ///
214
- /// This function is called automatically when needed.
215
- /// It's safe to call multiple times - registration only happens once.
216
- pub fn ensure_initialized() -> Result<()> {
217
- PROCESSOR_INITIALIZED
218
- .as_ref()
219
- .map(|_| ())
220
- .map_err(|e| crate::KreuzbergError::Plugin {
221
- message: format!("Failed to register language detection processor: {}", e),
222
- plugin_name: "language-detection".to_string(),
223
- })
224
- }
225
-
226
183
  #[cfg(test)]
227
184
  mod tests {
228
185
  use super::*;
@@ -719,57 +676,6 @@ mod tests {
719
676
  assert_eq!(langs[0], "eng");
720
677
  }
721
678
 
722
- #[test]
723
- fn test_medical_terminology() {
724
- let text = "The patient presented with acute myocardial infarction and was administered thrombolytic therapy. \
725
- The electrocardiogram showed significant ST-segment elevation in the anterior leads. \
726
- Cardiac biomarkers including troponin and creatine kinase were significantly elevated.";
727
- let config = LanguageDetectionConfig {
728
- enabled: true,
729
- min_confidence: 0.5,
730
- detect_multiple: false,
731
- };
732
-
733
- let result = detect_languages(text, &config).unwrap();
734
- assert!(result.is_some());
735
- let langs = result.unwrap();
736
- assert_eq!(langs[0], "eng");
737
- }
738
-
739
- #[test]
740
- fn test_legal_terminology() {
741
- let text = "The plaintiff hereby alleges that the defendant breached the contractual obligations as stipulated in the aforementioned agreement. \
742
- Pursuant to clause 5.2, the defendant was required to provide adequate consideration within thirty days of execution. \
743
- The court finds that the preponderance of evidence supports the plaintiff's claims.";
744
- let config = LanguageDetectionConfig {
745
- enabled: true,
746
- min_confidence: 0.5,
747
- detect_multiple: false,
748
- };
749
-
750
- let result = detect_languages(text, &config).unwrap();
751
- assert!(result.is_some());
752
- let langs = result.unwrap();
753
- assert_eq!(langs[0], "eng");
754
- }
755
-
756
- #[test]
757
- fn test_scientific_terminology() {
758
- let text = "The experimental protocol involved spectrophotometric analysis using ultraviolet-visible spectroscopy. \
759
- Quantum mechanical calculations were performed using density functional theory at the B3LYP level. \
760
- The results demonstrated significant correlation between molecular structure and optical properties.";
761
- let config = LanguageDetectionConfig {
762
- enabled: true,
763
- min_confidence: 0.5,
764
- detect_multiple: false,
765
- };
766
-
767
- let result = detect_languages(text, &config).unwrap();
768
- assert!(result.is_some());
769
- let langs = result.unwrap();
770
- assert_eq!(langs[0], "eng");
771
- }
772
-
773
679
  #[test]
774
680
  fn test_code_with_comments() {
775
681
  let text = r#"
@@ -845,6 +751,57 @@ mod tests {
845
751
  assert_eq!(langs[0], "eng");
846
752
  }
847
753
 
754
+ #[test]
755
+ fn test_medical_terminology() {
756
+ let text = "The patient presented with acute myocardial infarction and was administered thrombolytic therapy. \
757
+ The electrocardiogram showed significant ST-segment elevation in the anterior leads. \
758
+ Cardiac biomarkers including troponin and creatine kinase were significantly elevated.";
759
+ let config = LanguageDetectionConfig {
760
+ enabled: true,
761
+ min_confidence: 0.5,
762
+ detect_multiple: false,
763
+ };
764
+
765
+ let result = detect_languages(text, &config).unwrap();
766
+ assert!(result.is_some());
767
+ let langs = result.unwrap();
768
+ assert_eq!(langs[0], "eng");
769
+ }
770
+
771
+ #[test]
772
+ fn test_legal_terminology() {
773
+ let text = "The plaintiff hereby alleges that the defendant breached the contractual obligations as stipulated in the aforementioned agreement. \
774
+ Pursuant to clause 5.2, the defendant was required to provide adequate consideration within thirty days of execution. \
775
+ The court finds that the preponderance of evidence supports the plaintiff's claims.";
776
+ let config = LanguageDetectionConfig {
777
+ enabled: true,
778
+ min_confidence: 0.5,
779
+ detect_multiple: false,
780
+ };
781
+
782
+ let result = detect_languages(text, &config).unwrap();
783
+ assert!(result.is_some());
784
+ let langs = result.unwrap();
785
+ assert_eq!(langs[0], "eng");
786
+ }
787
+
788
+ #[test]
789
+ fn test_scientific_terminology() {
790
+ let text = "The experimental protocol involved spectrophotometric analysis using ultraviolet-visible spectroscopy. \
791
+ Quantum mechanical calculations were performed using density functional theory at the B3LYP level. \
792
+ The results demonstrated significant correlation between molecular structure and optical properties.";
793
+ let config = LanguageDetectionConfig {
794
+ enabled: true,
795
+ min_confidence: 0.5,
796
+ detect_multiple: false,
797
+ };
798
+
799
+ let result = detect_languages(text, &config).unwrap();
800
+ assert!(result.is_some());
801
+ let langs = result.unwrap();
802
+ assert_eq!(langs[0], "eng");
803
+ }
804
+
848
805
  #[test]
849
806
  fn test_latin_cyrillic_mix() {
850
807
  let text = format!(
@@ -39,10 +39,11 @@ pub mod core;
39
39
  pub mod error;
40
40
  pub mod extraction;
41
41
  pub mod extractors;
42
- pub mod panic_context;
43
42
  pub mod plugins;
44
43
  pub mod text;
45
44
  pub mod types;
45
+
46
+ #[cfg(feature = "quality")]
46
47
  pub mod utils;
47
48
 
48
49
  #[cfg(feature = "api")]
@@ -78,34 +79,21 @@ pub mod pdf;
78
79
  pub use error::{KreuzbergError, Result};
79
80
  pub use types::*;
80
81
 
81
- #[cfg(feature = "tokio-runtime")]
82
- pub use core::extractor::{batch_extract_bytes, batch_extract_file};
83
- pub use core::extractor::{extract_bytes, extract_file};
84
-
85
- pub use core::extractor::{batch_extract_bytes_sync, extract_bytes_sync};
82
+ pub use core::extractor::{batch_extract_bytes, batch_extract_file, extract_bytes, extract_file};
86
83
 
87
- #[cfg(feature = "tokio-runtime")]
88
- pub use core::extractor::{batch_extract_file_sync, extract_file_sync};
84
+ pub use core::extractor::{batch_extract_bytes_sync, batch_extract_file_sync, extract_bytes_sync, extract_file_sync};
89
85
 
90
86
  pub use core::config::{
91
87
  ChunkingConfig, EmbeddingConfig, EmbeddingModelType, ExtractionConfig, ImageExtractionConfig,
92
- LanguageDetectionConfig, OcrConfig, PostProcessorConfig, TokenReductionConfig,
88
+ LanguageDetectionConfig, OcrConfig, PdfConfig, PostProcessorConfig, TokenReductionConfig,
93
89
  };
94
90
 
95
- #[cfg(feature = "api")]
96
- pub use core::server_config::ServerConfig;
97
-
98
- #[cfg(feature = "pdf")]
99
- pub use core::config::PdfConfig;
100
-
101
91
  pub use core::mime::{
102
92
  DOCX_MIME_TYPE, EXCEL_MIME_TYPE, HTML_MIME_TYPE, JSON_MIME_TYPE, MARKDOWN_MIME_TYPE, PDF_MIME_TYPE,
103
93
  PLAIN_TEXT_MIME_TYPE, POWER_POINT_MIME_TYPE, XML_MIME_TYPE, detect_mime_type, detect_mime_type_from_bytes,
104
94
  detect_or_validate, get_extensions_for_mime, validate_mime_type,
105
95
  };
106
96
 
107
- pub use core::formats::{KNOWN_FORMATS, is_valid_format_field};
108
-
109
97
  pub use plugins::registry::{
110
98
  get_document_extractor_registry, get_ocr_backend_registry, get_post_processor_registry, get_validator_registry,
111
99
  };