kreuzberg 4.0.0.pre.rc.29 → 4.0.0.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (321)
  1. checksums.yaml +4 -4
  2. data/.gitignore +0 -6
  3. data/.rubocop.yaml +534 -1
  4. data/Gemfile +2 -1
  5. data/Gemfile.lock +28 -116
  6. data/README.md +269 -629
  7. data/Rakefile +0 -9
  8. data/Steepfile +4 -8
  9. data/examples/async_patterns.rb +58 -1
  10. data/ext/kreuzberg_rb/extconf.rb +5 -35
  11. data/ext/kreuzberg_rb/native/Cargo.toml +16 -55
  12. data/ext/kreuzberg_rb/native/build.rs +14 -12
  13. data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
  14. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
  15. data/ext/kreuzberg_rb/native/include/strings.h +2 -2
  16. data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
  17. data/ext/kreuzberg_rb/native/src/lib.rs +34 -897
  18. data/extconf.rb +6 -38
  19. data/kreuzberg.gemspec +20 -114
  20. data/lib/kreuzberg/api_proxy.rb +18 -2
  21. data/lib/kreuzberg/cache_api.rb +0 -22
  22. data/lib/kreuzberg/cli.rb +10 -2
  23. data/lib/kreuzberg/cli_proxy.rb +10 -0
  24. data/lib/kreuzberg/config.rb +22 -274
  25. data/lib/kreuzberg/errors.rb +7 -73
  26. data/lib/kreuzberg/extraction_api.rb +8 -237
  27. data/lib/kreuzberg/mcp_proxy.rb +11 -2
  28. data/lib/kreuzberg/ocr_backend_protocol.rb +73 -0
  29. data/lib/kreuzberg/post_processor_protocol.rb +71 -0
  30. data/lib/kreuzberg/result.rb +33 -151
  31. data/lib/kreuzberg/setup_lib_path.rb +2 -22
  32. data/lib/kreuzberg/validator_protocol.rb +73 -0
  33. data/lib/kreuzberg/version.rb +1 -1
  34. data/lib/kreuzberg.rb +13 -27
  35. data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
  36. data/sig/kreuzberg.rbs +12 -105
  37. data/spec/binding/cache_spec.rb +22 -22
  38. data/spec/binding/cli_proxy_spec.rb +4 -2
  39. data/spec/binding/cli_spec.rb +11 -12
  40. data/spec/binding/config_spec.rb +0 -74
  41. data/spec/binding/config_validation_spec.rb +6 -100
  42. data/spec/binding/error_handling_spec.rb +97 -283
  43. data/spec/binding/plugins/ocr_backend_spec.rb +8 -8
  44. data/spec/binding/plugins/postprocessor_spec.rb +11 -11
  45. data/spec/binding/plugins/validator_spec.rb +13 -12
  46. data/spec/examples.txt +104 -0
  47. data/spec/fixtures/config.toml +1 -0
  48. data/spec/fixtures/config.yaml +1 -0
  49. data/spec/fixtures/invalid_config.toml +1 -0
  50. data/spec/smoke/package_spec.rb +3 -2
  51. data/spec/spec_helper.rb +3 -1
  52. data/vendor/kreuzberg/Cargo.toml +67 -192
  53. data/vendor/kreuzberg/README.md +9 -97
  54. data/vendor/kreuzberg/build.rs +194 -516
  55. data/vendor/kreuzberg/src/api/handlers.rs +9 -130
  56. data/vendor/kreuzberg/src/api/mod.rs +3 -18
  57. data/vendor/kreuzberg/src/api/server.rs +71 -236
  58. data/vendor/kreuzberg/src/api/types.rs +7 -43
  59. data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
  60. data/vendor/kreuzberg/src/cache/mod.rs +3 -27
  61. data/vendor/kreuzberg/src/chunking/mod.rs +79 -1705
  62. data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
  63. data/vendor/kreuzberg/src/core/config.rs +23 -905
  64. data/vendor/kreuzberg/src/core/extractor.rs +106 -403
  65. data/vendor/kreuzberg/src/core/io.rs +2 -4
  66. data/vendor/kreuzberg/src/core/mime.rs +12 -2
  67. data/vendor/kreuzberg/src/core/mod.rs +3 -22
  68. data/vendor/kreuzberg/src/core/pipeline.rs +78 -395
  69. data/vendor/kreuzberg/src/embeddings.rs +21 -169
  70. data/vendor/kreuzberg/src/error.rs +2 -2
  71. data/vendor/kreuzberg/src/extraction/archive.rs +31 -36
  72. data/vendor/kreuzberg/src/extraction/docx.rs +1 -365
  73. data/vendor/kreuzberg/src/extraction/email.rs +11 -12
  74. data/vendor/kreuzberg/src/extraction/excel.rs +129 -138
  75. data/vendor/kreuzberg/src/extraction/html.rs +170 -1447
  76. data/vendor/kreuzberg/src/extraction/image.rs +14 -138
  77. data/vendor/kreuzberg/src/extraction/libreoffice.rs +3 -13
  78. data/vendor/kreuzberg/src/extraction/mod.rs +5 -21
  79. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
  80. data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
  81. data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
  82. data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
  83. data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
  84. data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
  85. data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
  86. data/vendor/kreuzberg/src/extraction/pptx.rs +94 -196
  87. data/vendor/kreuzberg/src/extraction/structured.rs +4 -5
  88. data/vendor/kreuzberg/src/extraction/table.rs +1 -2
  89. data/vendor/kreuzberg/src/extraction/text.rs +10 -18
  90. data/vendor/kreuzberg/src/extractors/archive.rs +0 -22
  91. data/vendor/kreuzberg/src/extractors/docx.rs +148 -69
  92. data/vendor/kreuzberg/src/extractors/email.rs +9 -37
  93. data/vendor/kreuzberg/src/extractors/excel.rs +40 -81
  94. data/vendor/kreuzberg/src/extractors/html.rs +173 -182
  95. data/vendor/kreuzberg/src/extractors/image.rs +8 -32
  96. data/vendor/kreuzberg/src/extractors/mod.rs +10 -171
  97. data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
  98. data/vendor/kreuzberg/src/extractors/pdf.rs +64 -329
  99. data/vendor/kreuzberg/src/extractors/pptx.rs +34 -79
  100. data/vendor/kreuzberg/src/extractors/structured.rs +0 -16
  101. data/vendor/kreuzberg/src/extractors/text.rs +7 -30
  102. data/vendor/kreuzberg/src/extractors/xml.rs +8 -27
  103. data/vendor/kreuzberg/src/keywords/processor.rs +1 -9
  104. data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
  105. data/vendor/kreuzberg/src/language_detection/mod.rs +51 -94
  106. data/vendor/kreuzberg/src/lib.rs +5 -17
  107. data/vendor/kreuzberg/src/mcp/mod.rs +1 -4
  108. data/vendor/kreuzberg/src/mcp/server.rs +21 -145
  109. data/vendor/kreuzberg/src/ocr/mod.rs +0 -2
  110. data/vendor/kreuzberg/src/ocr/processor.rs +8 -19
  111. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +0 -2
  112. data/vendor/kreuzberg/src/pdf/error.rs +1 -93
  113. data/vendor/kreuzberg/src/pdf/metadata.rs +100 -263
  114. data/vendor/kreuzberg/src/pdf/mod.rs +2 -33
  115. data/vendor/kreuzberg/src/pdf/rendering.rs +12 -12
  116. data/vendor/kreuzberg/src/pdf/table.rs +64 -61
  117. data/vendor/kreuzberg/src/pdf/text.rs +24 -416
  118. data/vendor/kreuzberg/src/plugins/extractor.rs +8 -40
  119. data/vendor/kreuzberg/src/plugins/mod.rs +0 -3
  120. data/vendor/kreuzberg/src/plugins/ocr.rs +14 -22
  121. data/vendor/kreuzberg/src/plugins/processor.rs +1 -10
  122. data/vendor/kreuzberg/src/plugins/registry.rs +0 -15
  123. data/vendor/kreuzberg/src/plugins/validator.rs +8 -20
  124. data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
  125. data/vendor/kreuzberg/src/text/mod.rs +0 -8
  126. data/vendor/kreuzberg/src/text/quality.rs +15 -28
  127. data/vendor/kreuzberg/src/text/string_utils.rs +10 -22
  128. data/vendor/kreuzberg/src/text/token_reduction/core.rs +50 -86
  129. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +16 -37
  130. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +1 -2
  131. data/vendor/kreuzberg/src/types.rs +67 -907
  132. data/vendor/kreuzberg/src/utils/mod.rs +0 -14
  133. data/vendor/kreuzberg/src/utils/quality.rs +3 -12
  134. data/vendor/kreuzberg/tests/api_tests.rs +0 -506
  135. data/vendor/kreuzberg/tests/archive_integration.rs +0 -2
  136. data/vendor/kreuzberg/tests/batch_orchestration.rs +12 -57
  137. data/vendor/kreuzberg/tests/batch_processing.rs +8 -32
  138. data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
  139. data/vendor/kreuzberg/tests/concurrency_stress.rs +8 -40
  140. data/vendor/kreuzberg/tests/config_features.rs +1 -33
  141. data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -16
  142. data/vendor/kreuzberg/tests/core_integration.rs +9 -35
  143. data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
  144. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
  145. data/vendor/kreuzberg/tests/email_integration.rs +1 -3
  146. data/vendor/kreuzberg/tests/error_handling.rs +34 -43
  147. data/vendor/kreuzberg/tests/format_integration.rs +1 -7
  148. data/vendor/kreuzberg/tests/helpers/mod.rs +0 -60
  149. data/vendor/kreuzberg/tests/image_integration.rs +0 -2
  150. data/vendor/kreuzberg/tests/mime_detection.rs +16 -17
  151. data/vendor/kreuzberg/tests/ocr_configuration.rs +0 -4
  152. data/vendor/kreuzberg/tests/ocr_errors.rs +0 -22
  153. data/vendor/kreuzberg/tests/ocr_quality.rs +0 -2
  154. data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
  155. data/vendor/kreuzberg/tests/pdf_integration.rs +0 -2
  156. data/vendor/kreuzberg/tests/pipeline_integration.rs +2 -36
  157. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +0 -5
  158. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -17
  159. data/vendor/kreuzberg/tests/plugin_system.rs +0 -6
  160. data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -2
  161. data/vendor/kreuzberg/tests/security_validation.rs +1 -13
  162. data/vendor/kreuzberg/tests/test_fastembed.rs +23 -45
  163. metadata +25 -171
  164. data/.rubocop.yml +0 -543
  165. data/ext/kreuzberg_rb/native/.cargo/config.toml +0 -23
  166. data/ext/kreuzberg_rb/native/Cargo.lock +0 -7619
  167. data/lib/kreuzberg/error_context.rb +0 -136
  168. data/lib/kreuzberg/types.rb +0 -170
  169. data/lib/libpdfium.so +0 -0
  170. data/spec/binding/async_operations_spec.rb +0 -473
  171. data/spec/binding/batch_operations_spec.rb +0 -595
  172. data/spec/binding/batch_spec.rb +0 -359
  173. data/spec/binding/config_result_spec.rb +0 -377
  174. data/spec/binding/embeddings_spec.rb +0 -816
  175. data/spec/binding/error_recovery_spec.rb +0 -488
  176. data/spec/binding/font_config_spec.rb +0 -220
  177. data/spec/binding/images_spec.rb +0 -738
  178. data/spec/binding/keywords_extraction_spec.rb +0 -600
  179. data/spec/binding/metadata_types_spec.rb +0 -1228
  180. data/spec/binding/pages_extraction_spec.rb +0 -471
  181. data/spec/binding/tables_spec.rb +0 -641
  182. data/spec/unit/config/chunking_config_spec.rb +0 -213
  183. data/spec/unit/config/embedding_config_spec.rb +0 -343
  184. data/spec/unit/config/extraction_config_spec.rb +0 -438
  185. data/spec/unit/config/font_config_spec.rb +0 -285
  186. data/spec/unit/config/hierarchy_config_spec.rb +0 -314
  187. data/spec/unit/config/image_extraction_config_spec.rb +0 -209
  188. data/spec/unit/config/image_preprocessing_config_spec.rb +0 -249
  189. data/spec/unit/config/keyword_config_spec.rb +0 -229
  190. data/spec/unit/config/language_detection_config_spec.rb +0 -258
  191. data/spec/unit/config/ocr_config_spec.rb +0 -171
  192. data/spec/unit/config/page_config_spec.rb +0 -221
  193. data/spec/unit/config/pdf_config_spec.rb +0 -267
  194. data/spec/unit/config/postprocessor_config_spec.rb +0 -290
  195. data/spec/unit/config/tesseract_config_spec.rb +0 -181
  196. data/spec/unit/config/token_reduction_config_spec.rb +0 -251
  197. data/test/metadata_types_test.rb +0 -959
  198. data/vendor/Cargo.toml +0 -61
  199. data/vendor/kreuzberg/examples/bench_fixes.rs +0 -71
  200. data/vendor/kreuzberg/examples/test_pdfium_fork.rs +0 -62
  201. data/vendor/kreuzberg/src/chunking/processor.rs +0 -219
  202. data/vendor/kreuzberg/src/core/batch_optimizations.rs +0 -385
  203. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  204. data/vendor/kreuzberg/src/core/formats.rs +0 -235
  205. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  206. data/vendor/kreuzberg/src/extraction/capacity.rs +0 -263
  207. data/vendor/kreuzberg/src/extraction/markdown.rs +0 -216
  208. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -284
  209. data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -470
  210. data/vendor/kreuzberg/src/extractors/docbook.rs +0 -504
  211. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  212. data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -492
  213. data/vendor/kreuzberg/src/extractors/jats.rs +0 -1054
  214. data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -368
  215. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  216. data/vendor/kreuzberg/src/extractors/markdown.rs +0 -701
  217. data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
  218. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  219. data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -529
  220. data/vendor/kreuzberg/src/extractors/rst.rs +0 -577
  221. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  222. data/vendor/kreuzberg/src/extractors/security.rs +0 -484
  223. data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
  224. data/vendor/kreuzberg/src/extractors/typst.rs +0 -651
  225. data/vendor/kreuzberg/src/language_detection/processor.rs +0 -218
  226. data/vendor/kreuzberg/src/ocr/language_registry.rs +0 -520
  227. data/vendor/kreuzberg/src/panic_context.rs +0 -154
  228. data/vendor/kreuzberg/src/pdf/bindings.rs +0 -306
  229. data/vendor/kreuzberg/src/pdf/bundled.rs +0 -408
  230. data/vendor/kreuzberg/src/pdf/fonts.rs +0 -358
  231. data/vendor/kreuzberg/src/pdf/hierarchy.rs +0 -903
  232. data/vendor/kreuzberg/src/text/quality_processor.rs +0 -231
  233. data/vendor/kreuzberg/src/text/utf8_validation.rs +0 -193
  234. data/vendor/kreuzberg/src/utils/pool.rs +0 -503
  235. data/vendor/kreuzberg/src/utils/pool_sizing.rs +0 -364
  236. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -761
  237. data/vendor/kreuzberg/tests/api_embed.rs +0 -360
  238. data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
  239. data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +0 -471
  240. data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +0 -289
  241. data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +0 -154
  242. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
  243. data/vendor/kreuzberg/tests/config_integration_test.rs +0 -753
  244. data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +0 -294
  245. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -500
  246. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
  247. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
  248. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
  249. data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
  250. data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
  251. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
  252. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
  253. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
  254. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
  255. data/vendor/kreuzberg/tests/ocr_language_registry.rs +0 -191
  256. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -674
  257. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
  258. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
  259. data/vendor/kreuzberg/tests/page_markers.rs +0 -297
  260. data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +0 -301
  261. data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +0 -589
  262. data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +0 -301
  263. data/vendor/kreuzberg/tests/pdf_text_merging.rs +0 -475
  264. data/vendor/kreuzberg/tests/pdfium_linking.rs +0 -340
  265. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -694
  266. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -775
  267. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1260
  268. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -648
  269. data/vendor/kreuzberg-ffi/Cargo.toml +0 -67
  270. data/vendor/kreuzberg-ffi/README.md +0 -851
  271. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +0 -227
  272. data/vendor/kreuzberg-ffi/build.rs +0 -168
  273. data/vendor/kreuzberg-ffi/cbindgen.toml +0 -37
  274. data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +0 -12
  275. data/vendor/kreuzberg-ffi/kreuzberg.h +0 -3012
  276. data/vendor/kreuzberg-ffi/src/batch_streaming.rs +0 -588
  277. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
  278. data/vendor/kreuzberg-ffi/src/error.rs +0 -901
  279. data/vendor/kreuzberg-ffi/src/extraction.rs +0 -555
  280. data/vendor/kreuzberg-ffi/src/helpers.rs +0 -879
  281. data/vendor/kreuzberg-ffi/src/lib.rs +0 -977
  282. data/vendor/kreuzberg-ffi/src/memory.rs +0 -493
  283. data/vendor/kreuzberg-ffi/src/mime.rs +0 -329
  284. data/vendor/kreuzberg-ffi/src/panic_shield.rs +0 -265
  285. data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +0 -442
  286. data/vendor/kreuzberg-ffi/src/plugins/mod.rs +0 -14
  287. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +0 -628
  288. data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +0 -438
  289. data/vendor/kreuzberg-ffi/src/plugins/validator.rs +0 -329
  290. data/vendor/kreuzberg-ffi/src/result.rs +0 -510
  291. data/vendor/kreuzberg-ffi/src/result_pool.rs +0 -639
  292. data/vendor/kreuzberg-ffi/src/result_view.rs +0 -773
  293. data/vendor/kreuzberg-ffi/src/string_intern.rs +0 -568
  294. data/vendor/kreuzberg-ffi/src/types.rs +0 -363
  295. data/vendor/kreuzberg-ffi/src/util.rs +0 -210
  296. data/vendor/kreuzberg-ffi/src/validation.rs +0 -848
  297. data/vendor/kreuzberg-ffi/tests.disabled/README.md +0 -48
  298. data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +0 -299
  299. data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +0 -346
  300. data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +0 -232
  301. data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +0 -470
  302. data/vendor/kreuzberg-tesseract/.commitlintrc.json +0 -13
  303. data/vendor/kreuzberg-tesseract/.crate-ignore +0 -2
  304. data/vendor/kreuzberg-tesseract/Cargo.lock +0 -2933
  305. data/vendor/kreuzberg-tesseract/Cargo.toml +0 -57
  306. data/vendor/kreuzberg-tesseract/LICENSE +0 -22
  307. data/vendor/kreuzberg-tesseract/README.md +0 -399
  308. data/vendor/kreuzberg-tesseract/build.rs +0 -1127
  309. data/vendor/kreuzberg-tesseract/patches/README.md +0 -71
  310. data/vendor/kreuzberg-tesseract/patches/tesseract.diff +0 -199
  311. data/vendor/kreuzberg-tesseract/src/api.rs +0 -1371
  312. data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +0 -77
  313. data/vendor/kreuzberg-tesseract/src/enums.rs +0 -297
  314. data/vendor/kreuzberg-tesseract/src/error.rs +0 -81
  315. data/vendor/kreuzberg-tesseract/src/lib.rs +0 -145
  316. data/vendor/kreuzberg-tesseract/src/monitor.rs +0 -57
  317. data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +0 -197
  318. data/vendor/kreuzberg-tesseract/src/page_iterator.rs +0 -253
  319. data/vendor/kreuzberg-tesseract/src/result_iterator.rs +0 -286
  320. data/vendor/kreuzberg-tesseract/src/result_renderer.rs +0 -183
  321. data/vendor/kreuzberg-tesseract/tests/integration_test.rs +0 -211
@@ -11,30 +11,6 @@
11
11
  //! - Batch processing for efficient embedding generation
12
12
  //! - Optional GPU acceleration via ONNX Runtime execution providers
13
13
  //!
14
- //! # ONNX Runtime Requirement
15
- //!
16
- //! **CRITICAL**: This module requires ONNX Runtime to be installed on the system.
17
- //! The `embeddings` feature uses dynamic loading (`ort-load-dynamic`), which detects
18
- //! the ONNX Runtime library at runtime.
19
- //!
20
- //! ## Installation Instructions
21
- //!
22
- //! - **macOS**: `brew install onnxruntime`
23
- //! - **Linux (Ubuntu/Debian)**: `apt install libonnxruntime libonnxruntime-dev`
24
- //! - **Linux (Fedora)**: `dnf install onnxruntime onnxruntime-devel`
25
- //! - **Linux (Arch)**: `pacman -S onnxruntime`
26
- //! - **Windows (MSVC)**: Download from https://github.com/microsoft/onnxruntime/releases and add to PATH
27
- //!
28
- //! Alternatively, set the `ORT_DYLIB_PATH` environment variable to the ONNX Runtime library path.
29
- //!
30
- //! For Docker/containers, install via package manager in your base image.
31
- //! Verified packages: Ubuntu 22.04+, Fedora 38+, Arch Linux.
32
- //!
33
- //! ## Platform Limitations
34
- //!
35
- //! **Windows MinGW builds are not supported**. ONNX Runtime requires the MSVC toolchain on Windows.
36
- //! Please use Windows MSVC builds or disable the embeddings feature.
37
- //!
38
14
  //! # Example
39
15
  //!
40
16
  //! ```rust,ignore
@@ -67,107 +43,11 @@ use std::sync::{Arc, Mutex, RwLock};
67
43
  use std::collections::HashMap;
68
44
 
69
45
  #[cfg(feature = "embeddings")]
70
- use std::mem::ManuallyDrop;
46
+ use lazy_static::lazy_static;
71
47
 
72
48
  #[cfg(feature = "embeddings")]
73
- use once_cell::sync::Lazy;
74
-
75
- /// Wrapper for TextEmbedding that prevents cleanup during process shutdown.
76
- ///
77
- /// # Problem
78
- ///
79
- /// ONNX Runtime's C++ destructors fail during process shutdown when trying to
80
- /// acquire mutexes that have already been torn down by the C++ runtime. This
81
- /// causes crashes with "mutex lock failed: Invalid argument" errors.
82
- ///
83
- /// This is a known issue in `ort` v2.0.0-rc.10 (pykeio/ort#441) that was fixed
84
- /// in later versions, but we're constrained by fastembed's dependency tree.
85
- ///
86
- /// # Solution
87
- ///
88
- /// We prevent all cleanup of ONNX Runtime resources:
89
- /// 1. Individual TextEmbedding objects are leaked via Box::leak
90
- /// 2. The entire MODEL_CACHE is wrapped in ManuallyDrop
91
- ///
92
- /// This prevents Drop implementations from running during shutdown, completely
93
- /// avoiding the mutex errors. The OS reclaims all memory on process exit anyway.
94
- ///
95
- /// Thread-safe wrapper for leaked TextEmbedding that allows interior mutability.
96
- ///
97
- /// This wrapper holds a raw pointer to a leaked `TextEmbedding` and provides
98
- /// safe access through the Mutex lock in MODEL_CACHE.
99
- #[cfg(feature = "embeddings")]
100
- pub(crate) struct LeakedModel {
101
- ptr: *mut TextEmbedding,
102
- }
103
-
104
- #[cfg(feature = "embeddings")]
105
- impl LeakedModel {
106
- fn new(model: TextEmbedding) -> Self {
107
- Self {
108
- ptr: Box::into_raw(Box::new(model)),
109
- }
110
- }
111
-
112
- /// Get a mutable reference to the model.
113
- ///
114
- /// # Safety
115
- ///
116
- /// This is safe to call only when:
117
- /// 1. The caller has exclusive access (guaranteed by Mutex in MODEL_CACHE)
118
- /// 2. The pointer is valid (guaranteed by Box::into_raw and never deallocating)
119
- #[allow(unsafe_code, clippy::mut_from_ref)]
120
- unsafe fn get_mut(&self) -> &mut TextEmbedding {
121
- unsafe { &mut *self.ptr }
122
- }
123
- }
124
-
125
- #[cfg(feature = "embeddings")]
126
- #[allow(unsafe_code)]
127
- unsafe impl Send for LeakedModel {}
128
- #[cfg(feature = "embeddings")]
129
- #[allow(unsafe_code)]
130
- unsafe impl Sync for LeakedModel {}
131
-
132
- #[cfg(feature = "embeddings")]
133
- type CachedEmbedding = Arc<Mutex<LeakedModel>>;
134
-
135
- /// Global model cache wrapped in ManuallyDrop to prevent cleanup during process exit.
136
- ///
137
- /// We use Lazy + ManuallyDrop because ONNX Runtime's C++ destructors fail during static
138
- /// destruction when mutexes are already torn down. By never dropping this cache,
139
- /// we avoid the mutex errors at shutdown. The OS reclaims memory on process exit anyway.
140
- #[cfg(feature = "embeddings")]
141
- static MODEL_CACHE: Lazy<ManuallyDrop<RwLock<HashMap<String, CachedEmbedding>>>> =
142
- Lazy::new(|| ManuallyDrop::new(RwLock::new(HashMap::new())));
143
-
144
- /// Returns installation instructions for ONNX Runtime.
145
- #[cfg(feature = "embeddings")]
146
- fn onnx_runtime_install_message() -> String {
147
- #[cfg(all(windows, target_env = "gnu"))]
148
- {
149
- return "ONNX Runtime embeddings are not supported on Windows MinGW builds. \
150
- ONNX Runtime requires MSVC toolchain. \
151
- Please use Windows MSVC builds or disable embeddings feature."
152
- .to_string();
153
- }
154
-
155
- #[cfg(not(all(windows, target_env = "gnu")))]
156
- {
157
- "ONNX Runtime is required for embeddings functionality. \
158
- Install: \
159
- macOS: 'brew install onnxruntime', \
160
- Linux (Ubuntu/Debian): 'apt install libonnxruntime libonnxruntime-dev', \
161
- Linux (Fedora): 'dnf install onnxruntime onnxruntime-devel', \
162
- Linux (Arch): 'pacman -S onnxruntime', \
163
- Windows (MSVC): Download from https://github.com/microsoft/onnxruntime/releases and add to PATH. \
164
- \
165
- Alternatively, set ORT_DYLIB_PATH environment variable to the ONNX Runtime library path. \
166
- \
167
- For Docker/containers: Install via package manager in your base image. \
168
- Verified packages: Ubuntu 22.04+, Fedora 38+, Arch Linux."
169
- .to_string()
170
- }
49
+ lazy_static! {
50
+ static ref MODEL_CACHE: RwLock<HashMap<String, Arc<Mutex<TextEmbedding>>>> = RwLock::new(HashMap::new());
171
51
  }
172
52
 
173
53
  /// Get or initialize a text embedding model from cache.
@@ -175,11 +55,10 @@ fn onnx_runtime_install_message() -> String {
175
55
  /// This function ensures models are initialized only once and reused across
176
56
  /// the application, avoiding redundant downloads and initialization overhead.
177
57
  #[cfg(feature = "embeddings")]
178
- #[allow(private_interfaces)]
179
58
  pub fn get_or_init_model(
180
59
  model: EmbeddingModel,
181
60
  cache_dir: Option<std::path::PathBuf>,
182
- ) -> crate::Result<CachedEmbedding> {
61
+ ) -> crate::Result<Arc<Mutex<TextEmbedding>>> {
183
62
  let cache_directory = cache_dir.unwrap_or_else(|| {
184
63
  let mut path = std::env::current_dir().unwrap_or_else(|_| std::path::PathBuf::from("."));
185
64
  path.push(".kreuzberg");
@@ -190,26 +69,21 @@ pub fn get_or_init_model(
190
69
  let model_key = format!("{:?}_{}", model, cache_directory.display());
191
70
 
192
71
  {
193
- match MODEL_CACHE.read() {
194
- Ok(cache) => {
195
- if let Some(cached_model) = cache.get(&model_key) {
196
- return Ok(Arc::clone(cached_model));
197
- }
198
- }
199
- Err(poison_error) => {
200
- let cache = poison_error.get_ref();
201
- if let Some(cached_model) = cache.get(&model_key) {
202
- return Ok(Arc::clone(cached_model));
203
- }
204
- }
72
+ let cache = MODEL_CACHE.read().map_err(|e| crate::KreuzbergError::Plugin {
73
+ message: format!("Failed to acquire model cache read lock: {}", e),
74
+ plugin_name: "embeddings".to_string(),
75
+ })?;
76
+
77
+ if let Some(cached_model) = cache.get(&model_key) {
78
+ return Ok(Arc::clone(cached_model));
205
79
  }
206
80
  }
207
81
 
208
82
  {
209
- let mut cache = match MODEL_CACHE.write() {
210
- Ok(guard) => guard,
211
- Err(poison_error) => poison_error.into_inner(),
212
- };
83
+ let mut cache = MODEL_CACHE.write().map_err(|e| crate::KreuzbergError::Plugin {
84
+ message: format!("Failed to acquire model cache write lock: {}", e),
85
+ plugin_name: "embeddings".to_string(),
86
+ })?;
213
87
 
214
88
  if let Some(cached_model) = cache.get(&model_key) {
215
89
  return Ok(Arc::clone(cached_model));
@@ -218,27 +92,12 @@ pub fn get_or_init_model(
218
92
  let mut init_options = InitOptions::new(model);
219
93
  init_options = init_options.with_cache_dir(cache_directory);
220
94
 
221
- let embedding_model = TextEmbedding::try_new(init_options).map_err(|e| {
222
- let error_msg = e.to_string();
223
-
224
- if error_msg.contains("onnxruntime")
225
- || error_msg.contains("ORT")
226
- || error_msg.contains("libonnxruntime")
227
- || error_msg.contains("onnxruntime.dll")
228
- || error_msg.contains("Unable to load")
229
- || error_msg.contains("library load failed")
230
- {
231
- crate::KreuzbergError::MissingDependency(format!("ONNX Runtime - {}", onnx_runtime_install_message()))
232
- } else {
233
- crate::KreuzbergError::Plugin {
234
- message: format!("Failed to initialize embedding model: {}", e),
235
- plugin_name: "embeddings".to_string(),
236
- }
237
- }
95
+ let embedding_model = TextEmbedding::try_new(init_options).map_err(|e| crate::KreuzbergError::Plugin {
96
+ message: format!("Failed to initialize embedding model: {}", e),
97
+ plugin_name: "embeddings".to_string(),
238
98
  })?;
239
99
 
240
- let leaked_model = LeakedModel::new(embedding_model);
241
- let arc_model = Arc::new(Mutex::new(leaked_model));
100
+ let arc_model = Arc::new(Mutex::new(embedding_model));
242
101
  cache.insert(model_key, Arc::clone(&arc_model));
243
102
 
244
103
  Ok(arc_model)
@@ -389,15 +248,12 @@ pub fn generate_embeddings_for_chunks(
389
248
  let texts: Vec<String> = chunks.iter().map(|chunk| chunk.content.clone()).collect();
390
249
 
391
250
  let embeddings_result = {
392
- let locked_model = model.lock().map_err(|e| crate::KreuzbergError::Plugin {
251
+ let mut locked_model = model.lock().map_err(|e| crate::KreuzbergError::Plugin {
393
252
  message: format!("Failed to acquire model lock: {}", e),
394
253
  plugin_name: "embeddings".to_string(),
395
254
  })?;
396
255
 
397
- #[allow(unsafe_code)]
398
- let model_mut = unsafe { locked_model.get_mut() };
399
-
400
- model_mut
256
+ locked_model
401
257
  .embed(texts, Some(config.batch_size))
402
258
  .map_err(|e| crate::KreuzbergError::Plugin {
403
259
  message: format!("Failed to generate embeddings: {}", e),
@@ -464,8 +320,4 @@ mod tests {
464
320
  assert_eq!(quality.chunk_size, 2000);
465
321
  assert_eq!(quality.overlap, 200);
466
322
  }
467
-
468
- #[cfg(feature = "embeddings")]
469
- #[test]
470
- fn test_lock_poisoning_recovery_semantics() {}
471
323
  }
@@ -60,7 +60,7 @@ pub type Result<T> = std::result::Result<T, KreuzbergError>;
60
60
  /// - `Cache` - Cache operation errors (non-fatal, can be ignored)
61
61
  /// - `ImageProcessing` - Image manipulation errors
62
62
  /// - `Serialization` - JSON/MessagePack serialization errors
63
- /// - `MissingDependency` - Missing optional dependencies (tesseract, etc.)
63
+ /// - `MissingDependency` - Missing optional dependencies (tesseract, pandoc, etc.)
64
64
  /// - `Plugin` - Plugin-specific errors
65
65
  /// - `LockPoisoned` - Mutex/RwLock poisoning (should not happen in normal operation)
66
66
  /// - `UnsupportedFormat` - Unsupported MIME type or file format
@@ -177,7 +177,7 @@ impl From<crate::pdf::error::PdfError> for KreuzbergError {
177
177
 
178
178
  macro_rules! error_constructor {
179
179
  ($name:ident, $variant:ident) => {
180
- pastey::paste! {
180
+ paste::paste! {
181
181
  #[doc = "Create a " $variant " error"]
182
182
  pub fn $name<S: Into<String>>(message: S) -> Self {
183
183
  Self::$variant {
@@ -3,7 +3,7 @@
3
3
  //! This module provides functions for extracting file lists and contents from archives.
4
4
 
5
5
  use crate::error::{KreuzbergError, Result};
6
- use sevenz_rust2::{ArchiveReader, Password};
6
+ use sevenz_rust::SevenZReader;
7
7
  use std::collections::HashMap;
8
8
  use std::io::{Cursor, Read};
9
9
  use tar::Archive as TarArchive;
@@ -39,7 +39,7 @@ pub fn extract_zip_metadata(bytes: &[u8]) -> Result<ArchiveMetadata> {
39
39
  let mut archive =
40
40
  ZipArchive::new(cursor).map_err(|e| KreuzbergError::parsing(format!("Failed to read ZIP archive: {}", e)))?;
41
41
 
42
- let mut file_list = Vec::with_capacity(archive.len());
42
+ let mut file_list = Vec::new();
43
43
  let mut total_size = 0u64;
44
44
 
45
45
  for i in 0..archive.len() {
@@ -71,8 +71,7 @@ pub fn extract_tar_metadata(bytes: &[u8]) -> Result<ArchiveMetadata> {
71
71
  let cursor = Cursor::new(bytes);
72
72
  let mut archive = TarArchive::new(cursor);
73
73
 
74
- let estimated_entries = bytes.len().saturating_div(512).max(16);
75
- let mut file_list = Vec::with_capacity(estimated_entries);
74
+ let mut file_list = Vec::new();
76
75
  let mut total_size = 0u64;
77
76
  let mut file_count = 0;
78
77
 
@@ -116,8 +115,7 @@ pub fn extract_zip_text_content(bytes: &[u8]) -> Result<HashMap<String, String>>
116
115
  let mut archive =
117
116
  ZipArchive::new(cursor).map_err(|e| KreuzbergError::parsing(format!("Failed to read ZIP archive: {}", e)))?;
118
117
 
119
- let estimated_text_files = archive.len().saturating_mul(3).saturating_div(10).max(2);
120
- let mut contents = HashMap::with_capacity(estimated_text_files);
118
+ let mut contents = HashMap::new();
121
119
  let text_extensions = [
122
120
  ".txt", ".md", ".json", ".xml", ".html", ".csv", ".log", ".yaml", ".toml",
123
121
  ];
@@ -130,8 +128,7 @@ pub fn extract_zip_text_content(bytes: &[u8]) -> Result<HashMap<String, String>>
130
128
  let path = file.name().to_string();
131
129
 
132
130
  if !file.is_dir() && text_extensions.iter().any(|ext| path.to_lowercase().ends_with(ext)) {
133
- let estimated_size = (file.size() as usize).min(10 * 1024 * 1024);
134
- let mut content = String::with_capacity(estimated_size);
131
+ let mut content = String::new();
135
132
  if file.read_to_string(&mut content).is_ok() {
136
133
  contents.insert(path, content);
137
134
  }
@@ -148,8 +145,7 @@ pub fn extract_tar_text_content(bytes: &[u8]) -> Result<HashMap<String, String>>
148
145
  let cursor = Cursor::new(bytes);
149
146
  let mut archive = TarArchive::new(cursor);
150
147
 
151
- let estimated_text_files = bytes.len().saturating_div(1024 * 10).min(100);
152
- let mut contents = HashMap::with_capacity(estimated_text_files.max(2));
148
+ let mut contents = HashMap::new();
153
149
  let text_extensions = [
154
150
  ".txt", ".md", ".json", ".xml", ".html", ".csv", ".log", ".yaml", ".toml",
155
151
  ];
@@ -170,8 +166,7 @@ pub fn extract_tar_text_content(bytes: &[u8]) -> Result<HashMap<String, String>>
170
166
 
171
167
  if !entry.header().entry_type().is_dir() && text_extensions.iter().any(|ext| path.to_lowercase().ends_with(ext))
172
168
  {
173
- let estimated_size = (entry.size().min(10 * 1024 * 1024)) as usize;
174
- let mut content = String::with_capacity(estimated_size);
169
+ let mut content = String::new();
175
170
  if entry.read_to_string(&mut content).is_ok() {
176
171
  contents.insert(path, content);
177
172
  }
@@ -184,7 +179,7 @@ pub fn extract_tar_text_content(bytes: &[u8]) -> Result<HashMap<String, String>>
184
179
  /// Extract metadata from a 7z archive.
185
180
  pub fn extract_7z_metadata(bytes: &[u8]) -> Result<ArchiveMetadata> {
186
181
  let cursor = Cursor::new(bytes);
187
- let archive = ArchiveReader::new(cursor, Password::empty())
182
+ let archive = SevenZReader::new(cursor, bytes.len() as u64, "".into())
188
183
  .map_err(|e| KreuzbergError::parsing(format!("Failed to read 7z archive: {}", e)))?;
189
184
 
190
185
  let mut file_list = Vec::new();
@@ -217,7 +212,7 @@ pub fn extract_7z_metadata(bytes: &[u8]) -> Result<ArchiveMetadata> {
217
212
  /// Only extracts files with common text extensions: .txt, .md, .json, .xml, .html, .csv, .log
218
213
  pub fn extract_7z_text_content(bytes: &[u8]) -> Result<HashMap<String, String>> {
219
214
  let cursor = Cursor::new(bytes);
220
- let mut archive = ArchiveReader::new(cursor, Password::empty())
215
+ let mut archive = SevenZReader::new(cursor, bytes.len() as u64, "".into())
221
216
  .map_err(|e| KreuzbergError::parsing(format!("Failed to read 7z archive: {}", e)))?;
222
217
 
223
218
  let mut contents = HashMap::new();
@@ -464,26 +459,26 @@ mod tests {
464
459
 
465
460
  #[test]
466
461
  fn test_extract_7z_metadata_with_files() {
467
- use sevenz_rust2::{ArchiveEntry, ArchiveWriter};
462
+ use sevenz_rust::SevenZWriter;
468
463
 
469
- let cursor = {
470
- let cursor = Cursor::new(Vec::new());
471
- let mut sz = ArchiveWriter::new(cursor).unwrap();
464
+ let mut cursor = Cursor::new(Vec::new());
465
+ {
466
+ let mut sz = SevenZWriter::new(&mut cursor).unwrap();
472
467
 
473
468
  sz.push_archive_entry(
474
- ArchiveEntry::new_file("test.txt"),
469
+ sevenz_rust::SevenZArchiveEntry::from_path("test.txt", "test.txt".to_string()),
475
470
  Some(Cursor::new(b"Hello 7z!".to_vec())),
476
471
  )
477
472
  .unwrap();
478
473
 
479
474
  sz.push_archive_entry(
480
- ArchiveEntry::new_file("data.json"),
475
+ sevenz_rust::SevenZArchiveEntry::from_path("data.json", "data.json".to_string()),
481
476
  Some(Cursor::new(b"{\"key\":\"value\"}".to_vec())),
482
477
  )
483
478
  .unwrap();
484
479
 
485
- sz.finish().unwrap()
486
- };
480
+ sz.finish().unwrap();
481
+ }
487
482
 
488
483
  let bytes = cursor.into_inner();
489
484
  let metadata = extract_7z_metadata(&bytes).unwrap();
@@ -839,26 +834,26 @@ mod tests {
839
834
 
840
835
  #[test]
841
836
  fn test_extract_7z_text_content() {
842
- use sevenz_rust2::{ArchiveEntry, ArchiveWriter};
837
+ use sevenz_rust::SevenZWriter;
843
838
 
844
- let cursor = {
845
- let cursor = Cursor::new(Vec::new());
846
- let mut sz = ArchiveWriter::new(cursor).unwrap();
839
+ let mut cursor = Cursor::new(Vec::new());
840
+ {
841
+ let mut sz = SevenZWriter::new(&mut cursor).unwrap();
847
842
 
848
843
  sz.push_archive_entry(
849
- ArchiveEntry::new_file("test.txt"),
844
+ sevenz_rust::SevenZArchiveEntry::from_path("test.txt", "test.txt".to_string()),
850
845
  Some(Cursor::new(b"Hello 7z text!".to_vec())),
851
846
  )
852
847
  .unwrap();
853
848
 
854
849
  sz.push_archive_entry(
855
- ArchiveEntry::new_file("readme.md"),
850
+ sevenz_rust::SevenZArchiveEntry::from_path("readme.md", "readme.md".to_string()),
856
851
  Some(Cursor::new(b"# 7z README".to_vec())),
857
852
  )
858
853
  .unwrap();
859
854
 
860
- sz.finish().unwrap()
861
- };
855
+ sz.finish().unwrap();
856
+ }
862
857
 
863
858
  let bytes = cursor.into_inner();
864
859
  let contents = extract_7z_text_content(&bytes).unwrap();
@@ -870,13 +865,13 @@ mod tests {
870
865
 
871
866
  #[test]
872
867
  fn test_extract_7z_empty_archive() {
873
- use sevenz_rust2::ArchiveWriter;
868
+ use sevenz_rust::SevenZWriter;
874
869
 
875
- let cursor = {
876
- let cursor = Cursor::new(Vec::new());
877
- let sz = ArchiveWriter::new(cursor).unwrap();
878
- sz.finish().unwrap()
879
- };
870
+ let mut cursor = Cursor::new(Vec::new());
871
+ {
872
+ let sz = SevenZWriter::new(&mut cursor).unwrap();
873
+ sz.finish().unwrap();
874
+ }
880
875
 
881
876
  let bytes = cursor.into_inner();
882
877
  let metadata = extract_7z_metadata(&bytes).unwrap();