kreuzberg 4.0.0.pre.rc.29 → 4.0.0.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (321) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +0 -6
  3. data/.rubocop.yaml +534 -1
  4. data/Gemfile +2 -1
  5. data/Gemfile.lock +28 -116
  6. data/README.md +269 -629
  7. data/Rakefile +0 -9
  8. data/Steepfile +4 -8
  9. data/examples/async_patterns.rb +58 -1
  10. data/ext/kreuzberg_rb/extconf.rb +5 -35
  11. data/ext/kreuzberg_rb/native/Cargo.toml +16 -55
  12. data/ext/kreuzberg_rb/native/build.rs +14 -12
  13. data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
  14. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
  15. data/ext/kreuzberg_rb/native/include/strings.h +2 -2
  16. data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
  17. data/ext/kreuzberg_rb/native/src/lib.rs +34 -897
  18. data/extconf.rb +6 -38
  19. data/kreuzberg.gemspec +20 -114
  20. data/lib/kreuzberg/api_proxy.rb +18 -2
  21. data/lib/kreuzberg/cache_api.rb +0 -22
  22. data/lib/kreuzberg/cli.rb +10 -2
  23. data/lib/kreuzberg/cli_proxy.rb +10 -0
  24. data/lib/kreuzberg/config.rb +22 -274
  25. data/lib/kreuzberg/errors.rb +7 -73
  26. data/lib/kreuzberg/extraction_api.rb +8 -237
  27. data/lib/kreuzberg/mcp_proxy.rb +11 -2
  28. data/lib/kreuzberg/ocr_backend_protocol.rb +73 -0
  29. data/lib/kreuzberg/post_processor_protocol.rb +71 -0
  30. data/lib/kreuzberg/result.rb +33 -151
  31. data/lib/kreuzberg/setup_lib_path.rb +2 -22
  32. data/lib/kreuzberg/validator_protocol.rb +73 -0
  33. data/lib/kreuzberg/version.rb +1 -1
  34. data/lib/kreuzberg.rb +13 -27
  35. data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
  36. data/sig/kreuzberg.rbs +12 -105
  37. data/spec/binding/cache_spec.rb +22 -22
  38. data/spec/binding/cli_proxy_spec.rb +4 -2
  39. data/spec/binding/cli_spec.rb +11 -12
  40. data/spec/binding/config_spec.rb +0 -74
  41. data/spec/binding/config_validation_spec.rb +6 -100
  42. data/spec/binding/error_handling_spec.rb +97 -283
  43. data/spec/binding/plugins/ocr_backend_spec.rb +8 -8
  44. data/spec/binding/plugins/postprocessor_spec.rb +11 -11
  45. data/spec/binding/plugins/validator_spec.rb +13 -12
  46. data/spec/examples.txt +104 -0
  47. data/spec/fixtures/config.toml +1 -0
  48. data/spec/fixtures/config.yaml +1 -0
  49. data/spec/fixtures/invalid_config.toml +1 -0
  50. data/spec/smoke/package_spec.rb +3 -2
  51. data/spec/spec_helper.rb +3 -1
  52. data/vendor/kreuzberg/Cargo.toml +67 -192
  53. data/vendor/kreuzberg/README.md +9 -97
  54. data/vendor/kreuzberg/build.rs +194 -516
  55. data/vendor/kreuzberg/src/api/handlers.rs +9 -130
  56. data/vendor/kreuzberg/src/api/mod.rs +3 -18
  57. data/vendor/kreuzberg/src/api/server.rs +71 -236
  58. data/vendor/kreuzberg/src/api/types.rs +7 -43
  59. data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
  60. data/vendor/kreuzberg/src/cache/mod.rs +3 -27
  61. data/vendor/kreuzberg/src/chunking/mod.rs +79 -1705
  62. data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
  63. data/vendor/kreuzberg/src/core/config.rs +23 -905
  64. data/vendor/kreuzberg/src/core/extractor.rs +106 -403
  65. data/vendor/kreuzberg/src/core/io.rs +2 -4
  66. data/vendor/kreuzberg/src/core/mime.rs +12 -2
  67. data/vendor/kreuzberg/src/core/mod.rs +3 -22
  68. data/vendor/kreuzberg/src/core/pipeline.rs +78 -395
  69. data/vendor/kreuzberg/src/embeddings.rs +21 -169
  70. data/vendor/kreuzberg/src/error.rs +2 -2
  71. data/vendor/kreuzberg/src/extraction/archive.rs +31 -36
  72. data/vendor/kreuzberg/src/extraction/docx.rs +1 -365
  73. data/vendor/kreuzberg/src/extraction/email.rs +11 -12
  74. data/vendor/kreuzberg/src/extraction/excel.rs +129 -138
  75. data/vendor/kreuzberg/src/extraction/html.rs +170 -1447
  76. data/vendor/kreuzberg/src/extraction/image.rs +14 -138
  77. data/vendor/kreuzberg/src/extraction/libreoffice.rs +3 -13
  78. data/vendor/kreuzberg/src/extraction/mod.rs +5 -21
  79. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
  80. data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
  81. data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
  82. data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
  83. data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
  84. data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
  85. data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
  86. data/vendor/kreuzberg/src/extraction/pptx.rs +94 -196
  87. data/vendor/kreuzberg/src/extraction/structured.rs +4 -5
  88. data/vendor/kreuzberg/src/extraction/table.rs +1 -2
  89. data/vendor/kreuzberg/src/extraction/text.rs +10 -18
  90. data/vendor/kreuzberg/src/extractors/archive.rs +0 -22
  91. data/vendor/kreuzberg/src/extractors/docx.rs +148 -69
  92. data/vendor/kreuzberg/src/extractors/email.rs +9 -37
  93. data/vendor/kreuzberg/src/extractors/excel.rs +40 -81
  94. data/vendor/kreuzberg/src/extractors/html.rs +173 -182
  95. data/vendor/kreuzberg/src/extractors/image.rs +8 -32
  96. data/vendor/kreuzberg/src/extractors/mod.rs +10 -171
  97. data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
  98. data/vendor/kreuzberg/src/extractors/pdf.rs +64 -329
  99. data/vendor/kreuzberg/src/extractors/pptx.rs +34 -79
  100. data/vendor/kreuzberg/src/extractors/structured.rs +0 -16
  101. data/vendor/kreuzberg/src/extractors/text.rs +7 -30
  102. data/vendor/kreuzberg/src/extractors/xml.rs +8 -27
  103. data/vendor/kreuzberg/src/keywords/processor.rs +1 -9
  104. data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
  105. data/vendor/kreuzberg/src/language_detection/mod.rs +51 -94
  106. data/vendor/kreuzberg/src/lib.rs +5 -17
  107. data/vendor/kreuzberg/src/mcp/mod.rs +1 -4
  108. data/vendor/kreuzberg/src/mcp/server.rs +21 -145
  109. data/vendor/kreuzberg/src/ocr/mod.rs +0 -2
  110. data/vendor/kreuzberg/src/ocr/processor.rs +8 -19
  111. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +0 -2
  112. data/vendor/kreuzberg/src/pdf/error.rs +1 -93
  113. data/vendor/kreuzberg/src/pdf/metadata.rs +100 -263
  114. data/vendor/kreuzberg/src/pdf/mod.rs +2 -33
  115. data/vendor/kreuzberg/src/pdf/rendering.rs +12 -12
  116. data/vendor/kreuzberg/src/pdf/table.rs +64 -61
  117. data/vendor/kreuzberg/src/pdf/text.rs +24 -416
  118. data/vendor/kreuzberg/src/plugins/extractor.rs +8 -40
  119. data/vendor/kreuzberg/src/plugins/mod.rs +0 -3
  120. data/vendor/kreuzberg/src/plugins/ocr.rs +14 -22
  121. data/vendor/kreuzberg/src/plugins/processor.rs +1 -10
  122. data/vendor/kreuzberg/src/plugins/registry.rs +0 -15
  123. data/vendor/kreuzberg/src/plugins/validator.rs +8 -20
  124. data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
  125. data/vendor/kreuzberg/src/text/mod.rs +0 -8
  126. data/vendor/kreuzberg/src/text/quality.rs +15 -28
  127. data/vendor/kreuzberg/src/text/string_utils.rs +10 -22
  128. data/vendor/kreuzberg/src/text/token_reduction/core.rs +50 -86
  129. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +16 -37
  130. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +1 -2
  131. data/vendor/kreuzberg/src/types.rs +67 -907
  132. data/vendor/kreuzberg/src/utils/mod.rs +0 -14
  133. data/vendor/kreuzberg/src/utils/quality.rs +3 -12
  134. data/vendor/kreuzberg/tests/api_tests.rs +0 -506
  135. data/vendor/kreuzberg/tests/archive_integration.rs +0 -2
  136. data/vendor/kreuzberg/tests/batch_orchestration.rs +12 -57
  137. data/vendor/kreuzberg/tests/batch_processing.rs +8 -32
  138. data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
  139. data/vendor/kreuzberg/tests/concurrency_stress.rs +8 -40
  140. data/vendor/kreuzberg/tests/config_features.rs +1 -33
  141. data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -16
  142. data/vendor/kreuzberg/tests/core_integration.rs +9 -35
  143. data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
  144. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
  145. data/vendor/kreuzberg/tests/email_integration.rs +1 -3
  146. data/vendor/kreuzberg/tests/error_handling.rs +34 -43
  147. data/vendor/kreuzberg/tests/format_integration.rs +1 -7
  148. data/vendor/kreuzberg/tests/helpers/mod.rs +0 -60
  149. data/vendor/kreuzberg/tests/image_integration.rs +0 -2
  150. data/vendor/kreuzberg/tests/mime_detection.rs +16 -17
  151. data/vendor/kreuzberg/tests/ocr_configuration.rs +0 -4
  152. data/vendor/kreuzberg/tests/ocr_errors.rs +0 -22
  153. data/vendor/kreuzberg/tests/ocr_quality.rs +0 -2
  154. data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
  155. data/vendor/kreuzberg/tests/pdf_integration.rs +0 -2
  156. data/vendor/kreuzberg/tests/pipeline_integration.rs +2 -36
  157. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +0 -5
  158. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -17
  159. data/vendor/kreuzberg/tests/plugin_system.rs +0 -6
  160. data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -2
  161. data/vendor/kreuzberg/tests/security_validation.rs +1 -13
  162. data/vendor/kreuzberg/tests/test_fastembed.rs +23 -45
  163. metadata +25 -171
  164. data/.rubocop.yml +0 -543
  165. data/ext/kreuzberg_rb/native/.cargo/config.toml +0 -23
  166. data/ext/kreuzberg_rb/native/Cargo.lock +0 -7619
  167. data/lib/kreuzberg/error_context.rb +0 -136
  168. data/lib/kreuzberg/types.rb +0 -170
  169. data/lib/libpdfium.so +0 -0
  170. data/spec/binding/async_operations_spec.rb +0 -473
  171. data/spec/binding/batch_operations_spec.rb +0 -595
  172. data/spec/binding/batch_spec.rb +0 -359
  173. data/spec/binding/config_result_spec.rb +0 -377
  174. data/spec/binding/embeddings_spec.rb +0 -816
  175. data/spec/binding/error_recovery_spec.rb +0 -488
  176. data/spec/binding/font_config_spec.rb +0 -220
  177. data/spec/binding/images_spec.rb +0 -738
  178. data/spec/binding/keywords_extraction_spec.rb +0 -600
  179. data/spec/binding/metadata_types_spec.rb +0 -1228
  180. data/spec/binding/pages_extraction_spec.rb +0 -471
  181. data/spec/binding/tables_spec.rb +0 -641
  182. data/spec/unit/config/chunking_config_spec.rb +0 -213
  183. data/spec/unit/config/embedding_config_spec.rb +0 -343
  184. data/spec/unit/config/extraction_config_spec.rb +0 -438
  185. data/spec/unit/config/font_config_spec.rb +0 -285
  186. data/spec/unit/config/hierarchy_config_spec.rb +0 -314
  187. data/spec/unit/config/image_extraction_config_spec.rb +0 -209
  188. data/spec/unit/config/image_preprocessing_config_spec.rb +0 -249
  189. data/spec/unit/config/keyword_config_spec.rb +0 -229
  190. data/spec/unit/config/language_detection_config_spec.rb +0 -258
  191. data/spec/unit/config/ocr_config_spec.rb +0 -171
  192. data/spec/unit/config/page_config_spec.rb +0 -221
  193. data/spec/unit/config/pdf_config_spec.rb +0 -267
  194. data/spec/unit/config/postprocessor_config_spec.rb +0 -290
  195. data/spec/unit/config/tesseract_config_spec.rb +0 -181
  196. data/spec/unit/config/token_reduction_config_spec.rb +0 -251
  197. data/test/metadata_types_test.rb +0 -959
  198. data/vendor/Cargo.toml +0 -61
  199. data/vendor/kreuzberg/examples/bench_fixes.rs +0 -71
  200. data/vendor/kreuzberg/examples/test_pdfium_fork.rs +0 -62
  201. data/vendor/kreuzberg/src/chunking/processor.rs +0 -219
  202. data/vendor/kreuzberg/src/core/batch_optimizations.rs +0 -385
  203. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  204. data/vendor/kreuzberg/src/core/formats.rs +0 -235
  205. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  206. data/vendor/kreuzberg/src/extraction/capacity.rs +0 -263
  207. data/vendor/kreuzberg/src/extraction/markdown.rs +0 -216
  208. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -284
  209. data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -470
  210. data/vendor/kreuzberg/src/extractors/docbook.rs +0 -504
  211. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  212. data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -492
  213. data/vendor/kreuzberg/src/extractors/jats.rs +0 -1054
  214. data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -368
  215. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  216. data/vendor/kreuzberg/src/extractors/markdown.rs +0 -701
  217. data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
  218. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  219. data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -529
  220. data/vendor/kreuzberg/src/extractors/rst.rs +0 -577
  221. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  222. data/vendor/kreuzberg/src/extractors/security.rs +0 -484
  223. data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
  224. data/vendor/kreuzberg/src/extractors/typst.rs +0 -651
  225. data/vendor/kreuzberg/src/language_detection/processor.rs +0 -218
  226. data/vendor/kreuzberg/src/ocr/language_registry.rs +0 -520
  227. data/vendor/kreuzberg/src/panic_context.rs +0 -154
  228. data/vendor/kreuzberg/src/pdf/bindings.rs +0 -306
  229. data/vendor/kreuzberg/src/pdf/bundled.rs +0 -408
  230. data/vendor/kreuzberg/src/pdf/fonts.rs +0 -358
  231. data/vendor/kreuzberg/src/pdf/hierarchy.rs +0 -903
  232. data/vendor/kreuzberg/src/text/quality_processor.rs +0 -231
  233. data/vendor/kreuzberg/src/text/utf8_validation.rs +0 -193
  234. data/vendor/kreuzberg/src/utils/pool.rs +0 -503
  235. data/vendor/kreuzberg/src/utils/pool_sizing.rs +0 -364
  236. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -761
  237. data/vendor/kreuzberg/tests/api_embed.rs +0 -360
  238. data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
  239. data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +0 -471
  240. data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +0 -289
  241. data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +0 -154
  242. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
  243. data/vendor/kreuzberg/tests/config_integration_test.rs +0 -753
  244. data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +0 -294
  245. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -500
  246. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
  247. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
  248. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
  249. data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
  250. data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
  251. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
  252. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
  253. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
  254. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
  255. data/vendor/kreuzberg/tests/ocr_language_registry.rs +0 -191
  256. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -674
  257. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
  258. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
  259. data/vendor/kreuzberg/tests/page_markers.rs +0 -297
  260. data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +0 -301
  261. data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +0 -589
  262. data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +0 -301
  263. data/vendor/kreuzberg/tests/pdf_text_merging.rs +0 -475
  264. data/vendor/kreuzberg/tests/pdfium_linking.rs +0 -340
  265. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -694
  266. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -775
  267. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1260
  268. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -648
  269. data/vendor/kreuzberg-ffi/Cargo.toml +0 -67
  270. data/vendor/kreuzberg-ffi/README.md +0 -851
  271. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +0 -227
  272. data/vendor/kreuzberg-ffi/build.rs +0 -168
  273. data/vendor/kreuzberg-ffi/cbindgen.toml +0 -37
  274. data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +0 -12
  275. data/vendor/kreuzberg-ffi/kreuzberg.h +0 -3012
  276. data/vendor/kreuzberg-ffi/src/batch_streaming.rs +0 -588
  277. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
  278. data/vendor/kreuzberg-ffi/src/error.rs +0 -901
  279. data/vendor/kreuzberg-ffi/src/extraction.rs +0 -555
  280. data/vendor/kreuzberg-ffi/src/helpers.rs +0 -879
  281. data/vendor/kreuzberg-ffi/src/lib.rs +0 -977
  282. data/vendor/kreuzberg-ffi/src/memory.rs +0 -493
  283. data/vendor/kreuzberg-ffi/src/mime.rs +0 -329
  284. data/vendor/kreuzberg-ffi/src/panic_shield.rs +0 -265
  285. data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +0 -442
  286. data/vendor/kreuzberg-ffi/src/plugins/mod.rs +0 -14
  287. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +0 -628
  288. data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +0 -438
  289. data/vendor/kreuzberg-ffi/src/plugins/validator.rs +0 -329
  290. data/vendor/kreuzberg-ffi/src/result.rs +0 -510
  291. data/vendor/kreuzberg-ffi/src/result_pool.rs +0 -639
  292. data/vendor/kreuzberg-ffi/src/result_view.rs +0 -773
  293. data/vendor/kreuzberg-ffi/src/string_intern.rs +0 -568
  294. data/vendor/kreuzberg-ffi/src/types.rs +0 -363
  295. data/vendor/kreuzberg-ffi/src/util.rs +0 -210
  296. data/vendor/kreuzberg-ffi/src/validation.rs +0 -848
  297. data/vendor/kreuzberg-ffi/tests.disabled/README.md +0 -48
  298. data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +0 -299
  299. data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +0 -346
  300. data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +0 -232
  301. data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +0 -470
  302. data/vendor/kreuzberg-tesseract/.commitlintrc.json +0 -13
  303. data/vendor/kreuzberg-tesseract/.crate-ignore +0 -2
  304. data/vendor/kreuzberg-tesseract/Cargo.lock +0 -2933
  305. data/vendor/kreuzberg-tesseract/Cargo.toml +0 -57
  306. data/vendor/kreuzberg-tesseract/LICENSE +0 -22
  307. data/vendor/kreuzberg-tesseract/README.md +0 -399
  308. data/vendor/kreuzberg-tesseract/build.rs +0 -1127
  309. data/vendor/kreuzberg-tesseract/patches/README.md +0 -71
  310. data/vendor/kreuzberg-tesseract/patches/tesseract.diff +0 -199
  311. data/vendor/kreuzberg-tesseract/src/api.rs +0 -1371
  312. data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +0 -77
  313. data/vendor/kreuzberg-tesseract/src/enums.rs +0 -297
  314. data/vendor/kreuzberg-tesseract/src/error.rs +0 -81
  315. data/vendor/kreuzberg-tesseract/src/lib.rs +0 -145
  316. data/vendor/kreuzberg-tesseract/src/monitor.rs +0 -57
  317. data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +0 -197
  318. data/vendor/kreuzberg-tesseract/src/page_iterator.rs +0 -253
  319. data/vendor/kreuzberg-tesseract/src/result_iterator.rs +0 -286
  320. data/vendor/kreuzberg-tesseract/src/result_renderer.rs +0 -183
  321. data/vendor/kreuzberg-tesseract/tests/integration_test.rs +0 -211
@@ -1,5 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ # Configuration validation tests
4
+
3
5
  RSpec.describe 'Configuration Validation' do
4
6
  describe Kreuzberg::Config::Extraction do
5
7
  it 'accepts all valid parameters' do
@@ -234,102 +236,6 @@ RSpec.describe 'Configuration Validation' do
234
236
  end
235
237
  end
236
238
 
237
- describe Kreuzberg::Config::ImagePreprocessing do
238
- it 'has sensible defaults' do
239
- config = described_class.new
240
- expect(config.target_dpi).to eq(300)
241
- expect(config.auto_rotate).to be true
242
- expect(config.deskew).to be true
243
- expect(config.denoise).to be false
244
- expect(config.contrast_enhance).to be true
245
- expect(config.binarization_method).to eq('otsu')
246
- expect(config.invert_colors).to be false
247
- end
248
-
249
- it 'accepts custom values' do
250
- config = described_class.new(
251
- target_dpi: 600,
252
- auto_rotate: false,
253
- deskew: false,
254
- denoise: true,
255
- contrast_enhance: false,
256
- binarization_method: 'sauvola',
257
- invert_colors: true
258
- )
259
- expect(config.target_dpi).to eq(600)
260
- expect(config.auto_rotate).to be false
261
- expect(config.deskew).to be false
262
- expect(config.denoise).to be true
263
- expect(config.contrast_enhance).to be false
264
- expect(config.binarization_method).to eq('sauvola')
265
- expect(config.invert_colors).to be true
266
- end
267
-
268
- it 'validates binarization method via FFI' do
269
- expect { described_class.new(binarization_method: 'otsu') }.not_to raise_error
270
- expect { described_class.new(binarization_method: 'adaptive') }.not_to raise_error
271
- expect { described_class.new(binarization_method: 'sauvola') }.not_to raise_error
272
- end
273
-
274
- it 'rejects invalid binarization methods' do
275
- expect do
276
- described_class.new(binarization_method: 'invalid_method')
277
- end.to raise_error(ArgumentError, /Invalid binarization_method/)
278
- end
279
-
280
- it 'converts to hash correctly' do
281
- config = described_class.new(
282
- target_dpi: 500,
283
- binarization_method: 'adaptive'
284
- )
285
- hash = config.to_h
286
- expect(hash[:target_dpi]).to eq(500)
287
- expect(hash[:binarization_method]).to eq('adaptive')
288
- expect(hash[:auto_rotate]).to be true
289
- end
290
- end
291
-
292
- describe Kreuzberg::Config::TokenReduction do
293
- it 'has sensible defaults' do
294
- config = described_class.new
295
- expect(config.mode).to eq('off')
296
- expect(config.preserve_important_words).to be true
297
- end
298
-
299
- it 'accepts custom values' do
300
- config = described_class.new(
301
- mode: 'aggressive',
302
- preserve_important_words: false
303
- )
304
- expect(config.mode).to eq('aggressive')
305
- expect(config.preserve_important_words).to be false
306
- end
307
-
308
- it 'validates token reduction levels via FFI' do
309
- expect { described_class.new(mode: 'off') }.not_to raise_error
310
- expect { described_class.new(mode: 'light') }.not_to raise_error
311
- expect { described_class.new(mode: 'moderate') }.not_to raise_error
312
- expect { described_class.new(mode: 'aggressive') }.not_to raise_error
313
- expect { described_class.new(mode: 'maximum') }.not_to raise_error
314
- end
315
-
316
- it 'rejects invalid token reduction modes' do
317
- expect do
318
- described_class.new(mode: 'extreme')
319
- end.to raise_error(ArgumentError, /Invalid token reduction mode/)
320
- end
321
-
322
- it 'converts to hash correctly' do
323
- config = described_class.new(
324
- mode: 'light',
325
- preserve_important_words: true
326
- )
327
- hash = config.to_h
328
- expect(hash[:mode]).to eq('light')
329
- expect(hash[:preserve_important_words]).to be true
330
- end
331
- end
332
-
333
239
  describe 'config usage in extraction' do
334
240
  it 'works with OCR config' do
335
241
  path = create_test_file('OCR config test')
@@ -337,7 +243,7 @@ RSpec.describe 'Configuration Validation' do
337
243
  ocr: Kreuzberg::Config::OCR.new(backend: 'tesseract', language: 'eng')
338
244
  )
339
245
 
340
- result = Kreuzberg.extract_file_sync(path: path, config: config)
246
+ result = Kreuzberg.extract_file_sync(path, config: config)
341
247
  expect(result).to be_a(Kreuzberg::Result)
342
248
  end
343
249
 
@@ -347,7 +253,7 @@ RSpec.describe 'Configuration Validation' do
347
253
  chunking: Kreuzberg::Config::Chunking.new(max_chars: 50)
348
254
  )
349
255
 
350
- result = Kreuzberg.extract_file_sync(path: path, config: config)
256
+ result = Kreuzberg.extract_file_sync(path, config: config)
351
257
  expect(result).to be_a(Kreuzberg::Result)
352
258
  end
353
259
 
@@ -357,7 +263,7 @@ RSpec.describe 'Configuration Validation' do
357
263
  language_detection: Kreuzberg::Config::LanguageDetection.new(enabled: true)
358
264
  )
359
265
 
360
- result = Kreuzberg.extract_file_sync(path: path, config: config)
266
+ result = Kreuzberg.extract_file_sync(path, config: config)
361
267
  expect(result).to be_a(Kreuzberg::Result)
362
268
  end
363
269
 
@@ -370,7 +276,7 @@ RSpec.describe 'Configuration Validation' do
370
276
  language_detection: { enabled: false }
371
277
  )
372
278
 
373
- result = Kreuzberg.extract_file_sync(path: path, config: config)
279
+ result = Kreuzberg.extract_file_sync(path, config: config)
374
280
  expect(result).to be_a(Kreuzberg::Result)
375
281
  end
376
282
  end
@@ -1,5 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ # Error handling and exception mapping tests
4
+
3
5
  RSpec.describe 'Error Handling' do
4
6
  let(:nested_ocr_result) do
5
7
  {
@@ -32,362 +34,163 @@ RSpec.describe 'Error Handling' do
32
34
  }
33
35
  end
34
36
 
35
- describe 'invalid configuration handling' do
36
- it 'raises error for negative max_chars in chunking' do
37
- # rubocop:disable Style/MultilineBlockChain
38
- expect do
39
- Kreuzberg::Config::Extraction.new(
40
- chunking: Kreuzberg::Config::Chunking.new(max_chars: -100)
41
- )
42
- end.to raise_error do |error|
43
- expect(error).to be_a(StandardError)
44
- expect(error.message.downcase).to match(/negative|invalid|positive|max_chars/)
45
- end
46
- # rubocop:enable Style/MultilineBlockChain
47
- end
48
-
49
- it 'raises error for negative max_overlap in chunking' do
50
- # rubocop:disable Style/MultilineBlockChain
51
- expect do
52
- Kreuzberg::Config::Chunking.new(max_overlap: -50)
53
- end.to raise_error do |error|
54
- expect(error).to be_a(StandardError)
55
- expect(error.message.downcase).to match(/negative|invalid|overlap/)
56
- end
57
- # rubocop:enable Style/MultilineBlockChain
58
- end
59
-
60
- it 'raises ArgumentError for invalid OCR config type' do
61
- # rubocop:disable Style/MultilineBlockChain
62
- expect do
63
- Kreuzberg::Config::Extraction.new(ocr: 'invalid_string')
64
- end.to raise_error(ArgumentError) do |error|
65
- expect(error.message).to include('Expected')
66
- expect(error.message).to include('OCR')
67
- end
68
- # rubocop:enable Style/MultilineBlockChain
69
- end
70
-
71
- it 'raises ArgumentError for invalid chunking config type' do
37
+ describe 'file not found errors' do
38
+ it 'raises error for non-existent file' do
72
39
  expect do
73
- Kreuzberg::Config::Extraction.new(chunking: 123)
74
- end.to raise_error(ArgumentError)
75
- end
76
-
77
- it 'raises ArgumentError for invalid language_detection config' do
78
- expect do
79
- Kreuzberg::Config::Extraction.new(language_detection: [])
80
- end.to raise_error(ArgumentError)
81
- end
82
-
83
- it 'raises ArgumentError for invalid pdf_options config' do
84
- expect do
85
- Kreuzberg::Config::Extraction.new(pdf_options: 'invalid_string')
86
- end.to raise_error(ArgumentError)
87
- end
88
-
89
- it 'provides descriptive error messages for config validation' do
90
- error = nil
91
- begin
92
- Kreuzberg::Config::Extraction.new(ocr: 12_345)
93
- rescue ArgumentError => e
94
- error = e
95
- end
96
-
97
- expect(error).not_to be_nil
98
- expect(error.message).to be_a(String)
99
- expect(error.message).not_to be_empty
100
- end
101
- end
102
-
103
- describe 'file not found and corrupted files' do
104
- it 'raises error for non-existent file with meaningful message' do
105
- # rubocop:disable Style/MultilineBlockChain
106
- expect do
107
- Kreuzberg.extract_file_sync(path: '/nonexistent/path/file.txt')
108
- end.to raise_error do |error|
109
- expect(error).to be_a(StandardError)
110
- expect(error.message).not_to be_empty
111
- end
112
- # rubocop:enable Style/MultilineBlockChain
40
+ Kreuzberg.extract_file_sync('/nonexistent/path/file.txt')
41
+ end.to raise_error(StandardError)
113
42
  end
114
43
 
115
- it 'raises error for empty file path' do
44
+ it 'raises error for empty path' do
116
45
  expect do
117
- Kreuzberg.extract_file_sync(path: '')
46
+ Kreuzberg.extract_file_sync('')
118
47
  end.to raise_error(StandardError)
119
48
  end
120
49
 
121
- it 'raises error for nil file path' do
50
+ it 'raises error for nil path' do
122
51
  expect do
123
- Kreuzberg.extract_file_sync(path: nil)
52
+ Kreuzberg.extract_file_sync(nil)
124
53
  end.to raise_error(StandardError)
125
54
  end
55
+ end
126
56
 
127
- it 'handles corrupted file gracefully' do
128
- # Create a file with binary garbage that is not a valid document
129
- corrupted_path = create_test_file("\x00\x01\x02\xFF\xFE\xFD", filename: 'corrupted.bin')
57
+ describe 'invalid MIME type handling' do
58
+ it 'handles unknown MIME types' do
59
+ path = create_test_file('Unknown MIME')
130
60
 
61
+ # Implementation may either handle gracefully or raise error for unknown MIME types
131
62
  begin
132
- result = Kreuzberg.extract_file_sync(path: corrupted_path, mime_type: 'application/octet-stream')
133
- # May succeed with empty content or raise error - both acceptable
63
+ result = Kreuzberg.extract_file_sync(path, mime_type: 'application/x-unknown-type')
134
64
  expect(result).to be_a(Kreuzberg::Result)
135
- rescue Kreuzberg::Errors::ParsingError => e
136
- expect(e).to be_a(Kreuzberg::Errors::ParsingError)
137
- expect(e.message).not_to be_empty
138
65
  rescue StandardError => e
139
66
  expect(e).to be_a(StandardError)
140
67
  end
141
68
  end
142
69
  end
143
70
 
144
- describe 'invalid MIME type handling' do
145
- it 'gracefully handles unknown MIME types' do
146
- path = create_test_file('Content with unknown type')
147
-
148
- result_or_error = nil
149
- begin
150
- result_or_error = Kreuzberg.extract_file_sync(path, mime_type: 'application/x-custom-unknown-format')
151
- rescue Kreuzberg::Errors::UnsupportedFormatError, StandardError => e
152
- result_or_error = e
153
- end
154
-
155
- if result_or_error.is_a?(Kreuzberg::Result)
156
- expect(result_or_error).to be_a(Kreuzberg::Result)
157
- else
158
- expect(result_or_error).to be_a(StandardError)
159
- expect(result_or_error.message).not_to be_empty
160
- end
71
+ describe 'invalid configuration' do
72
+ it 'raises error for invalid ocr config' do
73
+ expect do
74
+ Kreuzberg::Config::Extraction.new(ocr: 'invalid')
75
+ end.to raise_error(ArgumentError)
161
76
  end
162
77
 
163
- it 'handles malformed MIME type strings' do
164
- path = create_test_file('Test content')
165
-
166
- # Either succeeds or raises with meaningful error - both acceptable
167
- result_or_error = nil
168
- begin
169
- result_or_error = Kreuzberg.extract_file_sync(path, mime_type: '///invalid@@@')
170
- rescue StandardError => e
171
- result_or_error = e
172
- end
173
-
174
- expect([Kreuzberg::Result, StandardError].any? { |klass| result_or_error.is_a?(klass) }).to be_truthy
78
+ it 'raises error for invalid chunking config' do
79
+ expect do
80
+ Kreuzberg::Config::Extraction.new(chunking: 123)
81
+ end.to raise_error(ArgumentError)
175
82
  end
176
83
 
177
- it 'rejects empty MIME type with appropriate error' do
178
- path = create_test_file('Test')
179
-
180
- # Empty MIME type should either be rejected or handled gracefully
181
- result_or_error = nil
182
- begin
183
- Kreuzberg.extract_file_sync(path, mime_type: '')
184
- rescue StandardError => e
185
- result_or_error = e
186
- end
187
-
188
- expect(result_or_error).to be_a(StandardError) if result_or_error
84
+ it 'raises error for invalid language_detection config' do
85
+ expect do
86
+ Kreuzberg::Config::Extraction.new(language_detection: [])
87
+ end.to raise_error(ArgumentError)
189
88
  end
190
- end
191
-
192
- describe 'permission and I/O errors' do
193
- it 'raises IOError or subclass for permission denied scenario' do
194
- # This is environment-dependent, so we test gracefully
195
-
196
- # Try to write to a file we cannot read from (if setup permits)
197
- test_file = create_test_file('test content')
198
- File.chmod(0o000, test_file)
199
89
 
200
- begin
201
- Kreuzberg.extract_file_sync(path: test_file)
202
- ensure
203
- File.chmod(0o644, test_file)
204
- end
205
- rescue Kreuzberg::Errors::IOError => e
206
- expect(e).to be_a(Kreuzberg::Errors::IOError)
207
- rescue Errno::EACCES
208
- # Platform-specific permission error is acceptable
209
- expect(true).to be_truthy
210
- rescue StandardError => e
211
- # Other IO errors are acceptable
212
- expect(e).to be_a(StandardError)
90
+ it 'raises error for invalid pdf_options config' do
91
+ expect do
92
+ Kreuzberg::Config::Extraction.new(pdf_options: 'invalid')
93
+ end.to raise_error(ArgumentError)
213
94
  end
214
95
  end
215
96
 
216
- describe 'malformed document handling' do
217
- it 'handles invalid JSON metadata gracefully' do
218
- result = Kreuzberg::Result.new(
219
- content: 'Test content',
220
- mime_type: 'text/plain',
221
- metadata_json: 'this is not valid json {'
222
- )
223
-
224
- expect(result.content).to eq('Test content')
225
- expect(result.metadata).to eq({})
226
- expect(result.metadata).to be_a(Hash)
227
- end
228
-
229
- it 'handles empty metadata JSON' do
230
- result = Kreuzberg::Result.new(
231
- content: 'Test',
232
- mime_type: 'text/plain',
233
- metadata_json: ''
234
- )
235
-
236
- expect(result.metadata).to eq({})
237
- expect(result.content).to eq('Test')
238
- end
239
-
240
- it 'handles nil metadata JSON' do
241
- result = Kreuzberg::Result.new(
242
- content: 'Test',
243
- mime_type: 'text/plain',
244
- metadata_json: nil
245
- )
246
-
247
- expect(result.metadata).to eq({})
248
- end
249
-
250
- it 'handles malformed result object gracefully' do
251
- result = Kreuzberg::Result.new({})
252
-
253
- expect(result.content).to eq('')
254
- expect(result.mime_type).to eq('')
255
- expect(result.metadata).to eq({})
256
- expect(result.tables).to eq([])
257
- expect(result.detected_languages).to be_nil
258
- expect(result.chunks).to eq([])
259
- expect(result.images).to be_nil
260
- end
261
-
262
- it 'handles partial result data without errors' do
263
- result = Kreuzberg::Result.new(
264
- content: 'Partial content',
265
- mime_type: 'text/plain'
266
- )
267
-
268
- expect(result.content).to eq('Partial content')
269
- expect(result.mime_type).to eq('text/plain')
270
- expect(result.tables).to eq([])
271
- expect(result.metadata).to eq({})
97
+ describe 'error context' do
98
+ it 'provides meaningful error messages' do
99
+ Kreuzberg.extract_file_sync('/nonexistent/file.pdf')
100
+ raise 'Expected an error to be raised'
101
+ rescue StandardError => e
102
+ expect(e.message).not_to be_empty
272
103
  end
273
104
  end
274
105
 
275
- describe 'batch extraction error handling' do
276
- it 'handles mixed valid and invalid files in batch' do
106
+ describe 'batch extraction errors' do
107
+ it 'handles mixed valid and invalid files' do
277
108
  files = [
278
- create_test_file('Valid file content'),
109
+ create_test_file('Valid'),
279
110
  '/definitely/nonexistent/file.txt'
280
111
  ]
281
112
 
113
+ # Implementation may either raise error or handle gracefully
282
114
  begin
283
115
  result = Kreuzberg.batch_extract_files_sync(files)
284
116
  expect(result).to be_an(Array)
285
117
  rescue StandardError => e
286
118
  expect(e).to be_a(StandardError)
287
- expect(e.message).not_to be_empty
288
119
  end
289
120
  end
290
121
 
291
- it 'handles all invalid files in batch without crashing' do
122
+ it 'handles all invalid files' do
292
123
  files = [
293
124
  '/nonexistent1.txt',
294
125
  '/nonexistent2.txt',
295
126
  '/nonexistent3.txt'
296
127
  ]
297
128
 
129
+ # Batch operations may either fail fast or return partial results
298
130
  begin
299
131
  result = Kreuzberg.batch_extract_files_sync(files)
132
+ # If no error is raised, result should be an array (possibly empty or with errors)
300
133
  expect(result).to be_an(Array)
301
134
  rescue StandardError => e
302
- expect(e).to be_a(StandardError)
303
- end
304
- end
305
-
306
- it 'provides error context in batch results' do
307
- files = [
308
- create_test_file('First file'),
309
- '/nonexistent/second.txt'
310
- ]
311
-
312
- begin
313
- results = Kreuzberg.batch_extract_files_sync(files)
314
- expect(results).to be_an(Array)
315
- rescue StandardError => e
135
+ # If error is raised, it should be a StandardError
316
136
  expect(e).to be_a(StandardError)
317
137
  end
318
138
  end
319
139
  end
320
140
 
321
- describe 'concurrent error states' do
322
- it 'handles rapid successive error operations' do
323
- errors = []
324
-
325
- 3.times do |i|
326
- Kreuzberg.extract_file_sync(path: "/nonexistent#{i}.pdf")
327
- rescue StandardError => e
328
- errors << e
329
- end
330
-
331
- expect(errors.length).to eq(3)
332
- expect(errors).to all(be_a(StandardError))
141
+ describe 'async error handling' do
142
+ it 'propagates errors in async extraction' do
143
+ expect do
144
+ Kreuzberg.extract_file('/nonexistent/async/file.txt')
145
+ end.to raise_error(StandardError)
333
146
  end
334
- end
335
-
336
- describe 'error recovery and graceful degradation' do
337
- it 'recovers gracefully after file not found error' do
338
- # First operation: try to extract from nonexistent file
339
- error_caught = false
340
- begin
341
- Kreuzberg.extract_file_sync(path: '/nonexistent/does_not_exist.txt')
342
- rescue StandardError
343
- error_caught = true
344
- end
345
-
346
- expect(error_caught).to be_truthy
347
147
 
348
- # Second operation: should work fine with valid file
349
- valid_file = create_test_file('Valid content after error')
350
- result = Kreuzberg.extract_file_sync(path: valid_file)
148
+ it 'propagates errors in async bytes extraction' do
149
+ # Implementation may either handle invalid MIME types or raise error
351
150
 
151
+ result = Kreuzberg.extract_bytes('data', 'invalid/mime/type/that/causes/error')
352
152
  expect(result).to be_a(Kreuzberg::Result)
153
+ rescue StandardError => e
154
+ expect(e).to be_a(StandardError)
353
155
  end
156
+ end
354
157
 
355
- it 'handles mixed error and success scenarios in sequence' do
356
- results = []
357
-
358
- # Try invalid file
359
- begin
360
- Kreuzberg.extract_file_sync(path: '/nonexistent/file1.txt')
361
- rescue StandardError
362
- results << :error1
363
- end
158
+ describe 'result parsing errors' do
159
+ it 'handles malformed result gracefully' do
160
+ # This tests the Result class constructor with edge cases
161
+ result = Kreuzberg::Result.new({})
364
162
 
365
- # Valid extraction
366
- valid_file = create_test_file('Valid content')
367
- Kreuzberg.extract_file_sync(valid_file)
368
- results << :success1
163
+ expect(result.content).to eq('')
164
+ expect(result.mime_type).to eq('')
165
+ expect(result.metadata).to eq({})
166
+ expect(result.tables).to eq([])
167
+ expect(result.detected_languages).to be_nil
168
+ expect(result.chunks).to be_nil
169
+ expect(result.images).to be_nil
170
+ end
369
171
 
370
- # Another invalid file
371
- begin
372
- Kreuzberg.extract_file_sync(path: '/nonexistent/file2.txt')
373
- rescue StandardError
374
- results << :error2
375
- end
172
+ it 'handles partial result data' do
173
+ result = Kreuzberg::Result.new(
174
+ content: 'Test',
175
+ mime_type: 'text/plain'
176
+ )
376
177
 
377
- expect(results).to eq(%i[error1 success1 error2])
178
+ expect(result.content).to eq('Test')
179
+ expect(result.mime_type).to eq('text/plain')
180
+ expect(result.tables).to eq([])
378
181
  end
379
- end
380
182
 
381
- describe 'type conversion and coercion errors' do
382
- it 'handles non-string content in results gracefully' do
383
- path = create_test_file('Type coercion test')
384
- result = Kreuzberg.extract_file_sync(path: path)
183
+ it 'parses invalid metadata JSON' do
184
+ result = Kreuzberg::Result.new(
185
+ content: 'Test',
186
+ mime_type: 'text/plain',
187
+ metadata_json: 'invalid json{'
188
+ )
385
189
 
386
- expect(result.content).to be_a(String)
387
- expect(result.mime_type).to be_a(String)
190
+ expect(result.metadata).to eq({})
388
191
  end
389
192
 
390
- it 'extracts images with proper encoding handling' do
193
+ it 'parses extracted images' do
391
194
  result = Kreuzberg::Result.new(image_result_payload)
392
195
  image = result.images&.first
393
196
 
@@ -396,4 +199,15 @@ RSpec.describe 'Error Handling' do
396
199
  expect(image&.ocr_result).to be_a(Kreuzberg::Result)
397
200
  end
398
201
  end
202
+
203
+ describe 'type conversion errors' do
204
+ it 'handles non-string content gracefully' do
205
+ # Test that the wrapper handles type coercion
206
+ path = create_test_file('Type test')
207
+ result = Kreuzberg.extract_file_sync(path)
208
+
209
+ expect(result.content).to be_a(String)
210
+ expect(result.mime_type).to be_a(String)
211
+ end
212
+ end
399
213
  end
@@ -36,7 +36,7 @@ RSpec.describe 'OCR Backend Plugin System' do
36
36
  ocr: Kreuzberg::Config::Ocr.new(backend: 'mock-ocr')
37
37
  )
38
38
 
39
- result = Kreuzberg.extract_file_sync(path: test_image, config: config)
39
+ result = Kreuzberg.extract_file_sync(test_image, config: config)
40
40
 
41
41
  expect(backend.process_called).to be true
42
42
  expect(result.content).to include('Mocked OCR text')
@@ -69,7 +69,7 @@ RSpec.describe 'OCR Backend Plugin System' do
69
69
  )
70
70
  )
71
71
 
72
- Kreuzberg.extract_file_sync(path: test_image, config: config)
72
+ Kreuzberg.extract_file_sync(test_image, config: config)
73
73
 
74
74
  expect(backend.received_config).to be_a(Hash)
75
75
  expect(backend.received_config['backend']).to eq('config-capture')
@@ -102,7 +102,7 @@ RSpec.describe 'OCR Backend Plugin System' do
102
102
  ocr: Kreuzberg::Config::Ocr.new(backend: 'bytes-capture')
103
103
  )
104
104
 
105
- Kreuzberg.extract_file_sync(path: test_image, config: config)
105
+ Kreuzberg.extract_file_sync(test_image, config: config)
106
106
 
107
107
  received_bytes = BytesCapturingBackend.instance_variable_get(:@received_bytes)
108
108
  expect(received_bytes).to be_a(String)
@@ -131,7 +131,7 @@ RSpec.describe 'OCR Backend Plugin System' do
131
131
  ocr: Kreuzberg::Config::Ocr.new(backend: 'simple-ocr')
132
132
  )
133
133
 
134
- result = Kreuzberg.extract_file_sync(path: test_image, config: config)
134
+ result = Kreuzberg.extract_file_sync(test_image, config: config)
135
135
 
136
136
  expect(result.content).to include('Invoice Total')
137
137
  expect(result.content).to include('1,234.56')
@@ -167,8 +167,8 @@ RSpec.describe 'OCR Backend Plugin System' do
167
167
  ocr: Kreuzberg::Config::Ocr.new(backend: 'stateful-ocr')
168
168
  )
169
169
 
170
- Kreuzberg.extract_file_sync(path: test_image, config: config)
171
- Kreuzberg.extract_file_sync(path: test_image, config: config)
170
+ Kreuzberg.extract_file_sync(test_image, config: config)
171
+ Kreuzberg.extract_file_sync(test_image, config: config)
172
172
 
173
173
  expect(backend.call_count).to be >= 1
174
174
  end
@@ -197,7 +197,7 @@ RSpec.describe 'OCR Backend Plugin System' do
197
197
  )
198
198
 
199
199
  expect do
200
- Kreuzberg.extract_file_sync(path: test_image, config: config)
200
+ Kreuzberg.extract_file_sync(test_image, config: config)
201
201
  end.to raise_error(StandardError, /OCR processing failed/)
202
202
  end
203
203
 
@@ -208,7 +208,7 @@ RSpec.describe 'OCR Backend Plugin System' do
208
208
  )
209
209
 
210
210
  expect do
211
- Kreuzberg.extract_file_sync(path: test_image, config: config)
211
+ Kreuzberg.extract_file_sync(test_image, config: config)
212
212
  end.to raise_error
213
213
  end
214
214
  end