kreuzberg 4.0.0.pre.rc.29 → 4.0.0.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (321) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +0 -6
  3. data/.rubocop.yaml +534 -1
  4. data/Gemfile +2 -1
  5. data/Gemfile.lock +28 -116
  6. data/README.md +269 -629
  7. data/Rakefile +0 -9
  8. data/Steepfile +4 -8
  9. data/examples/async_patterns.rb +58 -1
  10. data/ext/kreuzberg_rb/extconf.rb +5 -35
  11. data/ext/kreuzberg_rb/native/Cargo.toml +16 -55
  12. data/ext/kreuzberg_rb/native/build.rs +14 -12
  13. data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
  14. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
  15. data/ext/kreuzberg_rb/native/include/strings.h +2 -2
  16. data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
  17. data/ext/kreuzberg_rb/native/src/lib.rs +34 -897
  18. data/extconf.rb +6 -38
  19. data/kreuzberg.gemspec +20 -114
  20. data/lib/kreuzberg/api_proxy.rb +18 -2
  21. data/lib/kreuzberg/cache_api.rb +0 -22
  22. data/lib/kreuzberg/cli.rb +10 -2
  23. data/lib/kreuzberg/cli_proxy.rb +10 -0
  24. data/lib/kreuzberg/config.rb +22 -274
  25. data/lib/kreuzberg/errors.rb +7 -73
  26. data/lib/kreuzberg/extraction_api.rb +8 -237
  27. data/lib/kreuzberg/mcp_proxy.rb +11 -2
  28. data/lib/kreuzberg/ocr_backend_protocol.rb +73 -0
  29. data/lib/kreuzberg/post_processor_protocol.rb +71 -0
  30. data/lib/kreuzberg/result.rb +33 -151
  31. data/lib/kreuzberg/setup_lib_path.rb +2 -22
  32. data/lib/kreuzberg/validator_protocol.rb +73 -0
  33. data/lib/kreuzberg/version.rb +1 -1
  34. data/lib/kreuzberg.rb +13 -27
  35. data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
  36. data/sig/kreuzberg.rbs +12 -105
  37. data/spec/binding/cache_spec.rb +22 -22
  38. data/spec/binding/cli_proxy_spec.rb +4 -2
  39. data/spec/binding/cli_spec.rb +11 -12
  40. data/spec/binding/config_spec.rb +0 -74
  41. data/spec/binding/config_validation_spec.rb +6 -100
  42. data/spec/binding/error_handling_spec.rb +97 -283
  43. data/spec/binding/plugins/ocr_backend_spec.rb +8 -8
  44. data/spec/binding/plugins/postprocessor_spec.rb +11 -11
  45. data/spec/binding/plugins/validator_spec.rb +13 -12
  46. data/spec/examples.txt +104 -0
  47. data/spec/fixtures/config.toml +1 -0
  48. data/spec/fixtures/config.yaml +1 -0
  49. data/spec/fixtures/invalid_config.toml +1 -0
  50. data/spec/smoke/package_spec.rb +3 -2
  51. data/spec/spec_helper.rb +3 -1
  52. data/vendor/kreuzberg/Cargo.toml +67 -192
  53. data/vendor/kreuzberg/README.md +9 -97
  54. data/vendor/kreuzberg/build.rs +194 -516
  55. data/vendor/kreuzberg/src/api/handlers.rs +9 -130
  56. data/vendor/kreuzberg/src/api/mod.rs +3 -18
  57. data/vendor/kreuzberg/src/api/server.rs +71 -236
  58. data/vendor/kreuzberg/src/api/types.rs +7 -43
  59. data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
  60. data/vendor/kreuzberg/src/cache/mod.rs +3 -27
  61. data/vendor/kreuzberg/src/chunking/mod.rs +79 -1705
  62. data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
  63. data/vendor/kreuzberg/src/core/config.rs +23 -905
  64. data/vendor/kreuzberg/src/core/extractor.rs +106 -403
  65. data/vendor/kreuzberg/src/core/io.rs +2 -4
  66. data/vendor/kreuzberg/src/core/mime.rs +12 -2
  67. data/vendor/kreuzberg/src/core/mod.rs +3 -22
  68. data/vendor/kreuzberg/src/core/pipeline.rs +78 -395
  69. data/vendor/kreuzberg/src/embeddings.rs +21 -169
  70. data/vendor/kreuzberg/src/error.rs +2 -2
  71. data/vendor/kreuzberg/src/extraction/archive.rs +31 -36
  72. data/vendor/kreuzberg/src/extraction/docx.rs +1 -365
  73. data/vendor/kreuzberg/src/extraction/email.rs +11 -12
  74. data/vendor/kreuzberg/src/extraction/excel.rs +129 -138
  75. data/vendor/kreuzberg/src/extraction/html.rs +170 -1447
  76. data/vendor/kreuzberg/src/extraction/image.rs +14 -138
  77. data/vendor/kreuzberg/src/extraction/libreoffice.rs +3 -13
  78. data/vendor/kreuzberg/src/extraction/mod.rs +5 -21
  79. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
  80. data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
  81. data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
  82. data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
  83. data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
  84. data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
  85. data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
  86. data/vendor/kreuzberg/src/extraction/pptx.rs +94 -196
  87. data/vendor/kreuzberg/src/extraction/structured.rs +4 -5
  88. data/vendor/kreuzberg/src/extraction/table.rs +1 -2
  89. data/vendor/kreuzberg/src/extraction/text.rs +10 -18
  90. data/vendor/kreuzberg/src/extractors/archive.rs +0 -22
  91. data/vendor/kreuzberg/src/extractors/docx.rs +148 -69
  92. data/vendor/kreuzberg/src/extractors/email.rs +9 -37
  93. data/vendor/kreuzberg/src/extractors/excel.rs +40 -81
  94. data/vendor/kreuzberg/src/extractors/html.rs +173 -182
  95. data/vendor/kreuzberg/src/extractors/image.rs +8 -32
  96. data/vendor/kreuzberg/src/extractors/mod.rs +10 -171
  97. data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
  98. data/vendor/kreuzberg/src/extractors/pdf.rs +64 -329
  99. data/vendor/kreuzberg/src/extractors/pptx.rs +34 -79
  100. data/vendor/kreuzberg/src/extractors/structured.rs +0 -16
  101. data/vendor/kreuzberg/src/extractors/text.rs +7 -30
  102. data/vendor/kreuzberg/src/extractors/xml.rs +8 -27
  103. data/vendor/kreuzberg/src/keywords/processor.rs +1 -9
  104. data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
  105. data/vendor/kreuzberg/src/language_detection/mod.rs +51 -94
  106. data/vendor/kreuzberg/src/lib.rs +5 -17
  107. data/vendor/kreuzberg/src/mcp/mod.rs +1 -4
  108. data/vendor/kreuzberg/src/mcp/server.rs +21 -145
  109. data/vendor/kreuzberg/src/ocr/mod.rs +0 -2
  110. data/vendor/kreuzberg/src/ocr/processor.rs +8 -19
  111. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +0 -2
  112. data/vendor/kreuzberg/src/pdf/error.rs +1 -93
  113. data/vendor/kreuzberg/src/pdf/metadata.rs +100 -263
  114. data/vendor/kreuzberg/src/pdf/mod.rs +2 -33
  115. data/vendor/kreuzberg/src/pdf/rendering.rs +12 -12
  116. data/vendor/kreuzberg/src/pdf/table.rs +64 -61
  117. data/vendor/kreuzberg/src/pdf/text.rs +24 -416
  118. data/vendor/kreuzberg/src/plugins/extractor.rs +8 -40
  119. data/vendor/kreuzberg/src/plugins/mod.rs +0 -3
  120. data/vendor/kreuzberg/src/plugins/ocr.rs +14 -22
  121. data/vendor/kreuzberg/src/plugins/processor.rs +1 -10
  122. data/vendor/kreuzberg/src/plugins/registry.rs +0 -15
  123. data/vendor/kreuzberg/src/plugins/validator.rs +8 -20
  124. data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
  125. data/vendor/kreuzberg/src/text/mod.rs +0 -8
  126. data/vendor/kreuzberg/src/text/quality.rs +15 -28
  127. data/vendor/kreuzberg/src/text/string_utils.rs +10 -22
  128. data/vendor/kreuzberg/src/text/token_reduction/core.rs +50 -86
  129. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +16 -37
  130. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +1 -2
  131. data/vendor/kreuzberg/src/types.rs +67 -907
  132. data/vendor/kreuzberg/src/utils/mod.rs +0 -14
  133. data/vendor/kreuzberg/src/utils/quality.rs +3 -12
  134. data/vendor/kreuzberg/tests/api_tests.rs +0 -506
  135. data/vendor/kreuzberg/tests/archive_integration.rs +0 -2
  136. data/vendor/kreuzberg/tests/batch_orchestration.rs +12 -57
  137. data/vendor/kreuzberg/tests/batch_processing.rs +8 -32
  138. data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
  139. data/vendor/kreuzberg/tests/concurrency_stress.rs +8 -40
  140. data/vendor/kreuzberg/tests/config_features.rs +1 -33
  141. data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -16
  142. data/vendor/kreuzberg/tests/core_integration.rs +9 -35
  143. data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
  144. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
  145. data/vendor/kreuzberg/tests/email_integration.rs +1 -3
  146. data/vendor/kreuzberg/tests/error_handling.rs +34 -43
  147. data/vendor/kreuzberg/tests/format_integration.rs +1 -7
  148. data/vendor/kreuzberg/tests/helpers/mod.rs +0 -60
  149. data/vendor/kreuzberg/tests/image_integration.rs +0 -2
  150. data/vendor/kreuzberg/tests/mime_detection.rs +16 -17
  151. data/vendor/kreuzberg/tests/ocr_configuration.rs +0 -4
  152. data/vendor/kreuzberg/tests/ocr_errors.rs +0 -22
  153. data/vendor/kreuzberg/tests/ocr_quality.rs +0 -2
  154. data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
  155. data/vendor/kreuzberg/tests/pdf_integration.rs +0 -2
  156. data/vendor/kreuzberg/tests/pipeline_integration.rs +2 -36
  157. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +0 -5
  158. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -17
  159. data/vendor/kreuzberg/tests/plugin_system.rs +0 -6
  160. data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -2
  161. data/vendor/kreuzberg/tests/security_validation.rs +1 -13
  162. data/vendor/kreuzberg/tests/test_fastembed.rs +23 -45
  163. metadata +25 -171
  164. data/.rubocop.yml +0 -543
  165. data/ext/kreuzberg_rb/native/.cargo/config.toml +0 -23
  166. data/ext/kreuzberg_rb/native/Cargo.lock +0 -7619
  167. data/lib/kreuzberg/error_context.rb +0 -136
  168. data/lib/kreuzberg/types.rb +0 -170
  169. data/lib/libpdfium.so +0 -0
  170. data/spec/binding/async_operations_spec.rb +0 -473
  171. data/spec/binding/batch_operations_spec.rb +0 -595
  172. data/spec/binding/batch_spec.rb +0 -359
  173. data/spec/binding/config_result_spec.rb +0 -377
  174. data/spec/binding/embeddings_spec.rb +0 -816
  175. data/spec/binding/error_recovery_spec.rb +0 -488
  176. data/spec/binding/font_config_spec.rb +0 -220
  177. data/spec/binding/images_spec.rb +0 -738
  178. data/spec/binding/keywords_extraction_spec.rb +0 -600
  179. data/spec/binding/metadata_types_spec.rb +0 -1228
  180. data/spec/binding/pages_extraction_spec.rb +0 -471
  181. data/spec/binding/tables_spec.rb +0 -641
  182. data/spec/unit/config/chunking_config_spec.rb +0 -213
  183. data/spec/unit/config/embedding_config_spec.rb +0 -343
  184. data/spec/unit/config/extraction_config_spec.rb +0 -438
  185. data/spec/unit/config/font_config_spec.rb +0 -285
  186. data/spec/unit/config/hierarchy_config_spec.rb +0 -314
  187. data/spec/unit/config/image_extraction_config_spec.rb +0 -209
  188. data/spec/unit/config/image_preprocessing_config_spec.rb +0 -249
  189. data/spec/unit/config/keyword_config_spec.rb +0 -229
  190. data/spec/unit/config/language_detection_config_spec.rb +0 -258
  191. data/spec/unit/config/ocr_config_spec.rb +0 -171
  192. data/spec/unit/config/page_config_spec.rb +0 -221
  193. data/spec/unit/config/pdf_config_spec.rb +0 -267
  194. data/spec/unit/config/postprocessor_config_spec.rb +0 -290
  195. data/spec/unit/config/tesseract_config_spec.rb +0 -181
  196. data/spec/unit/config/token_reduction_config_spec.rb +0 -251
  197. data/test/metadata_types_test.rb +0 -959
  198. data/vendor/Cargo.toml +0 -61
  199. data/vendor/kreuzberg/examples/bench_fixes.rs +0 -71
  200. data/vendor/kreuzberg/examples/test_pdfium_fork.rs +0 -62
  201. data/vendor/kreuzberg/src/chunking/processor.rs +0 -219
  202. data/vendor/kreuzberg/src/core/batch_optimizations.rs +0 -385
  203. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  204. data/vendor/kreuzberg/src/core/formats.rs +0 -235
  205. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  206. data/vendor/kreuzberg/src/extraction/capacity.rs +0 -263
  207. data/vendor/kreuzberg/src/extraction/markdown.rs +0 -216
  208. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -284
  209. data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -470
  210. data/vendor/kreuzberg/src/extractors/docbook.rs +0 -504
  211. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  212. data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -492
  213. data/vendor/kreuzberg/src/extractors/jats.rs +0 -1054
  214. data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -368
  215. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  216. data/vendor/kreuzberg/src/extractors/markdown.rs +0 -701
  217. data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
  218. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  219. data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -529
  220. data/vendor/kreuzberg/src/extractors/rst.rs +0 -577
  221. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  222. data/vendor/kreuzberg/src/extractors/security.rs +0 -484
  223. data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
  224. data/vendor/kreuzberg/src/extractors/typst.rs +0 -651
  225. data/vendor/kreuzberg/src/language_detection/processor.rs +0 -218
  226. data/vendor/kreuzberg/src/ocr/language_registry.rs +0 -520
  227. data/vendor/kreuzberg/src/panic_context.rs +0 -154
  228. data/vendor/kreuzberg/src/pdf/bindings.rs +0 -306
  229. data/vendor/kreuzberg/src/pdf/bundled.rs +0 -408
  230. data/vendor/kreuzberg/src/pdf/fonts.rs +0 -358
  231. data/vendor/kreuzberg/src/pdf/hierarchy.rs +0 -903
  232. data/vendor/kreuzberg/src/text/quality_processor.rs +0 -231
  233. data/vendor/kreuzberg/src/text/utf8_validation.rs +0 -193
  234. data/vendor/kreuzberg/src/utils/pool.rs +0 -503
  235. data/vendor/kreuzberg/src/utils/pool_sizing.rs +0 -364
  236. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -761
  237. data/vendor/kreuzberg/tests/api_embed.rs +0 -360
  238. data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
  239. data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +0 -471
  240. data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +0 -289
  241. data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +0 -154
  242. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
  243. data/vendor/kreuzberg/tests/config_integration_test.rs +0 -753
  244. data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +0 -294
  245. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -500
  246. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
  247. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
  248. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
  249. data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
  250. data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
  251. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
  252. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
  253. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
  254. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
  255. data/vendor/kreuzberg/tests/ocr_language_registry.rs +0 -191
  256. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -674
  257. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
  258. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
  259. data/vendor/kreuzberg/tests/page_markers.rs +0 -297
  260. data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +0 -301
  261. data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +0 -589
  262. data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +0 -301
  263. data/vendor/kreuzberg/tests/pdf_text_merging.rs +0 -475
  264. data/vendor/kreuzberg/tests/pdfium_linking.rs +0 -340
  265. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -694
  266. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -775
  267. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1260
  268. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -648
  269. data/vendor/kreuzberg-ffi/Cargo.toml +0 -67
  270. data/vendor/kreuzberg-ffi/README.md +0 -851
  271. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +0 -227
  272. data/vendor/kreuzberg-ffi/build.rs +0 -168
  273. data/vendor/kreuzberg-ffi/cbindgen.toml +0 -37
  274. data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +0 -12
  275. data/vendor/kreuzberg-ffi/kreuzberg.h +0 -3012
  276. data/vendor/kreuzberg-ffi/src/batch_streaming.rs +0 -588
  277. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
  278. data/vendor/kreuzberg-ffi/src/error.rs +0 -901
  279. data/vendor/kreuzberg-ffi/src/extraction.rs +0 -555
  280. data/vendor/kreuzberg-ffi/src/helpers.rs +0 -879
  281. data/vendor/kreuzberg-ffi/src/lib.rs +0 -977
  282. data/vendor/kreuzberg-ffi/src/memory.rs +0 -493
  283. data/vendor/kreuzberg-ffi/src/mime.rs +0 -329
  284. data/vendor/kreuzberg-ffi/src/panic_shield.rs +0 -265
  285. data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +0 -442
  286. data/vendor/kreuzberg-ffi/src/plugins/mod.rs +0 -14
  287. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +0 -628
  288. data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +0 -438
  289. data/vendor/kreuzberg-ffi/src/plugins/validator.rs +0 -329
  290. data/vendor/kreuzberg-ffi/src/result.rs +0 -510
  291. data/vendor/kreuzberg-ffi/src/result_pool.rs +0 -639
  292. data/vendor/kreuzberg-ffi/src/result_view.rs +0 -773
  293. data/vendor/kreuzberg-ffi/src/string_intern.rs +0 -568
  294. data/vendor/kreuzberg-ffi/src/types.rs +0 -363
  295. data/vendor/kreuzberg-ffi/src/util.rs +0 -210
  296. data/vendor/kreuzberg-ffi/src/validation.rs +0 -848
  297. data/vendor/kreuzberg-ffi/tests.disabled/README.md +0 -48
  298. data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +0 -299
  299. data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +0 -346
  300. data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +0 -232
  301. data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +0 -470
  302. data/vendor/kreuzberg-tesseract/.commitlintrc.json +0 -13
  303. data/vendor/kreuzberg-tesseract/.crate-ignore +0 -2
  304. data/vendor/kreuzberg-tesseract/Cargo.lock +0 -2933
  305. data/vendor/kreuzberg-tesseract/Cargo.toml +0 -57
  306. data/vendor/kreuzberg-tesseract/LICENSE +0 -22
  307. data/vendor/kreuzberg-tesseract/README.md +0 -399
  308. data/vendor/kreuzberg-tesseract/build.rs +0 -1127
  309. data/vendor/kreuzberg-tesseract/patches/README.md +0 -71
  310. data/vendor/kreuzberg-tesseract/patches/tesseract.diff +0 -199
  311. data/vendor/kreuzberg-tesseract/src/api.rs +0 -1371
  312. data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +0 -77
  313. data/vendor/kreuzberg-tesseract/src/enums.rs +0 -297
  314. data/vendor/kreuzberg-tesseract/src/error.rs +0 -81
  315. data/vendor/kreuzberg-tesseract/src/lib.rs +0 -145
  316. data/vendor/kreuzberg-tesseract/src/monitor.rs +0 -57
  317. data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +0 -197
  318. data/vendor/kreuzberg-tesseract/src/page_iterator.rs +0 -253
  319. data/vendor/kreuzberg-tesseract/src/result_iterator.rs +0 -286
  320. data/vendor/kreuzberg-tesseract/src/result_renderer.rs +0 -183
  321. data/vendor/kreuzberg-tesseract/tests/integration_test.rs +0 -211
@@ -1,438 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- RSpec.describe Kreuzberg::Config::Extraction do
4
- describe '#initialize' do
5
- it 'creates config with default values' do
6
- config = described_class.new
7
-
8
- expect(config.use_cache).to be true
9
- expect(config.enable_quality_processing).to be false
10
- expect(config.force_ocr).to be false
11
- expect(config.ocr).to be_nil
12
- expect(config.chunking).to be_nil
13
- expect(config.language_detection).to be_nil
14
- expect(config.pdf_options).to be_nil
15
- expect(config.image_extraction).to be_nil
16
- expect(config.image_preprocessing).to be_nil
17
- expect(config.postprocessor).to be_nil
18
- expect(config.token_reduction).to be_nil
19
- expect(config.keywords).to be_nil
20
- expect(config.html_options).to be_nil
21
- expect(config.pages).to be_nil
22
- expect(config.max_concurrent_extractions).to be_nil
23
- end
24
-
25
- it 'creates config with custom boolean values' do
26
- config = described_class.new(
27
- use_cache: false,
28
- enable_quality_processing: true,
29
- force_ocr: true
30
- )
31
-
32
- expect(config.use_cache).to be false
33
- expect(config.enable_quality_processing).to be true
34
- expect(config.force_ocr).to be true
35
- end
36
-
37
- it 'accepts all nested config instances' do
38
- ocr = Kreuzberg::Config::OCR.new(backend: 'tesseract')
39
- chunking = Kreuzberg::Config::Chunking.new(max_chars: 500)
40
- lang_detect = Kreuzberg::Config::LanguageDetection.new(enabled: true)
41
-
42
- config = described_class.new(
43
- ocr: ocr,
44
- chunking: chunking,
45
- language_detection: lang_detect
46
- )
47
-
48
- expect(config.ocr).to be ocr
49
- expect(config.chunking).to be chunking
50
- expect(config.language_detection).to be lang_detect
51
- end
52
-
53
- it 'converts nested config hashes to instances' do
54
- config = described_class.new(
55
- ocr: { backend: 'easyocr', language: 'fra' },
56
- chunking: { max_chars: 750 }
57
- )
58
-
59
- expect(config.ocr).to be_a Kreuzberg::Config::OCR
60
- expect(config.ocr.backend).to eq 'easyocr'
61
- expect(config.chunking).to be_a Kreuzberg::Config::Chunking
62
- expect(config.chunking.max_chars).to eq 750
63
- end
64
-
65
- it 'converts max_concurrent_extractions to integer' do
66
- config = described_class.new(max_concurrent_extractions: '4')
67
-
68
- expect(config.max_concurrent_extractions).to eq 4
69
- expect(config.max_concurrent_extractions).to be_a Integer
70
- end
71
- end
72
-
73
- describe '#to_h' do
74
- it 'serializes to hash' do
75
- config = described_class.new(use_cache: true)
76
- hash = config.to_h
77
-
78
- expect(hash).to be_a Hash
79
- expect(hash[:use_cache]).to be true
80
- end
81
-
82
- it 'includes all nested configs in hash' do
83
- config = described_class.new(
84
- ocr: { backend: 'tesseract' },
85
- chunking: { max_chars: 500 }
86
- )
87
- hash = config.to_h
88
-
89
- expect(hash[:ocr]).to be_a Hash
90
- expect(hash[:chunking]).to be_a Hash
91
- end
92
-
93
- it 'compacts nil nested configs from hash' do
94
- config = described_class.new(use_cache: true)
95
- hash = config.to_h
96
-
97
- expect(hash.key?(:ocr)).to be false
98
- expect(hash.key?(:chunking)).to be false
99
- end
100
-
101
- it 'always includes top-level boolean values' do
102
- config = described_class.new
103
- hash = config.to_h
104
-
105
- expect(hash[:use_cache]).to be true
106
- expect(hash[:enable_quality_processing]).to be false
107
- expect(hash[:force_ocr]).to be false
108
- end
109
- end
110
-
111
- describe '#to_json' do
112
- it 'serializes to JSON string' do
113
- config = described_class.new(use_cache: true, force_ocr: false)
114
- json = config.to_json
115
-
116
- expect(json).to be_a String
117
- parsed = JSON.parse(json)
118
- expect(parsed['use_cache']).to be true
119
- expect(parsed['force_ocr']).to be false
120
- end
121
-
122
- it 'handles nested configs in JSON' do
123
- config = described_class.new(ocr: { backend: 'tesseract' })
124
- json = config.to_json
125
-
126
- parsed = JSON.parse(json)
127
- expect(parsed['ocr']['backend']).to eq 'tesseract'
128
- end
129
- end
130
-
131
- describe '#get_field' do
132
- it 'retrieves top-level field' do
133
- config = described_class.new(use_cache: false)
134
-
135
- expect(config.get_field('use_cache')).to be false
136
- end
137
-
138
- it 'retrieves nested field with dot notation' do
139
- config = described_class.new(ocr: { backend: 'tesseract' })
140
-
141
- expect(config.get_field('ocr.backend')).to eq 'tesseract'
142
- end
143
-
144
- it 'returns nil for non-existent field' do
145
- config = described_class.new
146
-
147
- expect(config.get_field('nonexistent')).to be_nil
148
- end
149
-
150
- it 'accepts symbol field names' do
151
- config = described_class.new(use_cache: true)
152
-
153
- expect(config.get_field(:use_cache)).to be true
154
- end
155
-
156
- it 'handles deeply nested fields' do
157
- config = described_class.new(
158
- chunking: { embedding: { model: { type: :preset, name: 'fast' } } }
159
- )
160
-
161
- expect(config.get_field('chunking.embedding.model')).to be_a Hash
162
- end
163
- end
164
-
165
- describe '#merge' do
166
- it 'merges two configs' do
167
- base = described_class.new(use_cache: true, force_ocr: false)
168
- override = described_class.new(force_ocr: true)
169
- merged = base.merge(override)
170
-
171
- expect(merged.use_cache).to be true
172
- expect(merged.force_ocr).to be true
173
- end
174
-
175
- it 'returns new config without modifying original' do
176
- base = described_class.new(use_cache: true)
177
- override = described_class.new(use_cache: false)
178
- merged = base.merge(override)
179
-
180
- expect(base.use_cache).to be true
181
- expect(merged.use_cache).to be false
182
- end
183
-
184
- it 'merges nested configs' do
185
- base = described_class.new(ocr: { backend: 'tesseract' })
186
- override = described_class.new(ocr: { language: 'fra' })
187
- merged = base.merge(override)
188
-
189
- expect(merged.ocr.backend).to eq 'tesseract'
190
- end
191
-
192
- it 'accepts hash as merge argument' do
193
- base = described_class.new(use_cache: true)
194
- merged = base.merge({ use_cache: false })
195
-
196
- expect(merged.use_cache).to be false
197
- end
198
- end
199
-
200
- describe '#merge!' do
201
- it 'mutates config in-place' do
202
- config = described_class.new(use_cache: true, force_ocr: false)
203
- override = described_class.new(force_ocr: true)
204
- result = config.merge!(override)
205
-
206
- expect(config.force_ocr).to be true
207
- expect(result).to be config
208
- end
209
-
210
- it 'returns self' do
211
- config = described_class.new
212
- override = described_class.new
213
-
214
- expect(config.merge!(override)).to be config
215
- end
216
-
217
- it 'accepts hash argument' do
218
- config = described_class.new(use_cache: true)
219
- config[:use_cache] = false
220
- config[:force_ocr] = true
221
-
222
- expect(config.use_cache).to be false
223
- expect(config.force_ocr).to be true
224
- end
225
- end
226
-
227
- describe 'validation' do
228
- it 'rejects invalid ocr type' do
229
- expect do
230
- described_class.new(ocr: 'invalid')
231
- end.to raise_error ArgumentError, /Expected.*OCR/
232
- end
233
-
234
- it 'rejects invalid chunking type' do
235
- expect do
236
- described_class.new(chunking: 123)
237
- end.to raise_error ArgumentError, /Expected.*Chunking/
238
- end
239
-
240
- it 'accepts valid nested instances' do
241
- expect do
242
- described_class.new(
243
- ocr: Kreuzberg::Config::OCR.new,
244
- chunking: Kreuzberg::Config::Chunking.new
245
- )
246
- end.not_to raise_error
247
- end
248
- end
249
-
250
- describe 'keyword arguments' do
251
- it 'accepts all keyword arguments' do
252
- config = described_class.new(
253
- use_cache: false,
254
- enable_quality_processing: true,
255
- force_ocr: true,
256
- ocr: { backend: 'tesseract' },
257
- chunking: { max_chars: 500 },
258
- language_detection: { enabled: true },
259
- pdf_options: { extract_images: true },
260
- image_extraction: { target_dpi: 600 },
261
- image_preprocessing: { denoise: true },
262
- postprocessor: { enabled: true },
263
- token_reduction: { mode: 'light' },
264
- keywords: { algorithm: 'yake' },
265
- pages: { extract_pages: true },
266
- max_concurrent_extractions: 4
267
- )
268
-
269
- expect(config.use_cache).to be false
270
- expect(config.enable_quality_processing).to be true
271
- expect(config.force_ocr).to be true
272
- expect(config.ocr).to be_a Kreuzberg::Config::OCR
273
- expect(config.max_concurrent_extractions).to eq 4
274
- end
275
- end
276
-
277
- describe 'equality' do
278
- it 'compares configs with same values' do
279
- config1 = described_class.new(use_cache: true, force_ocr: false)
280
- config2 = described_class.new(use_cache: true, force_ocr: false)
281
-
282
- expect(config1.use_cache).to eq config2.use_cache
283
- expect(config1.force_ocr).to eq config2.force_ocr
284
- end
285
-
286
- it 'detects differences' do
287
- config1 = described_class.new(use_cache: true)
288
- config2 = described_class.new(use_cache: false)
289
-
290
- expect(config1.use_cache).not_to eq config2.use_cache
291
- end
292
- end
293
-
294
- describe '.from_file' do
295
- it 'loads from TOML file' do
296
- config_path = File.join(__dir__, '../../fixtures/config.toml')
297
- config = described_class.from_file(config_path)
298
-
299
- expect(config).to be_a described_class
300
- expect(config.use_cache).to be false
301
- end
302
-
303
- it 'loads from YAML file' do
304
- config_path = File.join(__dir__, '../../fixtures/config.yaml')
305
- config = described_class.from_file(config_path)
306
-
307
- expect(config).to be_a described_class
308
- expect(config.use_cache).to be false
309
- end
310
-
311
- it 'raises error for non-existent file' do
312
- expect do
313
- described_class.from_file('/nonexistent/path/config.toml')
314
- end.to raise_error Kreuzberg::Errors::ValidationError
315
- end
316
- end
317
-
318
- describe '.discover' do
319
- it 'returns nil when no config file found' do
320
- # This test may vary by environment
321
- # Documenting the behavior
322
- config = described_class.discover
323
- # Should either return a config or nil
324
- expect(config.nil? || config.is_a?(described_class)).to be true
325
- end
326
- end
327
-
328
- describe 'boolean conversion' do
329
- it 'converts truthy use_cache to true' do
330
- config = described_class.new(use_cache: 1)
331
-
332
- expect(config.use_cache).to be true
333
- end
334
-
335
- it 'converts false use_cache to false' do
336
- config = described_class.new(use_cache: false)
337
-
338
- expect(config.use_cache).to be false
339
- end
340
-
341
- it 'converts truthy enable_quality_processing to true' do
342
- config = described_class.new(enable_quality_processing: 'yes')
343
-
344
- expect(config.enable_quality_processing).to be true
345
- end
346
-
347
- it 'converts false enable_quality_processing to false' do
348
- config = described_class.new(enable_quality_processing: false)
349
-
350
- expect(config.enable_quality_processing).to be false
351
- end
352
-
353
- it 'converts truthy force_ocr to true' do
354
- config = described_class.new(force_ocr: [1])
355
-
356
- expect(config.force_ocr).to be true
357
- end
358
-
359
- it 'converts false force_ocr to false' do
360
- config = described_class.new(force_ocr: false)
361
-
362
- expect(config.force_ocr).to be false
363
- end
364
- end
365
-
366
- describe 'complex nested configurations' do
367
- it 'handles deeply nested configs' do
368
- config = described_class.new(
369
- chunking: {
370
- max_chars: 750,
371
- embedding: {
372
- model: { type: :preset, name: 'balanced' },
373
- batch_size: 64
374
- }
375
- }
376
- )
377
-
378
- expect(config.chunking.embedding).to be_a Kreuzberg::Config::Embedding
379
- expect(config.chunking.embedding.batch_size).to eq 64
380
- end
381
-
382
- it 'handles PDF with font and hierarchy configs' do
383
- config = described_class.new(
384
- pdf_options: {
385
- extract_images: true,
386
- font_config: { enabled: true, custom_font_dirs: ['/fonts'] },
387
- hierarchy: { k_clusters: 8 }
388
- }
389
- )
390
-
391
- expect(config.pdf_options.font_config).to be_a Kreuzberg::Config::FontConfig
392
- expect(config.pdf_options.hierarchy).to be_a Kreuzberg::Config::Hierarchy
393
- end
394
-
395
- it 'handles complete extraction config' do
396
- config = described_class.new(
397
- use_cache: false,
398
- force_ocr: true,
399
- ocr: { backend: 'tesseract', language: 'deu' },
400
- chunking: { max_chars: 500, preset: 'fast' },
401
- language_detection: { enabled: true, min_confidence: 0.9 },
402
- pdf_options: { extract_images: true, passwords: ['secret'] },
403
- image_extraction: { target_dpi: 600 },
404
- image_preprocessing: { denoise: true, binarization_method: 'sauvola' },
405
- postprocessor: { enabled: true, enabled_processors: %w[quality] },
406
- token_reduction: { mode: 'light' },
407
- keywords: { algorithm: 'yake', max_keywords: 10 },
408
- pages: { extract_pages: true }
409
- )
410
-
411
- expect(config.use_cache).to be false
412
- expect(config.force_ocr).to be true
413
- expect(config.ocr.language).to eq 'deu'
414
- expect(config.chunking.max_chars).to eq 500
415
- expect(config.language_detection.enabled).to be true
416
- expect(config.pdf_options.extract_images).to be true
417
- expect(config.image_extraction.target_dpi).to eq 600
418
- expect(config.image_preprocessing.denoise).to be true
419
- expect(config.postprocessor.enabled).to be true
420
- expect(config.token_reduction.mode).to eq 'light'
421
- expect(config.keywords.max_keywords).to eq 10
422
- expect(config.pages.extract_pages).to be true
423
- end
424
- end
425
-
426
- describe 'ExtractionConfig alias' do
427
- it 'exists as module constant' do
428
- expect(Kreuzberg.const_defined?(:ExtractionConfig)).to be true
429
- end
430
-
431
- it 'can be instantiated through alias' do
432
- config = Kreuzberg::ExtractionConfig.new(use_cache: false)
433
-
434
- expect(config).to be_a described_class
435
- expect(config.use_cache).to be false
436
- end
437
- end
438
- end