kreuzberg 4.0.0.pre.rc.29 → 4.0.0.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (321) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +0 -6
  3. data/.rubocop.yaml +534 -1
  4. data/Gemfile +2 -1
  5. data/Gemfile.lock +28 -116
  6. data/README.md +269 -629
  7. data/Rakefile +0 -9
  8. data/Steepfile +4 -8
  9. data/examples/async_patterns.rb +58 -1
  10. data/ext/kreuzberg_rb/extconf.rb +5 -35
  11. data/ext/kreuzberg_rb/native/Cargo.toml +16 -55
  12. data/ext/kreuzberg_rb/native/build.rs +14 -12
  13. data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
  14. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
  15. data/ext/kreuzberg_rb/native/include/strings.h +2 -2
  16. data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
  17. data/ext/kreuzberg_rb/native/src/lib.rs +34 -897
  18. data/extconf.rb +6 -38
  19. data/kreuzberg.gemspec +20 -114
  20. data/lib/kreuzberg/api_proxy.rb +18 -2
  21. data/lib/kreuzberg/cache_api.rb +0 -22
  22. data/lib/kreuzberg/cli.rb +10 -2
  23. data/lib/kreuzberg/cli_proxy.rb +10 -0
  24. data/lib/kreuzberg/config.rb +22 -274
  25. data/lib/kreuzberg/errors.rb +7 -73
  26. data/lib/kreuzberg/extraction_api.rb +8 -237
  27. data/lib/kreuzberg/mcp_proxy.rb +11 -2
  28. data/lib/kreuzberg/ocr_backend_protocol.rb +73 -0
  29. data/lib/kreuzberg/post_processor_protocol.rb +71 -0
  30. data/lib/kreuzberg/result.rb +33 -151
  31. data/lib/kreuzberg/setup_lib_path.rb +2 -22
  32. data/lib/kreuzberg/validator_protocol.rb +73 -0
  33. data/lib/kreuzberg/version.rb +1 -1
  34. data/lib/kreuzberg.rb +13 -27
  35. data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
  36. data/sig/kreuzberg.rbs +12 -105
  37. data/spec/binding/cache_spec.rb +22 -22
  38. data/spec/binding/cli_proxy_spec.rb +4 -2
  39. data/spec/binding/cli_spec.rb +11 -12
  40. data/spec/binding/config_spec.rb +0 -74
  41. data/spec/binding/config_validation_spec.rb +6 -100
  42. data/spec/binding/error_handling_spec.rb +97 -283
  43. data/spec/binding/plugins/ocr_backend_spec.rb +8 -8
  44. data/spec/binding/plugins/postprocessor_spec.rb +11 -11
  45. data/spec/binding/plugins/validator_spec.rb +13 -12
  46. data/spec/examples.txt +104 -0
  47. data/spec/fixtures/config.toml +1 -0
  48. data/spec/fixtures/config.yaml +1 -0
  49. data/spec/fixtures/invalid_config.toml +1 -0
  50. data/spec/smoke/package_spec.rb +3 -2
  51. data/spec/spec_helper.rb +3 -1
  52. data/vendor/kreuzberg/Cargo.toml +67 -192
  53. data/vendor/kreuzberg/README.md +9 -97
  54. data/vendor/kreuzberg/build.rs +194 -516
  55. data/vendor/kreuzberg/src/api/handlers.rs +9 -130
  56. data/vendor/kreuzberg/src/api/mod.rs +3 -18
  57. data/vendor/kreuzberg/src/api/server.rs +71 -236
  58. data/vendor/kreuzberg/src/api/types.rs +7 -43
  59. data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
  60. data/vendor/kreuzberg/src/cache/mod.rs +3 -27
  61. data/vendor/kreuzberg/src/chunking/mod.rs +79 -1705
  62. data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
  63. data/vendor/kreuzberg/src/core/config.rs +23 -905
  64. data/vendor/kreuzberg/src/core/extractor.rs +106 -403
  65. data/vendor/kreuzberg/src/core/io.rs +2 -4
  66. data/vendor/kreuzberg/src/core/mime.rs +12 -2
  67. data/vendor/kreuzberg/src/core/mod.rs +3 -22
  68. data/vendor/kreuzberg/src/core/pipeline.rs +78 -395
  69. data/vendor/kreuzberg/src/embeddings.rs +21 -169
  70. data/vendor/kreuzberg/src/error.rs +2 -2
  71. data/vendor/kreuzberg/src/extraction/archive.rs +31 -36
  72. data/vendor/kreuzberg/src/extraction/docx.rs +1 -365
  73. data/vendor/kreuzberg/src/extraction/email.rs +11 -12
  74. data/vendor/kreuzberg/src/extraction/excel.rs +129 -138
  75. data/vendor/kreuzberg/src/extraction/html.rs +170 -1447
  76. data/vendor/kreuzberg/src/extraction/image.rs +14 -138
  77. data/vendor/kreuzberg/src/extraction/libreoffice.rs +3 -13
  78. data/vendor/kreuzberg/src/extraction/mod.rs +5 -21
  79. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
  80. data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
  81. data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
  82. data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
  83. data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
  84. data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
  85. data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
  86. data/vendor/kreuzberg/src/extraction/pptx.rs +94 -196
  87. data/vendor/kreuzberg/src/extraction/structured.rs +4 -5
  88. data/vendor/kreuzberg/src/extraction/table.rs +1 -2
  89. data/vendor/kreuzberg/src/extraction/text.rs +10 -18
  90. data/vendor/kreuzberg/src/extractors/archive.rs +0 -22
  91. data/vendor/kreuzberg/src/extractors/docx.rs +148 -69
  92. data/vendor/kreuzberg/src/extractors/email.rs +9 -37
  93. data/vendor/kreuzberg/src/extractors/excel.rs +40 -81
  94. data/vendor/kreuzberg/src/extractors/html.rs +173 -182
  95. data/vendor/kreuzberg/src/extractors/image.rs +8 -32
  96. data/vendor/kreuzberg/src/extractors/mod.rs +10 -171
  97. data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
  98. data/vendor/kreuzberg/src/extractors/pdf.rs +64 -329
  99. data/vendor/kreuzberg/src/extractors/pptx.rs +34 -79
  100. data/vendor/kreuzberg/src/extractors/structured.rs +0 -16
  101. data/vendor/kreuzberg/src/extractors/text.rs +7 -30
  102. data/vendor/kreuzberg/src/extractors/xml.rs +8 -27
  103. data/vendor/kreuzberg/src/keywords/processor.rs +1 -9
  104. data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
  105. data/vendor/kreuzberg/src/language_detection/mod.rs +51 -94
  106. data/vendor/kreuzberg/src/lib.rs +5 -17
  107. data/vendor/kreuzberg/src/mcp/mod.rs +1 -4
  108. data/vendor/kreuzberg/src/mcp/server.rs +21 -145
  109. data/vendor/kreuzberg/src/ocr/mod.rs +0 -2
  110. data/vendor/kreuzberg/src/ocr/processor.rs +8 -19
  111. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +0 -2
  112. data/vendor/kreuzberg/src/pdf/error.rs +1 -93
  113. data/vendor/kreuzberg/src/pdf/metadata.rs +100 -263
  114. data/vendor/kreuzberg/src/pdf/mod.rs +2 -33
  115. data/vendor/kreuzberg/src/pdf/rendering.rs +12 -12
  116. data/vendor/kreuzberg/src/pdf/table.rs +64 -61
  117. data/vendor/kreuzberg/src/pdf/text.rs +24 -416
  118. data/vendor/kreuzberg/src/plugins/extractor.rs +8 -40
  119. data/vendor/kreuzberg/src/plugins/mod.rs +0 -3
  120. data/vendor/kreuzberg/src/plugins/ocr.rs +14 -22
  121. data/vendor/kreuzberg/src/plugins/processor.rs +1 -10
  122. data/vendor/kreuzberg/src/plugins/registry.rs +0 -15
  123. data/vendor/kreuzberg/src/plugins/validator.rs +8 -20
  124. data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
  125. data/vendor/kreuzberg/src/text/mod.rs +0 -8
  126. data/vendor/kreuzberg/src/text/quality.rs +15 -28
  127. data/vendor/kreuzberg/src/text/string_utils.rs +10 -22
  128. data/vendor/kreuzberg/src/text/token_reduction/core.rs +50 -86
  129. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +16 -37
  130. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +1 -2
  131. data/vendor/kreuzberg/src/types.rs +67 -907
  132. data/vendor/kreuzberg/src/utils/mod.rs +0 -14
  133. data/vendor/kreuzberg/src/utils/quality.rs +3 -12
  134. data/vendor/kreuzberg/tests/api_tests.rs +0 -506
  135. data/vendor/kreuzberg/tests/archive_integration.rs +0 -2
  136. data/vendor/kreuzberg/tests/batch_orchestration.rs +12 -57
  137. data/vendor/kreuzberg/tests/batch_processing.rs +8 -32
  138. data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
  139. data/vendor/kreuzberg/tests/concurrency_stress.rs +8 -40
  140. data/vendor/kreuzberg/tests/config_features.rs +1 -33
  141. data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -16
  142. data/vendor/kreuzberg/tests/core_integration.rs +9 -35
  143. data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
  144. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
  145. data/vendor/kreuzberg/tests/email_integration.rs +1 -3
  146. data/vendor/kreuzberg/tests/error_handling.rs +34 -43
  147. data/vendor/kreuzberg/tests/format_integration.rs +1 -7
  148. data/vendor/kreuzberg/tests/helpers/mod.rs +0 -60
  149. data/vendor/kreuzberg/tests/image_integration.rs +0 -2
  150. data/vendor/kreuzberg/tests/mime_detection.rs +16 -17
  151. data/vendor/kreuzberg/tests/ocr_configuration.rs +0 -4
  152. data/vendor/kreuzberg/tests/ocr_errors.rs +0 -22
  153. data/vendor/kreuzberg/tests/ocr_quality.rs +0 -2
  154. data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
  155. data/vendor/kreuzberg/tests/pdf_integration.rs +0 -2
  156. data/vendor/kreuzberg/tests/pipeline_integration.rs +2 -36
  157. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +0 -5
  158. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -17
  159. data/vendor/kreuzberg/tests/plugin_system.rs +0 -6
  160. data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -2
  161. data/vendor/kreuzberg/tests/security_validation.rs +1 -13
  162. data/vendor/kreuzberg/tests/test_fastembed.rs +23 -45
  163. metadata +25 -171
  164. data/.rubocop.yml +0 -543
  165. data/ext/kreuzberg_rb/native/.cargo/config.toml +0 -23
  166. data/ext/kreuzberg_rb/native/Cargo.lock +0 -7619
  167. data/lib/kreuzberg/error_context.rb +0 -136
  168. data/lib/kreuzberg/types.rb +0 -170
  169. data/lib/libpdfium.so +0 -0
  170. data/spec/binding/async_operations_spec.rb +0 -473
  171. data/spec/binding/batch_operations_spec.rb +0 -595
  172. data/spec/binding/batch_spec.rb +0 -359
  173. data/spec/binding/config_result_spec.rb +0 -377
  174. data/spec/binding/embeddings_spec.rb +0 -816
  175. data/spec/binding/error_recovery_spec.rb +0 -488
  176. data/spec/binding/font_config_spec.rb +0 -220
  177. data/spec/binding/images_spec.rb +0 -738
  178. data/spec/binding/keywords_extraction_spec.rb +0 -600
  179. data/spec/binding/metadata_types_spec.rb +0 -1228
  180. data/spec/binding/pages_extraction_spec.rb +0 -471
  181. data/spec/binding/tables_spec.rb +0 -641
  182. data/spec/unit/config/chunking_config_spec.rb +0 -213
  183. data/spec/unit/config/embedding_config_spec.rb +0 -343
  184. data/spec/unit/config/extraction_config_spec.rb +0 -438
  185. data/spec/unit/config/font_config_spec.rb +0 -285
  186. data/spec/unit/config/hierarchy_config_spec.rb +0 -314
  187. data/spec/unit/config/image_extraction_config_spec.rb +0 -209
  188. data/spec/unit/config/image_preprocessing_config_spec.rb +0 -249
  189. data/spec/unit/config/keyword_config_spec.rb +0 -229
  190. data/spec/unit/config/language_detection_config_spec.rb +0 -258
  191. data/spec/unit/config/ocr_config_spec.rb +0 -171
  192. data/spec/unit/config/page_config_spec.rb +0 -221
  193. data/spec/unit/config/pdf_config_spec.rb +0 -267
  194. data/spec/unit/config/postprocessor_config_spec.rb +0 -290
  195. data/spec/unit/config/tesseract_config_spec.rb +0 -181
  196. data/spec/unit/config/token_reduction_config_spec.rb +0 -251
  197. data/test/metadata_types_test.rb +0 -959
  198. data/vendor/Cargo.toml +0 -61
  199. data/vendor/kreuzberg/examples/bench_fixes.rs +0 -71
  200. data/vendor/kreuzberg/examples/test_pdfium_fork.rs +0 -62
  201. data/vendor/kreuzberg/src/chunking/processor.rs +0 -219
  202. data/vendor/kreuzberg/src/core/batch_optimizations.rs +0 -385
  203. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  204. data/vendor/kreuzberg/src/core/formats.rs +0 -235
  205. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  206. data/vendor/kreuzberg/src/extraction/capacity.rs +0 -263
  207. data/vendor/kreuzberg/src/extraction/markdown.rs +0 -216
  208. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -284
  209. data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -470
  210. data/vendor/kreuzberg/src/extractors/docbook.rs +0 -504
  211. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  212. data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -492
  213. data/vendor/kreuzberg/src/extractors/jats.rs +0 -1054
  214. data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -368
  215. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  216. data/vendor/kreuzberg/src/extractors/markdown.rs +0 -701
  217. data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
  218. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  219. data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -529
  220. data/vendor/kreuzberg/src/extractors/rst.rs +0 -577
  221. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  222. data/vendor/kreuzberg/src/extractors/security.rs +0 -484
  223. data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
  224. data/vendor/kreuzberg/src/extractors/typst.rs +0 -651
  225. data/vendor/kreuzberg/src/language_detection/processor.rs +0 -218
  226. data/vendor/kreuzberg/src/ocr/language_registry.rs +0 -520
  227. data/vendor/kreuzberg/src/panic_context.rs +0 -154
  228. data/vendor/kreuzberg/src/pdf/bindings.rs +0 -306
  229. data/vendor/kreuzberg/src/pdf/bundled.rs +0 -408
  230. data/vendor/kreuzberg/src/pdf/fonts.rs +0 -358
  231. data/vendor/kreuzberg/src/pdf/hierarchy.rs +0 -903
  232. data/vendor/kreuzberg/src/text/quality_processor.rs +0 -231
  233. data/vendor/kreuzberg/src/text/utf8_validation.rs +0 -193
  234. data/vendor/kreuzberg/src/utils/pool.rs +0 -503
  235. data/vendor/kreuzberg/src/utils/pool_sizing.rs +0 -364
  236. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -761
  237. data/vendor/kreuzberg/tests/api_embed.rs +0 -360
  238. data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
  239. data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +0 -471
  240. data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +0 -289
  241. data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +0 -154
  242. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
  243. data/vendor/kreuzberg/tests/config_integration_test.rs +0 -753
  244. data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +0 -294
  245. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -500
  246. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
  247. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
  248. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
  249. data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
  250. data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
  251. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
  252. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
  253. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
  254. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
  255. data/vendor/kreuzberg/tests/ocr_language_registry.rs +0 -191
  256. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -674
  257. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
  258. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
  259. data/vendor/kreuzberg/tests/page_markers.rs +0 -297
  260. data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +0 -301
  261. data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +0 -589
  262. data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +0 -301
  263. data/vendor/kreuzberg/tests/pdf_text_merging.rs +0 -475
  264. data/vendor/kreuzberg/tests/pdfium_linking.rs +0 -340
  265. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -694
  266. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -775
  267. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1260
  268. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -648
  269. data/vendor/kreuzberg-ffi/Cargo.toml +0 -67
  270. data/vendor/kreuzberg-ffi/README.md +0 -851
  271. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +0 -227
  272. data/vendor/kreuzberg-ffi/build.rs +0 -168
  273. data/vendor/kreuzberg-ffi/cbindgen.toml +0 -37
  274. data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +0 -12
  275. data/vendor/kreuzberg-ffi/kreuzberg.h +0 -3012
  276. data/vendor/kreuzberg-ffi/src/batch_streaming.rs +0 -588
  277. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
  278. data/vendor/kreuzberg-ffi/src/error.rs +0 -901
  279. data/vendor/kreuzberg-ffi/src/extraction.rs +0 -555
  280. data/vendor/kreuzberg-ffi/src/helpers.rs +0 -879
  281. data/vendor/kreuzberg-ffi/src/lib.rs +0 -977
  282. data/vendor/kreuzberg-ffi/src/memory.rs +0 -493
  283. data/vendor/kreuzberg-ffi/src/mime.rs +0 -329
  284. data/vendor/kreuzberg-ffi/src/panic_shield.rs +0 -265
  285. data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +0 -442
  286. data/vendor/kreuzberg-ffi/src/plugins/mod.rs +0 -14
  287. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +0 -628
  288. data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +0 -438
  289. data/vendor/kreuzberg-ffi/src/plugins/validator.rs +0 -329
  290. data/vendor/kreuzberg-ffi/src/result.rs +0 -510
  291. data/vendor/kreuzberg-ffi/src/result_pool.rs +0 -639
  292. data/vendor/kreuzberg-ffi/src/result_view.rs +0 -773
  293. data/vendor/kreuzberg-ffi/src/string_intern.rs +0 -568
  294. data/vendor/kreuzberg-ffi/src/types.rs +0 -363
  295. data/vendor/kreuzberg-ffi/src/util.rs +0 -210
  296. data/vendor/kreuzberg-ffi/src/validation.rs +0 -848
  297. data/vendor/kreuzberg-ffi/tests.disabled/README.md +0 -48
  298. data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +0 -299
  299. data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +0 -346
  300. data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +0 -232
  301. data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +0 -470
  302. data/vendor/kreuzberg-tesseract/.commitlintrc.json +0 -13
  303. data/vendor/kreuzberg-tesseract/.crate-ignore +0 -2
  304. data/vendor/kreuzberg-tesseract/Cargo.lock +0 -2933
  305. data/vendor/kreuzberg-tesseract/Cargo.toml +0 -57
  306. data/vendor/kreuzberg-tesseract/LICENSE +0 -22
  307. data/vendor/kreuzberg-tesseract/README.md +0 -399
  308. data/vendor/kreuzberg-tesseract/build.rs +0 -1127
  309. data/vendor/kreuzberg-tesseract/patches/README.md +0 -71
  310. data/vendor/kreuzberg-tesseract/patches/tesseract.diff +0 -199
  311. data/vendor/kreuzberg-tesseract/src/api.rs +0 -1371
  312. data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +0 -77
  313. data/vendor/kreuzberg-tesseract/src/enums.rs +0 -297
  314. data/vendor/kreuzberg-tesseract/src/error.rs +0 -81
  315. data/vendor/kreuzberg-tesseract/src/lib.rs +0 -145
  316. data/vendor/kreuzberg-tesseract/src/monitor.rs +0 -57
  317. data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +0 -197
  318. data/vendor/kreuzberg-tesseract/src/page_iterator.rs +0 -253
  319. data/vendor/kreuzberg-tesseract/src/result_iterator.rs +0 -286
  320. data/vendor/kreuzberg-tesseract/src/result_renderer.rs +0 -183
  321. data/vendor/kreuzberg-tesseract/tests/integration_test.rs +0 -211
@@ -1,209 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- RSpec.describe Kreuzberg::Config::ImageExtraction do
4
- describe '#initialize' do
5
- it 'creates config with default values' do
6
- config = described_class.new
7
-
8
- expect(config.extract_images).to be true
9
- expect(config.target_dpi).to eq 300
10
- expect(config.max_image_dimension).to eq 2000
11
- expect(config.auto_adjust_dpi).to be true
12
- expect(config.min_dpi).to eq 150
13
- expect(config.max_dpi).to eq 600
14
- end
15
-
16
- it 'creates config with custom values' do
17
- config = described_class.new(
18
- extract_images: false,
19
- target_dpi: 600,
20
- max_image_dimension: 4000,
21
- auto_adjust_dpi: false,
22
- min_dpi: 100,
23
- max_dpi: 1200
24
- )
25
-
26
- expect(config.extract_images).to be false
27
- expect(config.target_dpi).to eq 600
28
- expect(config.max_image_dimension).to eq 4000
29
- expect(config.auto_adjust_dpi).to be false
30
- expect(config.min_dpi).to eq 100
31
- expect(config.max_dpi).to eq 1200
32
- end
33
-
34
- it 'converts values to integers' do
35
- config = described_class.new(
36
- target_dpi: '300',
37
- max_image_dimension: '2000',
38
- min_dpi: '150',
39
- max_dpi: '600'
40
- )
41
-
42
- expect(config.target_dpi).to eq 300
43
- expect(config.max_image_dimension).to eq 2000
44
- expect(config.min_dpi).to eq 150
45
- expect(config.max_dpi).to eq 600
46
- expect(config.target_dpi).to be_a Integer
47
- end
48
-
49
- it 'converts boolean values correctly' do
50
- config = described_class.new(
51
- extract_images: true,
52
- auto_adjust_dpi: false
53
- )
54
-
55
- expect(config.extract_images).to be true
56
- expect(config.auto_adjust_dpi).to be false
57
- end
58
- end
59
-
60
- describe '#to_h' do
61
- it 'serializes to hash with all values' do
62
- config = described_class.new(
63
- target_dpi: 300,
64
- max_image_dimension: 2000
65
- )
66
- hash = config.to_h
67
-
68
- expect(hash).to be_a Hash
69
- expect(hash[:extract_images]).to be true
70
- expect(hash[:target_dpi]).to eq 300
71
- expect(hash[:max_image_dimension]).to eq 2000
72
- expect(hash[:auto_adjust_dpi]).to be true
73
- expect(hash[:min_dpi]).to eq 150
74
- expect(hash[:max_dpi]).to eq 600
75
- end
76
-
77
- it 'always includes all keys in hash' do
78
- config = described_class.new
79
- hash = config.to_h
80
-
81
- expect(hash.keys).to contain_exactly(
82
- :extract_images,
83
- :target_dpi,
84
- :max_image_dimension,
85
- :auto_adjust_dpi,
86
- :min_dpi,
87
- :max_dpi
88
- )
89
- end
90
- end
91
-
92
- describe 'validation' do
93
- it 'accepts valid DPI values' do
94
- expect do
95
- described_class.new(target_dpi: 300, min_dpi: 150, max_dpi: 600)
96
- end.not_to raise_error
97
- end
98
-
99
- it 'accepts valid image dimensions' do
100
- expect do
101
- described_class.new(max_image_dimension: 4000)
102
- end.not_to raise_error
103
- end
104
-
105
- it 'converts float DPI to integer' do
106
- config = described_class.new(target_dpi: 300.5)
107
-
108
- expect(config.target_dpi).to eq 300
109
- expect(config.target_dpi).to be_a Integer
110
- end
111
- end
112
-
113
- describe 'keyword arguments' do
114
- it 'accepts all keyword arguments' do
115
- config = described_class.new(
116
- extract_images: true,
117
- target_dpi: 600,
118
- max_image_dimension: 3000,
119
- auto_adjust_dpi: true,
120
- min_dpi: 200,
121
- max_dpi: 800
122
- )
123
-
124
- expect(config.extract_images).to be true
125
- expect(config.target_dpi).to eq 600
126
- expect(config.max_image_dimension).to eq 3000
127
- expect(config.auto_adjust_dpi).to be true
128
- expect(config.min_dpi).to eq 200
129
- expect(config.max_dpi).to eq 800
130
- end
131
- end
132
-
133
- describe 'equality' do
134
- it 'compares configs by value' do
135
- config1 = described_class.new(target_dpi: 300, max_image_dimension: 2000)
136
- config2 = described_class.new(target_dpi: 300, max_image_dimension: 2000)
137
-
138
- expect(config1.target_dpi).to eq config2.target_dpi
139
- expect(config1.max_image_dimension).to eq config2.max_image_dimension
140
- end
141
-
142
- it 'detects differences in DPI' do
143
- config1 = described_class.new(target_dpi: 300)
144
- config2 = described_class.new(target_dpi: 600)
145
-
146
- expect(config1.target_dpi).not_to eq config2.target_dpi
147
- end
148
-
149
- it 'detects differences in extract_images' do
150
- config1 = described_class.new(extract_images: true)
151
- config2 = described_class.new(extract_images: false)
152
-
153
- expect(config1.extract_images).not_to eq config2.extract_images
154
- end
155
- end
156
-
157
- describe 'nested config integration' do
158
- it 'can be nested in Extraction config' do
159
- image_config = described_class.new(target_dpi: 600)
160
- extraction = Kreuzberg::Config::Extraction.new(image_extraction: image_config)
161
-
162
- expect(extraction.image_extraction).to be_a described_class
163
- expect(extraction.image_extraction.target_dpi).to eq 600
164
- end
165
-
166
- it 'accepts hash in Extraction config' do
167
- extraction = Kreuzberg::Config::Extraction.new(
168
- image_extraction: { target_dpi: 600, extract_images: true }
169
- )
170
-
171
- expect(extraction.image_extraction).to be_a described_class
172
- expect(extraction.image_extraction.target_dpi).to eq 600
173
- end
174
- end
175
-
176
- describe 'DPI range' do
177
- it 'allows realistic DPI values' do
178
- config = described_class.new(min_dpi: 150, max_dpi: 1200)
179
-
180
- expect(config.min_dpi).to eq 150
181
- expect(config.max_dpi).to eq 1200
182
- end
183
-
184
- it 'maintains DPI relationships' do
185
- config = described_class.new(
186
- target_dpi: 300,
187
- min_dpi: 100,
188
- max_dpi: 600
189
- )
190
-
191
- expect(config.min_dpi).to be <= config.target_dpi
192
- expect(config.target_dpi).to be <= config.max_dpi
193
- end
194
- end
195
-
196
- describe 'image dimension constraints' do
197
- it 'accepts large image dimensions' do
198
- config = described_class.new(max_image_dimension: 10_000)
199
-
200
- expect(config.max_image_dimension).to eq 10_000
201
- end
202
-
203
- it 'accepts small image dimensions' do
204
- config = described_class.new(max_image_dimension: 100)
205
-
206
- expect(config.max_image_dimension).to eq 100
207
- end
208
- end
209
- end
@@ -1,249 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- RSpec.describe Kreuzberg::Config::ImagePreprocessing do
4
- describe '#initialize' do
5
- it 'creates config with default values' do
6
- config = described_class.new
7
-
8
- expect(config.target_dpi).to eq 300
9
- expect(config.auto_rotate).to be true
10
- expect(config.deskew).to be true
11
- expect(config.denoise).to be false
12
- expect(config.contrast_enhance).to be true
13
- expect(config.binarization_method).to eq 'otsu'
14
- expect(config.invert_colors).to be false
15
- end
16
-
17
- it 'creates config with custom values' do
18
- config = described_class.new(
19
- target_dpi: 600,
20
- auto_rotate: false,
21
- deskew: false,
22
- denoise: true,
23
- contrast_enhance: false,
24
- binarization_method: 'sauvola',
25
- invert_colors: true
26
- )
27
-
28
- expect(config.target_dpi).to eq 600
29
- expect(config.auto_rotate).to be false
30
- expect(config.deskew).to be false
31
- expect(config.denoise).to be true
32
- expect(config.contrast_enhance).to be false
33
- expect(config.binarization_method).to eq 'sauvola'
34
- expect(config.invert_colors).to be true
35
- end
36
-
37
- it 'converts target_dpi to integer' do
38
- config = described_class.new(target_dpi: '300')
39
-
40
- expect(config.target_dpi).to eq 300
41
- expect(config.target_dpi).to be_a Integer
42
- end
43
-
44
- it 'converts binarization_method to string' do
45
- config = described_class.new(binarization_method: :niblack)
46
-
47
- expect(config.binarization_method).to eq 'niblack'
48
- expect(config.binarization_method).to be_a String
49
- end
50
- end
51
-
52
- describe '#to_h' do
53
- it 'serializes to hash with all values' do
54
- config = described_class.new(target_dpi: 300, denoise: true)
55
- hash = config.to_h
56
-
57
- expect(hash).to be_a Hash
58
- expect(hash[:target_dpi]).to eq 300
59
- expect(hash[:denoise]).to be true
60
- expect(hash[:auto_rotate]).to be true
61
- expect(hash[:binarization_method]).to eq 'otsu'
62
- end
63
-
64
- it 'always includes all keys in hash' do
65
- config = described_class.new
66
- hash = config.to_h
67
-
68
- expect(hash.keys).to contain_exactly(
69
- :target_dpi,
70
- :auto_rotate,
71
- :deskew,
72
- :denoise,
73
- :contrast_enhance,
74
- :binarization_method,
75
- :invert_colors
76
- )
77
- end
78
- end
79
-
80
- describe 'validation' do
81
- it 'rejects invalid binarization method' do
82
- expect do
83
- described_class.new(binarization_method: 'invalid_method')
84
- end.to raise_error ArgumentError, /Invalid binarization_method/
85
- end
86
-
87
- it 'accepts all valid binarization methods' do
88
- valid_methods = %w[otsu sauvola niblack wolf bradley adaptive]
89
-
90
- valid_methods.each do |method|
91
- expect do
92
- described_class.new(binarization_method: method)
93
- end.not_to raise_error
94
- end
95
- end
96
-
97
- it 'accepts binarization method as symbol' do
98
- expect do
99
- described_class.new(binarization_method: :sauvola)
100
- end.not_to raise_error
101
- end
102
- end
103
-
104
- describe 'keyword arguments' do
105
- it 'accepts all keyword arguments' do
106
- config = described_class.new(
107
- target_dpi: 600,
108
- auto_rotate: true,
109
- deskew: false,
110
- denoise: true,
111
- contrast_enhance: false,
112
- binarization_method: 'bradley',
113
- invert_colors: true
114
- )
115
-
116
- expect(config.target_dpi).to eq 600
117
- expect(config.auto_rotate).to be true
118
- expect(config.deskew).to be false
119
- expect(config.denoise).to be true
120
- expect(config.contrast_enhance).to be false
121
- expect(config.binarization_method).to eq 'bradley'
122
- expect(config.invert_colors).to be true
123
- end
124
- end
125
-
126
- describe 'equality' do
127
- it 'compares configs by value' do
128
- config1 = described_class.new(
129
- target_dpi: 300,
130
- binarization_method: 'otsu',
131
- denoise: true
132
- )
133
- config2 = described_class.new(
134
- target_dpi: 300,
135
- binarization_method: 'otsu',
136
- denoise: true
137
- )
138
-
139
- expect(config1.target_dpi).to eq config2.target_dpi
140
- expect(config1.binarization_method).to eq config2.binarization_method
141
- expect(config1.denoise).to eq config2.denoise
142
- end
143
-
144
- it 'detects differences in target_dpi' do
145
- config1 = described_class.new(target_dpi: 300)
146
- config2 = described_class.new(target_dpi: 600)
147
-
148
- expect(config1.target_dpi).not_to eq config2.target_dpi
149
- end
150
-
151
- it 'detects differences in binarization_method' do
152
- config1 = described_class.new(binarization_method: 'otsu')
153
- config2 = described_class.new(binarization_method: 'sauvola')
154
-
155
- expect(config1.binarization_method).not_to eq config2.binarization_method
156
- end
157
- end
158
-
159
- describe 'nested config integration' do
160
- it 'can be nested in Extraction config' do
161
- preprocessing = described_class.new(target_dpi: 600, denoise: true)
162
- extraction = Kreuzberg::Config::Extraction.new(image_preprocessing: preprocessing)
163
-
164
- expect(extraction.image_preprocessing).to be_a described_class
165
- expect(extraction.image_preprocessing.target_dpi).to eq 600
166
- expect(extraction.image_preprocessing.denoise).to be true
167
- end
168
-
169
- it 'accepts hash in Extraction config' do
170
- extraction = Kreuzberg::Config::Extraction.new(
171
- image_preprocessing: { target_dpi: 600, binarization_method: 'sauvola' }
172
- )
173
-
174
- expect(extraction.image_preprocessing).to be_a described_class
175
- expect(extraction.image_preprocessing.target_dpi).to eq 600
176
- expect(extraction.image_preprocessing.binarization_method).to eq 'sauvola'
177
- end
178
-
179
- it 'can be nested in Tesseract config' do
180
- preprocessing = described_class.new(denoise: true)
181
- tesseract = Kreuzberg::Config::Tesseract.new(preprocessing: preprocessing)
182
-
183
- expect(tesseract.options[:preprocessing]).to be_a described_class
184
- expect(tesseract.options[:preprocessing].denoise).to be true
185
- end
186
- end
187
-
188
- describe 'symbol vs string key handling' do
189
- it 'converts symbol binarization method to string' do
190
- config = described_class.new(binarization_method: :bradley)
191
-
192
- expect(config.binarization_method).to eq 'bradley'
193
- expect(config.binarization_method).to be_a String
194
- end
195
-
196
- it 'converts string target_dpi to integer' do
197
- config = described_class.new(target_dpi: '600')
198
-
199
- expect(config.target_dpi).to eq 600
200
- expect(config.target_dpi).to be_a Integer
201
- end
202
- end
203
-
204
- describe 'boolean conversion' do
205
- it 'converts truthy values to boolean' do
206
- config = described_class.new(
207
- auto_rotate: 1,
208
- deskew: 'yes',
209
- denoise: true
210
- )
211
-
212
- expect(config.auto_rotate).to be true
213
- expect(config.deskew).to be true
214
- expect(config.denoise).to be true
215
- end
216
-
217
- it 'converts false values to boolean' do
218
- config = described_class.new(
219
- auto_rotate: false,
220
- deskew: false,
221
- denoise: false
222
- )
223
-
224
- expect(config.auto_rotate).to be false
225
- expect(config.deskew).to be false
226
- expect(config.denoise).to be false
227
- end
228
- end
229
-
230
- describe 'DPI configuration' do
231
- it 'accepts realistic DPI values' do
232
- config = described_class.new(target_dpi: 300)
233
-
234
- expect(config.target_dpi).to eq 300
235
- end
236
-
237
- it 'accepts high DPI values' do
238
- config = described_class.new(target_dpi: 1200)
239
-
240
- expect(config.target_dpi).to eq 1200
241
- end
242
-
243
- it 'accepts low DPI values' do
244
- config = described_class.new(target_dpi: 72)
245
-
246
- expect(config.target_dpi).to eq 72
247
- end
248
- end
249
- end
@@ -1,229 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- RSpec.describe Kreuzberg::Config::Keywords do
4
- describe '#initialize' do
5
- it 'creates config with default values' do
6
- config = described_class.new
7
-
8
- expect(config.algorithm).to be_nil
9
- expect(config.max_keywords).to be_nil
10
- expect(config.min_score).to be_nil
11
- expect(config.ngram_range).to be_nil
12
- expect(config.language).to be_nil
13
- expect(config.yake_params).to be_nil
14
- expect(config.rake_params).to be_nil
15
- end
16
-
17
- it 'creates config with custom values' do
18
- config = described_class.new(
19
- algorithm: 'yake',
20
- max_keywords: 10,
21
- min_score: 0.5,
22
- ngram_range: [1, 3],
23
- language: 'en'
24
- )
25
-
26
- expect(config.algorithm).to eq 'yake'
27
- expect(config.max_keywords).to eq 10
28
- expect(config.min_score).to eq 0.5
29
- expect(config.ngram_range).to eq [1, 3]
30
- expect(config.language).to eq 'en'
31
- end
32
-
33
- it 'accepts yake_params as instance' do
34
- yake_params = Kreuzberg::Config::KeywordYakeParams.new(window_size: 3)
35
- config = described_class.new(yake_params: yake_params)
36
-
37
- expect(config.yake_params).to be_a Kreuzberg::Config::KeywordYakeParams
38
- expect(config.yake_params.window_size).to eq 3
39
- end
40
-
41
- it 'converts yake_params hash to instance' do
42
- config = described_class.new(yake_params: { window_size: 2 })
43
-
44
- expect(config.yake_params).to be_a Kreuzberg::Config::KeywordYakeParams
45
- expect(config.yake_params.window_size).to eq 2
46
- end
47
-
48
- it 'accepts rake_params as instance' do
49
- rake_params = Kreuzberg::Config::KeywordRakeParams.new(min_word_length: 3)
50
- config = described_class.new(rake_params: rake_params)
51
-
52
- expect(config.rake_params).to be_a Kreuzberg::Config::KeywordRakeParams
53
- end
54
-
55
- it 'converts rake_params hash to instance' do
56
- config = described_class.new(rake_params: { min_word_length: 2 })
57
-
58
- expect(config.rake_params).to be_a Kreuzberg::Config::KeywordRakeParams
59
- expect(config.rake_params.min_word_length).to eq 2
60
- end
61
- end
62
-
63
- describe '#to_h' do
64
- it 'serializes to hash' do
65
- config = described_class.new(algorithm: 'yake', max_keywords: 10)
66
- hash = config.to_h
67
-
68
- expect(hash).to be_a Hash
69
- expect(hash[:algorithm]).to eq 'yake'
70
- expect(hash[:max_keywords]).to eq 10
71
- end
72
-
73
- it 'includes nested params in hash' do
74
- config = described_class.new(
75
- algorithm: 'yake',
76
- yake_params: { window_size: 3 }
77
- )
78
- hash = config.to_h
79
-
80
- expect(hash[:yake_params]).to be_a Hash
81
- expect(hash[:yake_params][:window_size]).to eq 3
82
- end
83
-
84
- it 'compacts nil values from hash' do
85
- config = described_class.new(algorithm: 'rake')
86
- hash = config.to_h
87
-
88
- expect(hash.key?(:max_keywords)).to be false
89
- expect(hash.key?(:yake_params)).to be false
90
- end
91
- end
92
-
93
- describe 'validation' do
94
- it 'accepts valid algorithm names' do
95
- expect do
96
- described_class.new(algorithm: 'yake')
97
- end.not_to raise_error
98
- end
99
-
100
- it 'accepts valid max_keywords' do
101
- expect do
102
- described_class.new(max_keywords: 20)
103
- end.not_to raise_error
104
- end
105
-
106
- it 'raises error for invalid yake_params type' do
107
- expect do
108
- described_class.new(yake_params: 'invalid')
109
- end.to raise_error ArgumentError, /Expected.*KeywordYakeParams.*Hash.*nil/
110
- end
111
-
112
- it 'raises error for invalid rake_params type' do
113
- expect do
114
- described_class.new(rake_params: 'invalid')
115
- end.to raise_error ArgumentError, /Expected.*KeywordRakeParams.*Hash.*nil/
116
- end
117
- end
118
-
119
- describe 'keyword arguments' do
120
- it 'accepts all keyword arguments' do
121
- config = described_class.new(
122
- algorithm: 'yake',
123
- max_keywords: 15,
124
- min_score: 0.7,
125
- ngram_range: [1, 2],
126
- language: 'fr',
127
- yake_params: { window_size: 3 }
128
- )
129
-
130
- expect(config.algorithm).to eq 'yake'
131
- expect(config.max_keywords).to eq 15
132
- expect(config.min_score).to eq 0.7
133
- expect(config.ngram_range).to eq [1, 2]
134
- expect(config.language).to eq 'fr'
135
- expect(config.yake_params).to be_a Kreuzberg::Config::KeywordYakeParams
136
- end
137
- end
138
-
139
- describe 'equality' do
140
- it 'compares configs by value' do
141
- config1 = described_class.new(algorithm: 'yake', max_keywords: 10)
142
- config2 = described_class.new(algorithm: 'yake', max_keywords: 10)
143
-
144
- expect(config1.algorithm).to eq config2.algorithm
145
- expect(config1.max_keywords).to eq config2.max_keywords
146
- end
147
-
148
- it 'detects differences in algorithm' do
149
- config1 = described_class.new(algorithm: 'yake')
150
- config2 = described_class.new(algorithm: 'rake')
151
-
152
- expect(config1.algorithm).not_to eq config2.algorithm
153
- end
154
-
155
- it 'detects differences in max_keywords' do
156
- config1 = described_class.new(max_keywords: 10)
157
- config2 = described_class.new(max_keywords: 20)
158
-
159
- expect(config1.max_keywords).not_to eq config2.max_keywords
160
- end
161
- end
162
-
163
- describe 'nested config integration' do
164
- it 'can be nested in Extraction config' do
165
- keywords = described_class.new(algorithm: 'yake', max_keywords: 15)
166
- extraction = Kreuzberg::Config::Extraction.new(keywords: keywords)
167
-
168
- expect(extraction.keywords).to be_a described_class
169
- expect(extraction.keywords.algorithm).to eq 'yake'
170
- expect(extraction.keywords.max_keywords).to eq 15
171
- end
172
-
173
- it 'accepts hash in Extraction config' do
174
- extraction = Kreuzberg::Config::Extraction.new(
175
- keywords: { algorithm: 'rake', max_keywords: 10 }
176
- )
177
-
178
- expect(extraction.keywords).to be_a described_class
179
- expect(extraction.keywords.algorithm).to eq 'rake'
180
- expect(extraction.keywords.max_keywords).to eq 10
181
- end
182
- end
183
-
184
- describe 'symbol vs string key handling' do
185
- it 'converts symbol algorithm to string' do
186
- config = described_class.new(algorithm: :yake)
187
-
188
- expect(config.algorithm).to eq 'yake'
189
- expect(config.algorithm).to be_a String
190
- end
191
-
192
- it 'converts symbol language to string' do
193
- config = described_class.new(language: :eng)
194
-
195
- expect(config.language).to eq 'eng'
196
- expect(config.language).to be_a String
197
- end
198
-
199
- it 'converts ngram_range values to integers' do
200
- config = described_class.new(ngram_range: %w[1 3])
201
-
202
- expect(config.ngram_range).to eq [1, 3]
203
- expect(config.ngram_range.all?(Integer)).to be true
204
- end
205
- end
206
-
207
- describe 'parameter conversions' do
208
- it 'converts max_keywords to integer' do
209
- config = described_class.new(max_keywords: '20')
210
-
211
- expect(config.max_keywords).to eq 20
212
- expect(config.max_keywords).to be_a Integer
213
- end
214
-
215
- it 'converts min_score to float' do
216
- config = described_class.new(min_score: '0.75')
217
-
218
- expect(config.min_score).to eq 0.75
219
- expect(config.min_score).to be_a Float
220
- end
221
-
222
- it 'converts ngram_range to array of integers' do
223
- config = described_class.new(ngram_range: [1, 2])
224
-
225
- expect(config.ngram_range).to eq [1, 2]
226
- expect(config.ngram_range.all?(Integer)).to be true
227
- end
228
- end
229
- end