kreuzberg 4.0.0.rc2 → 4.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (446)
  1. checksums.yaml +4 -4
  2. data/.gitignore +14 -14
  3. data/.rspec +3 -3
  4. data/.rubocop.yaml +1 -1
  5. data/.rubocop.yml +543 -538
  6. data/Gemfile +8 -8
  7. data/Gemfile.lock +194 -6
  8. data/README.md +396 -426
  9. data/Rakefile +34 -25
  10. data/Steepfile +51 -47
  11. data/examples/async_patterns.rb +283 -341
  12. data/ext/kreuzberg_rb/extconf.rb +65 -45
  13. data/ext/kreuzberg_rb/native/.cargo/config.toml +23 -0
  14. data/ext/kreuzberg_rb/native/Cargo.lock +7619 -6535
  15. data/ext/kreuzberg_rb/native/Cargo.toml +75 -44
  16. data/ext/kreuzberg_rb/native/README.md +425 -425
  17. data/ext/kreuzberg_rb/native/build.rs +15 -15
  18. data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -11
  19. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -14
  20. data/ext/kreuzberg_rb/native/include/strings.h +20 -20
  21. data/ext/kreuzberg_rb/native/include/unistd.h +47 -47
  22. data/ext/kreuzberg_rb/native/src/lib.rs +3802 -2998
  23. data/extconf.rb +60 -28
  24. data/kreuzberg.gemspec +199 -148
  25. data/lib/kreuzberg/api_proxy.rb +126 -142
  26. data/lib/kreuzberg/cache_api.rb +67 -46
  27. data/lib/kreuzberg/cli.rb +47 -55
  28. data/lib/kreuzberg/cli_proxy.rb +117 -127
  29. data/lib/kreuzberg/config.rb +936 -691
  30. data/lib/kreuzberg/error_context.rb +136 -32
  31. data/lib/kreuzberg/errors.rb +116 -118
  32. data/lib/kreuzberg/extraction_api.rb +313 -85
  33. data/lib/kreuzberg/mcp_proxy.rb +177 -186
  34. data/lib/kreuzberg/ocr_backend_protocol.rb +40 -113
  35. data/lib/kreuzberg/post_processor_protocol.rb +15 -86
  36. data/lib/kreuzberg/result.rb +334 -216
  37. data/lib/kreuzberg/setup_lib_path.rb +99 -80
  38. data/lib/kreuzberg/types.rb +170 -0
  39. data/lib/kreuzberg/validator_protocol.rb +16 -89
  40. data/lib/kreuzberg/version.rb +5 -5
  41. data/lib/kreuzberg.rb +96 -103
  42. data/lib/libpdfium.so +0 -0
  43. data/sig/kreuzberg/internal.rbs +184 -184
  44. data/sig/kreuzberg.rbs +561 -520
  45. data/spec/binding/async_operations_spec.rb +473 -0
  46. data/spec/binding/batch_operations_spec.rb +595 -0
  47. data/spec/binding/batch_spec.rb +359 -0
  48. data/spec/binding/cache_spec.rb +227 -227
  49. data/spec/binding/cli_proxy_spec.rb +85 -85
  50. data/spec/binding/cli_spec.rb +55 -55
  51. data/spec/binding/config_result_spec.rb +377 -0
  52. data/spec/binding/config_spec.rb +419 -345
  53. data/spec/binding/config_validation_spec.rb +377 -283
  54. data/spec/binding/embeddings_spec.rb +816 -0
  55. data/spec/binding/error_handling_spec.rb +399 -213
  56. data/spec/binding/error_recovery_spec.rb +488 -0
  57. data/spec/binding/errors_spec.rb +66 -66
  58. data/spec/binding/font_config_spec.rb +220 -0
  59. data/spec/binding/images_spec.rb +738 -0
  60. data/spec/binding/keywords_extraction_spec.rb +600 -0
  61. data/spec/binding/metadata_types_spec.rb +1228 -0
  62. data/spec/binding/pages_extraction_spec.rb +471 -0
  63. data/spec/binding/plugins/ocr_backend_spec.rb +307 -307
  64. data/spec/binding/plugins/postprocessor_spec.rb +269 -269
  65. data/spec/binding/plugins/validator_spec.rb +273 -274
  66. data/spec/binding/tables_spec.rb +641 -0
  67. data/spec/fixtures/config.toml +38 -39
  68. data/spec/fixtures/config.yaml +41 -41
  69. data/spec/fixtures/invalid_config.toml +3 -4
  70. data/spec/smoke/package_spec.rb +177 -178
  71. data/spec/spec_helper.rb +40 -42
  72. data/spec/unit/config/chunking_config_spec.rb +213 -0
  73. data/spec/unit/config/embedding_config_spec.rb +343 -0
  74. data/spec/unit/config/extraction_config_spec.rb +438 -0
  75. data/spec/unit/config/font_config_spec.rb +285 -0
  76. data/spec/unit/config/hierarchy_config_spec.rb +314 -0
  77. data/spec/unit/config/image_extraction_config_spec.rb +209 -0
  78. data/spec/unit/config/image_preprocessing_config_spec.rb +249 -0
  79. data/spec/unit/config/keyword_config_spec.rb +229 -0
  80. data/spec/unit/config/language_detection_config_spec.rb +258 -0
  81. data/spec/unit/config/ocr_config_spec.rb +171 -0
  82. data/spec/unit/config/page_config_spec.rb +221 -0
  83. data/spec/unit/config/pdf_config_spec.rb +267 -0
  84. data/spec/unit/config/postprocessor_config_spec.rb +290 -0
  85. data/spec/unit/config/tesseract_config_spec.rb +181 -0
  86. data/spec/unit/config/token_reduction_config_spec.rb +251 -0
  87. data/test/metadata_types_test.rb +959 -0
  88. data/vendor/Cargo.toml +61 -0
  89. data/vendor/kreuzberg/Cargo.toml +259 -204
  90. data/vendor/kreuzberg/README.md +263 -175
  91. data/vendor/kreuzberg/build.rs +782 -474
  92. data/vendor/kreuzberg/examples/bench_fixes.rs +71 -0
  93. data/vendor/kreuzberg/examples/test_pdfium_fork.rs +62 -0
  94. data/vendor/kreuzberg/src/api/error.rs +81 -81
  95. data/vendor/kreuzberg/src/api/handlers.rs +320 -199
  96. data/vendor/kreuzberg/src/api/mod.rs +94 -79
  97. data/vendor/kreuzberg/src/api/server.rs +518 -353
  98. data/vendor/kreuzberg/src/api/types.rs +206 -170
  99. data/vendor/kreuzberg/src/cache/mod.rs +1167 -1167
  100. data/vendor/kreuzberg/src/chunking/mod.rs +2303 -677
  101. data/vendor/kreuzberg/src/chunking/processor.rs +219 -0
  102. data/vendor/kreuzberg/src/core/batch_mode.rs +95 -95
  103. data/vendor/kreuzberg/src/core/batch_optimizations.rs +385 -0
  104. data/vendor/kreuzberg/src/core/config.rs +1914 -1032
  105. data/vendor/kreuzberg/src/core/config_validation.rs +949 -0
  106. data/vendor/kreuzberg/src/core/extractor.rs +1200 -1024
  107. data/vendor/kreuzberg/src/core/formats.rs +235 -0
  108. data/vendor/kreuzberg/src/core/io.rs +329 -329
  109. data/vendor/kreuzberg/src/core/mime.rs +605 -605
  110. data/vendor/kreuzberg/src/core/mod.rs +61 -45
  111. data/vendor/kreuzberg/src/core/pipeline.rs +1223 -984
  112. data/vendor/kreuzberg/src/core/server_config.rs +1220 -0
  113. data/vendor/kreuzberg/src/embeddings.rs +471 -432
  114. data/vendor/kreuzberg/src/error.rs +431 -431
  115. data/vendor/kreuzberg/src/extraction/archive.rs +959 -954
  116. data/vendor/kreuzberg/src/extraction/capacity.rs +263 -0
  117. data/vendor/kreuzberg/src/extraction/docx.rs +404 -40
  118. data/vendor/kreuzberg/src/extraction/email.rs +855 -854
  119. data/vendor/kreuzberg/src/extraction/excel.rs +697 -688
  120. data/vendor/kreuzberg/src/extraction/html.rs +1830 -553
  121. data/vendor/kreuzberg/src/extraction/image.rs +492 -368
  122. data/vendor/kreuzberg/src/extraction/libreoffice.rs +574 -563
  123. data/vendor/kreuzberg/src/extraction/markdown.rs +216 -213
  124. data/vendor/kreuzberg/src/extraction/mod.rs +93 -81
  125. data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -398
  126. data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -247
  127. data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -240
  128. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -130
  129. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +284 -287
  130. data/vendor/kreuzberg/src/extraction/pptx.rs +3102 -3000
  131. data/vendor/kreuzberg/src/extraction/structured.rs +491 -490
  132. data/vendor/kreuzberg/src/extraction/table.rs +329 -328
  133. data/vendor/kreuzberg/src/extraction/text.rs +277 -269
  134. data/vendor/kreuzberg/src/extraction/xml.rs +333 -333
  135. data/vendor/kreuzberg/src/extractors/archive.rs +447 -446
  136. data/vendor/kreuzberg/src/extractors/bibtex.rs +470 -469
  137. data/vendor/kreuzberg/src/extractors/docbook.rs +504 -502
  138. data/vendor/kreuzberg/src/extractors/docx.rs +400 -367
  139. data/vendor/kreuzberg/src/extractors/email.rs +157 -143
  140. data/vendor/kreuzberg/src/extractors/epub.rs +696 -707
  141. data/vendor/kreuzberg/src/extractors/excel.rs +385 -343
  142. data/vendor/kreuzberg/src/extractors/fictionbook.rs +492 -491
  143. data/vendor/kreuzberg/src/extractors/html.rs +419 -393
  144. data/vendor/kreuzberg/src/extractors/image.rs +219 -198
  145. data/vendor/kreuzberg/src/extractors/jats.rs +1054 -1051
  146. data/vendor/kreuzberg/src/extractors/jupyter.rs +368 -367
  147. data/vendor/kreuzberg/src/extractors/latex.rs +653 -652
  148. data/vendor/kreuzberg/src/extractors/markdown.rs +701 -700
  149. data/vendor/kreuzberg/src/extractors/mod.rs +429 -365
  150. data/vendor/kreuzberg/src/extractors/odt.rs +628 -628
  151. data/vendor/kreuzberg/src/extractors/opml.rs +635 -634
  152. data/vendor/kreuzberg/src/extractors/orgmode.rs +529 -528
  153. data/vendor/kreuzberg/src/extractors/pdf.rs +761 -493
  154. data/vendor/kreuzberg/src/extractors/pptx.rs +279 -248
  155. data/vendor/kreuzberg/src/extractors/rst.rs +577 -576
  156. data/vendor/kreuzberg/src/extractors/rtf.rs +809 -810
  157. data/vendor/kreuzberg/src/extractors/security.rs +484 -484
  158. data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -367
  159. data/vendor/kreuzberg/src/extractors/structured.rs +142 -140
  160. data/vendor/kreuzberg/src/extractors/text.rs +265 -260
  161. data/vendor/kreuzberg/src/extractors/typst.rs +651 -650
  162. data/vendor/kreuzberg/src/extractors/xml.rs +147 -135
  163. data/vendor/kreuzberg/src/image/dpi.rs +164 -164
  164. data/vendor/kreuzberg/src/image/mod.rs +6 -6
  165. data/vendor/kreuzberg/src/image/preprocessing.rs +417 -417
  166. data/vendor/kreuzberg/src/image/resize.rs +89 -89
  167. data/vendor/kreuzberg/src/keywords/config.rs +154 -154
  168. data/vendor/kreuzberg/src/keywords/mod.rs +237 -237
  169. data/vendor/kreuzberg/src/keywords/processor.rs +275 -267
  170. data/vendor/kreuzberg/src/keywords/rake.rs +293 -293
  171. data/vendor/kreuzberg/src/keywords/types.rs +68 -68
  172. data/vendor/kreuzberg/src/keywords/yake.rs +163 -163
  173. data/vendor/kreuzberg/src/language_detection/mod.rs +985 -942
  174. data/vendor/kreuzberg/src/language_detection/processor.rs +218 -0
  175. data/vendor/kreuzberg/src/lib.rs +114 -105
  176. data/vendor/kreuzberg/src/mcp/mod.rs +35 -32
  177. data/vendor/kreuzberg/src/mcp/server.rs +2090 -1968
  178. data/vendor/kreuzberg/src/ocr/cache.rs +469 -469
  179. data/vendor/kreuzberg/src/ocr/error.rs +37 -37
  180. data/vendor/kreuzberg/src/ocr/hocr.rs +216 -216
  181. data/vendor/kreuzberg/src/ocr/language_registry.rs +520 -0
  182. data/vendor/kreuzberg/src/ocr/mod.rs +60 -58
  183. data/vendor/kreuzberg/src/ocr/processor.rs +858 -863
  184. data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -4
  185. data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -144
  186. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +456 -450
  187. data/vendor/kreuzberg/src/ocr/types.rs +393 -393
  188. data/vendor/kreuzberg/src/ocr/utils.rs +47 -47
  189. data/vendor/kreuzberg/src/ocr/validation.rs +206 -206
  190. data/vendor/kreuzberg/src/panic_context.rs +154 -154
  191. data/vendor/kreuzberg/src/pdf/bindings.rs +306 -0
  192. data/vendor/kreuzberg/src/pdf/bundled.rs +408 -0
  193. data/vendor/kreuzberg/src/pdf/error.rs +214 -122
  194. data/vendor/kreuzberg/src/pdf/fonts.rs +358 -0
  195. data/vendor/kreuzberg/src/pdf/hierarchy.rs +903 -0
  196. data/vendor/kreuzberg/src/pdf/images.rs +139 -139
  197. data/vendor/kreuzberg/src/pdf/metadata.rs +509 -346
  198. data/vendor/kreuzberg/src/pdf/mod.rs +81 -50
  199. data/vendor/kreuzberg/src/pdf/rendering.rs +369 -369
  200. data/vendor/kreuzberg/src/pdf/table.rs +417 -393
  201. data/vendor/kreuzberg/src/pdf/text.rs +553 -158
  202. data/vendor/kreuzberg/src/plugins/extractor.rs +1042 -1013
  203. data/vendor/kreuzberg/src/plugins/mod.rs +212 -209
  204. data/vendor/kreuzberg/src/plugins/ocr.rs +637 -620
  205. data/vendor/kreuzberg/src/plugins/processor.rs +650 -642
  206. data/vendor/kreuzberg/src/plugins/registry.rs +1339 -1337
  207. data/vendor/kreuzberg/src/plugins/traits.rs +258 -258
  208. data/vendor/kreuzberg/src/plugins/validator.rs +967 -956
  209. data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -1470
  210. data/vendor/kreuzberg/src/text/mod.rs +27 -19
  211. data/vendor/kreuzberg/src/text/quality.rs +710 -697
  212. data/vendor/kreuzberg/src/text/quality_processor.rs +231 -0
  213. data/vendor/kreuzberg/src/text/string_utils.rs +229 -217
  214. data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -164
  215. data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -100
  216. data/vendor/kreuzberg/src/text/token_reduction/core.rs +832 -796
  217. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +923 -902
  218. data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -160
  219. data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -619
  220. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +148 -147
  221. data/vendor/kreuzberg/src/text/utf8_validation.rs +193 -0
  222. data/vendor/kreuzberg/src/types.rs +1713 -903
  223. data/vendor/kreuzberg/src/utils/mod.rs +31 -17
  224. data/vendor/kreuzberg/src/utils/pool.rs +503 -0
  225. data/vendor/kreuzberg/src/utils/pool_sizing.rs +364 -0
  226. data/vendor/kreuzberg/src/utils/quality.rs +968 -959
  227. data/vendor/kreuzberg/src/utils/string_pool.rs +761 -0
  228. data/vendor/kreuzberg/src/utils/string_utils.rs +381 -381
  229. data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -53
  230. data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -482
  231. data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -261
  232. data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -400
  233. data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -1205
  234. data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -280
  235. data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -425
  236. data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -172
  237. data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -622
  238. data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -849
  239. data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -1300
  240. data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -175
  241. data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -734
  242. data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -37
  243. data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -100
  244. data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -801
  245. data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -849
  246. data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -693
  247. data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -111
  248. data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -162
  249. data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -226
  250. data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -41
  251. data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -196
  252. data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -227
  253. data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -181
  254. data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -791
  255. data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -47
  256. data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -760
  257. data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -634
  258. data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -136
  259. data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -84
  260. data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -681
  261. data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -64
  262. data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -51
  263. data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -476
  264. data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -163
  265. data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -1
  266. data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -101
  267. data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -477
  268. data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -490
  269. data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -415
  270. data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -223
  271. data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -331
  272. data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -562
  273. data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -436
  274. data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -561
  275. data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -193
  276. data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -420
  277. data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -448
  278. data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -32
  279. data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -33
  280. data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -420
  281. data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -76
  282. data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -129
  283. data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -54
  284. data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -118
  285. data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -149
  286. data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -506
  287. data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -75
  288. data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -519
  289. data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -647
  290. data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -62
  291. data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -796
  292. data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -31
  293. data/vendor/kreuzberg/tests/api_embed.rs +360 -0
  294. data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -52
  295. data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +471 -0
  296. data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +289 -0
  297. data/vendor/kreuzberg/tests/api_tests.rs +1472 -966
  298. data/vendor/kreuzberg/tests/archive_integration.rs +545 -543
  299. data/vendor/kreuzberg/tests/batch_orchestration.rs +587 -556
  300. data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +154 -0
  301. data/vendor/kreuzberg/tests/batch_processing.rs +328 -316
  302. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -421
  303. data/vendor/kreuzberg/tests/concurrency_stress.rs +541 -525
  304. data/vendor/kreuzberg/tests/config_features.rs +612 -598
  305. data/vendor/kreuzberg/tests/config_integration_test.rs +753 -0
  306. data/vendor/kreuzberg/tests/config_loading_tests.rs +416 -415
  307. data/vendor/kreuzberg/tests/core_integration.rs +519 -510
  308. data/vendor/kreuzberg/tests/csv_integration.rs +414 -414
  309. data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +294 -0
  310. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +500 -498
  311. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -122
  312. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -370
  313. data/vendor/kreuzberg/tests/email_integration.rs +327 -325
  314. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -275
  315. data/vendor/kreuzberg/tests/error_handling.rs +402 -393
  316. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -228
  317. data/vendor/kreuzberg/tests/format_integration.rs +165 -159
  318. data/vendor/kreuzberg/tests/helpers/mod.rs +202 -142
  319. data/vendor/kreuzberg/tests/html_table_test.rs +551 -551
  320. data/vendor/kreuzberg/tests/image_integration.rs +255 -253
  321. data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -139
  322. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -639
  323. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -704
  324. data/vendor/kreuzberg/tests/keywords_integration.rs +479 -479
  325. data/vendor/kreuzberg/tests/keywords_quality.rs +509 -509
  326. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -496
  327. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -490
  328. data/vendor/kreuzberg/tests/mime_detection.rs +429 -428
  329. data/vendor/kreuzberg/tests/ocr_configuration.rs +514 -510
  330. data/vendor/kreuzberg/tests/ocr_errors.rs +698 -676
  331. data/vendor/kreuzberg/tests/ocr_language_registry.rs +191 -0
  332. data/vendor/kreuzberg/tests/ocr_quality.rs +629 -627
  333. data/vendor/kreuzberg/tests/ocr_stress.rs +469 -469
  334. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +674 -695
  335. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -616
  336. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -822
  337. data/vendor/kreuzberg/tests/page_markers.rs +297 -0
  338. data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +301 -0
  339. data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +589 -0
  340. data/vendor/kreuzberg/tests/pdf_integration.rs +45 -43
  341. data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +301 -0
  342. data/vendor/kreuzberg/tests/pdf_text_merging.rs +475 -0
  343. data/vendor/kreuzberg/tests/pdfium_linking.rs +340 -0
  344. data/vendor/kreuzberg/tests/pipeline_integration.rs +1446 -1411
  345. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +776 -771
  346. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +577 -560
  347. data/vendor/kreuzberg/tests/plugin_system.rs +927 -921
  348. data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -783
  349. data/vendor/kreuzberg/tests/registry_integration_tests.rs +587 -586
  350. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +694 -692
  351. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +775 -776
  352. data/vendor/kreuzberg/tests/security_validation.rs +416 -415
  353. data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -888
  354. data/vendor/kreuzberg/tests/test_fastembed.rs +631 -609
  355. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1260 -1259
  356. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +648 -647
  357. data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -87
  358. data/vendor/kreuzberg-ffi/Cargo.toml +67 -0
  359. data/vendor/kreuzberg-ffi/README.md +851 -0
  360. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +227 -0
  361. data/vendor/kreuzberg-ffi/build.rs +168 -0
  362. data/vendor/kreuzberg-ffi/cbindgen.toml +37 -0
  363. data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +12 -0
  364. data/vendor/kreuzberg-ffi/kreuzberg.h +3012 -0
  365. data/vendor/kreuzberg-ffi/src/batch_streaming.rs +588 -0
  366. data/vendor/kreuzberg-ffi/src/config.rs +1341 -0
  367. data/vendor/kreuzberg-ffi/src/error.rs +901 -0
  368. data/vendor/kreuzberg-ffi/src/extraction.rs +555 -0
  369. data/vendor/kreuzberg-ffi/src/helpers.rs +879 -0
  370. data/vendor/kreuzberg-ffi/src/lib.rs +977 -0
  371. data/vendor/kreuzberg-ffi/src/memory.rs +493 -0
  372. data/vendor/kreuzberg-ffi/src/mime.rs +329 -0
  373. data/vendor/kreuzberg-ffi/src/panic_shield.rs +265 -0
  374. data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +442 -0
  375. data/vendor/kreuzberg-ffi/src/plugins/mod.rs +14 -0
  376. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +628 -0
  377. data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +438 -0
  378. data/vendor/kreuzberg-ffi/src/plugins/validator.rs +329 -0
  379. data/vendor/kreuzberg-ffi/src/result.rs +510 -0
  380. data/vendor/kreuzberg-ffi/src/result_pool.rs +639 -0
  381. data/vendor/kreuzberg-ffi/src/result_view.rs +773 -0
  382. data/vendor/kreuzberg-ffi/src/string_intern.rs +568 -0
  383. data/vendor/kreuzberg-ffi/src/types.rs +363 -0
  384. data/vendor/kreuzberg-ffi/src/util.rs +210 -0
  385. data/vendor/kreuzberg-ffi/src/validation.rs +848 -0
  386. data/vendor/kreuzberg-ffi/tests.disabled/README.md +48 -0
  387. data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +299 -0
  388. data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +346 -0
  389. data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +232 -0
  390. data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +470 -0
  391. data/vendor/kreuzberg-tesseract/.commitlintrc.json +13 -0
  392. data/vendor/kreuzberg-tesseract/.crate-ignore +2 -0
  393. data/vendor/kreuzberg-tesseract/Cargo.lock +2933 -0
  394. data/vendor/kreuzberg-tesseract/Cargo.toml +57 -0
  395. data/vendor/{rb-sys/LICENSE-MIT → kreuzberg-tesseract/LICENSE} +22 -21
  396. data/vendor/kreuzberg-tesseract/README.md +399 -0
  397. data/vendor/kreuzberg-tesseract/build.rs +1127 -0
  398. data/vendor/kreuzberg-tesseract/patches/README.md +71 -0
  399. data/vendor/kreuzberg-tesseract/patches/tesseract.diff +199 -0
  400. data/vendor/kreuzberg-tesseract/src/api.rs +1371 -0
  401. data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +77 -0
  402. data/vendor/kreuzberg-tesseract/src/enums.rs +297 -0
  403. data/vendor/kreuzberg-tesseract/src/error.rs +81 -0
  404. data/vendor/kreuzberg-tesseract/src/lib.rs +145 -0
  405. data/vendor/kreuzberg-tesseract/src/monitor.rs +57 -0
  406. data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +197 -0
  407. data/vendor/kreuzberg-tesseract/src/page_iterator.rs +253 -0
  408. data/vendor/kreuzberg-tesseract/src/result_iterator.rs +286 -0
  409. data/vendor/kreuzberg-tesseract/src/result_renderer.rs +183 -0
  410. data/vendor/kreuzberg-tesseract/tests/integration_test.rs +211 -0
  411. metadata +196 -45
  412. data/vendor/kreuzberg/benches/otel_overhead.rs +0 -48
  413. data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +0 -738
  414. data/vendor/rb-sys/.cargo-ok +0 -1
  415. data/vendor/rb-sys/.cargo_vcs_info.json +0 -6
  416. data/vendor/rb-sys/Cargo.lock +0 -393
  417. data/vendor/rb-sys/Cargo.toml +0 -70
  418. data/vendor/rb-sys/Cargo.toml.orig +0 -57
  419. data/vendor/rb-sys/LICENSE-APACHE +0 -190
  420. data/vendor/rb-sys/bin/release.sh +0 -21
  421. data/vendor/rb-sys/build/features.rs +0 -108
  422. data/vendor/rb-sys/build/main.rs +0 -246
  423. data/vendor/rb-sys/build/stable_api_config.rs +0 -153
  424. data/vendor/rb-sys/build/version.rs +0 -48
  425. data/vendor/rb-sys/readme.md +0 -36
  426. data/vendor/rb-sys/src/bindings.rs +0 -21
  427. data/vendor/rb-sys/src/hidden.rs +0 -11
  428. data/vendor/rb-sys/src/lib.rs +0 -34
  429. data/vendor/rb-sys/src/macros.rs +0 -371
  430. data/vendor/rb-sys/src/memory.rs +0 -53
  431. data/vendor/rb-sys/src/ruby_abi_version.rs +0 -38
  432. data/vendor/rb-sys/src/special_consts.rs +0 -31
  433. data/vendor/rb-sys/src/stable_api/compiled.c +0 -179
  434. data/vendor/rb-sys/src/stable_api/compiled.rs +0 -257
  435. data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +0 -316
  436. data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +0 -316
  437. data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +0 -324
  438. data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +0 -317
  439. data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +0 -315
  440. data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +0 -326
  441. data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +0 -327
  442. data/vendor/rb-sys/src/stable_api.rs +0 -261
  443. data/vendor/rb-sys/src/symbol.rs +0 -31
  444. data/vendor/rb-sys/src/tracking_allocator.rs +0 -332
  445. data/vendor/rb-sys/src/utils.rs +0 -89
  446. data/vendor/rb-sys/src/value_type.rs +0 -7
@@ -0,0 +1,209 @@
1
+ # frozen_string_literal: true
2
+
3
+ RSpec.describe Kreuzberg::Config::ImageExtraction do
4
+ describe '#initialize' do
5
+ it 'creates config with default values' do
6
+ config = described_class.new
7
+
8
+ expect(config.extract_images).to be true
9
+ expect(config.target_dpi).to eq 300
10
+ expect(config.max_image_dimension).to eq 2000
11
+ expect(config.auto_adjust_dpi).to be true
12
+ expect(config.min_dpi).to eq 150
13
+ expect(config.max_dpi).to eq 600
14
+ end
15
+
16
+ it 'creates config with custom values' do
17
+ config = described_class.new(
18
+ extract_images: false,
19
+ target_dpi: 600,
20
+ max_image_dimension: 4000,
21
+ auto_adjust_dpi: false,
22
+ min_dpi: 100,
23
+ max_dpi: 1200
24
+ )
25
+
26
+ expect(config.extract_images).to be false
27
+ expect(config.target_dpi).to eq 600
28
+ expect(config.max_image_dimension).to eq 4000
29
+ expect(config.auto_adjust_dpi).to be false
30
+ expect(config.min_dpi).to eq 100
31
+ expect(config.max_dpi).to eq 1200
32
+ end
33
+
34
+ it 'converts values to integers' do
35
+ config = described_class.new(
36
+ target_dpi: '300',
37
+ max_image_dimension: '2000',
38
+ min_dpi: '150',
39
+ max_dpi: '600'
40
+ )
41
+
42
+ expect(config.target_dpi).to eq 300
43
+ expect(config.max_image_dimension).to eq 2000
44
+ expect(config.min_dpi).to eq 150
45
+ expect(config.max_dpi).to eq 600
46
+ expect(config.target_dpi).to be_a Integer
47
+ end
48
+
49
+ it 'converts boolean values correctly' do
50
+ config = described_class.new(
51
+ extract_images: true,
52
+ auto_adjust_dpi: false
53
+ )
54
+
55
+ expect(config.extract_images).to be true
56
+ expect(config.auto_adjust_dpi).to be false
57
+ end
58
+ end
59
+
60
+ describe '#to_h' do
61
+ it 'serializes to hash with all values' do
62
+ config = described_class.new(
63
+ target_dpi: 300,
64
+ max_image_dimension: 2000
65
+ )
66
+ hash = config.to_h
67
+
68
+ expect(hash).to be_a Hash
69
+ expect(hash[:extract_images]).to be true
70
+ expect(hash[:target_dpi]).to eq 300
71
+ expect(hash[:max_image_dimension]).to eq 2000
72
+ expect(hash[:auto_adjust_dpi]).to be true
73
+ expect(hash[:min_dpi]).to eq 150
74
+ expect(hash[:max_dpi]).to eq 600
75
+ end
76
+
77
+ it 'always includes all keys in hash' do
78
+ config = described_class.new
79
+ hash = config.to_h
80
+
81
+ expect(hash.keys).to contain_exactly(
82
+ :extract_images,
83
+ :target_dpi,
84
+ :max_image_dimension,
85
+ :auto_adjust_dpi,
86
+ :min_dpi,
87
+ :max_dpi
88
+ )
89
+ end
90
+ end
91
+
92
+ describe 'validation' do
93
+ it 'accepts valid DPI values' do
94
+ expect do
95
+ described_class.new(target_dpi: 300, min_dpi: 150, max_dpi: 600)
96
+ end.not_to raise_error
97
+ end
98
+
99
+ it 'accepts valid image dimensions' do
100
+ expect do
101
+ described_class.new(max_image_dimension: 4000)
102
+ end.not_to raise_error
103
+ end
104
+
105
+ it 'converts float DPI to integer' do
106
+ config = described_class.new(target_dpi: 300.5)
107
+
108
+ expect(config.target_dpi).to eq 300
109
+ expect(config.target_dpi).to be_a Integer
110
+ end
111
+ end
112
+
113
+ describe 'keyword arguments' do
114
+ it 'accepts all keyword arguments' do
115
+ config = described_class.new(
116
+ extract_images: true,
117
+ target_dpi: 600,
118
+ max_image_dimension: 3000,
119
+ auto_adjust_dpi: true,
120
+ min_dpi: 200,
121
+ max_dpi: 800
122
+ )
123
+
124
+ expect(config.extract_images).to be true
125
+ expect(config.target_dpi).to eq 600
126
+ expect(config.max_image_dimension).to eq 3000
127
+ expect(config.auto_adjust_dpi).to be true
128
+ expect(config.min_dpi).to eq 200
129
+ expect(config.max_dpi).to eq 800
130
+ end
131
+ end
132
+
133
+ describe 'equality' do
134
+ it 'compares configs by value' do
135
+ config1 = described_class.new(target_dpi: 300, max_image_dimension: 2000)
136
+ config2 = described_class.new(target_dpi: 300, max_image_dimension: 2000)
137
+
138
+ expect(config1.target_dpi).to eq config2.target_dpi
139
+ expect(config1.max_image_dimension).to eq config2.max_image_dimension
140
+ end
141
+
142
+ it 'detects differences in DPI' do
143
+ config1 = described_class.new(target_dpi: 300)
144
+ config2 = described_class.new(target_dpi: 600)
145
+
146
+ expect(config1.target_dpi).not_to eq config2.target_dpi
147
+ end
148
+
149
+ it 'detects differences in extract_images' do
150
+ config1 = described_class.new(extract_images: true)
151
+ config2 = described_class.new(extract_images: false)
152
+
153
+ expect(config1.extract_images).not_to eq config2.extract_images
154
+ end
155
+ end
156
+
157
+ describe 'nested config integration' do
158
+ it 'can be nested in Extraction config' do
159
+ image_config = described_class.new(target_dpi: 600)
160
+ extraction = Kreuzberg::Config::Extraction.new(image_extraction: image_config)
161
+
162
+ expect(extraction.image_extraction).to be_a described_class
163
+ expect(extraction.image_extraction.target_dpi).to eq 600
164
+ end
165
+
166
+ it 'accepts hash in Extraction config' do
167
+ extraction = Kreuzberg::Config::Extraction.new(
168
+ image_extraction: { target_dpi: 600, extract_images: true }
169
+ )
170
+
171
+ expect(extraction.image_extraction).to be_a described_class
172
+ expect(extraction.image_extraction.target_dpi).to eq 600
173
+ end
174
+ end
175
+
176
+ describe 'DPI range' do
177
+ it 'allows realistic DPI values' do
178
+ config = described_class.new(min_dpi: 150, max_dpi: 1200)
179
+
180
+ expect(config.min_dpi).to eq 150
181
+ expect(config.max_dpi).to eq 1200
182
+ end
183
+
184
+ it 'maintains DPI relationships' do
185
+ config = described_class.new(
186
+ target_dpi: 300,
187
+ min_dpi: 100,
188
+ max_dpi: 600
189
+ )
190
+
191
+ expect(config.min_dpi).to be <= config.target_dpi
192
+ expect(config.target_dpi).to be <= config.max_dpi
193
+ end
194
+ end
195
+
196
+ describe 'image dimension constraints' do
197
+ it 'accepts large image dimensions' do
198
+ config = described_class.new(max_image_dimension: 10_000)
199
+
200
+ expect(config.max_image_dimension).to eq 10_000
201
+ end
202
+
203
+ it 'accepts small image dimensions' do
204
+ config = described_class.new(max_image_dimension: 100)
205
+
206
+ expect(config.max_image_dimension).to eq 100
207
+ end
208
+ end
209
+ end
@@ -0,0 +1,249 @@
1
+ # frozen_string_literal: true
2
+
3
+ RSpec.describe Kreuzberg::Config::ImagePreprocessing do
4
+ describe '#initialize' do
5
+ it 'creates config with default values' do
6
+ config = described_class.new
7
+
8
+ expect(config.target_dpi).to eq 300
9
+ expect(config.auto_rotate).to be true
10
+ expect(config.deskew).to be true
11
+ expect(config.denoise).to be false
12
+ expect(config.contrast_enhance).to be true
13
+ expect(config.binarization_method).to eq 'otsu'
14
+ expect(config.invert_colors).to be false
15
+ end
16
+
17
+ it 'creates config with custom values' do
18
+ config = described_class.new(
19
+ target_dpi: 600,
20
+ auto_rotate: false,
21
+ deskew: false,
22
+ denoise: true,
23
+ contrast_enhance: false,
24
+ binarization_method: 'sauvola',
25
+ invert_colors: true
26
+ )
27
+
28
+ expect(config.target_dpi).to eq 600
29
+ expect(config.auto_rotate).to be false
30
+ expect(config.deskew).to be false
31
+ expect(config.denoise).to be true
32
+ expect(config.contrast_enhance).to be false
33
+ expect(config.binarization_method).to eq 'sauvola'
34
+ expect(config.invert_colors).to be true
35
+ end
36
+
37
+ it 'converts target_dpi to integer' do
38
+ config = described_class.new(target_dpi: '300')
39
+
40
+ expect(config.target_dpi).to eq 300
41
+ expect(config.target_dpi).to be_a Integer
42
+ end
43
+
44
+ it 'converts binarization_method to string' do
45
+ config = described_class.new(binarization_method: :niblack)
46
+
47
+ expect(config.binarization_method).to eq 'niblack'
48
+ expect(config.binarization_method).to be_a String
49
+ end
50
+ end
51
+
52
+ describe '#to_h' do
53
+ it 'serializes to hash with all values' do
54
+ config = described_class.new(target_dpi: 300, denoise: true)
55
+ hash = config.to_h
56
+
57
+ expect(hash).to be_a Hash
58
+ expect(hash[:target_dpi]).to eq 300
59
+ expect(hash[:denoise]).to be true
60
+ expect(hash[:auto_rotate]).to be true
61
+ expect(hash[:binarization_method]).to eq 'otsu'
62
+ end
63
+
64
+ it 'always includes all keys in hash' do
65
+ config = described_class.new
66
+ hash = config.to_h
67
+
68
+ expect(hash.keys).to contain_exactly(
69
+ :target_dpi,
70
+ :auto_rotate,
71
+ :deskew,
72
+ :denoise,
73
+ :contrast_enhance,
74
+ :binarization_method,
75
+ :invert_colors
76
+ )
77
+ end
78
+ end
79
+
80
+ describe 'validation' do
81
+ it 'rejects invalid binarization method' do
82
+ expect do
83
+ described_class.new(binarization_method: 'invalid_method')
84
+ end.to raise_error ArgumentError, /Invalid binarization_method/
85
+ end
86
+
87
+ it 'accepts all valid binarization methods' do
88
+ valid_methods = %w[otsu sauvola niblack wolf bradley adaptive]
89
+
90
+ valid_methods.each do |method|
91
+ expect do
92
+ described_class.new(binarization_method: method)
93
+ end.not_to raise_error
94
+ end
95
+ end
96
+
97
+ it 'accepts binarization method as symbol' do
98
+ expect do
99
+ described_class.new(binarization_method: :sauvola)
100
+ end.not_to raise_error
101
+ end
102
+ end
103
+
104
+ describe 'keyword arguments' do
105
+ it 'accepts all keyword arguments' do
106
+ config = described_class.new(
107
+ target_dpi: 600,
108
+ auto_rotate: true,
109
+ deskew: false,
110
+ denoise: true,
111
+ contrast_enhance: false,
112
+ binarization_method: 'bradley',
113
+ invert_colors: true
114
+ )
115
+
116
+ expect(config.target_dpi).to eq 600
117
+ expect(config.auto_rotate).to be true
118
+ expect(config.deskew).to be false
119
+ expect(config.denoise).to be true
120
+ expect(config.contrast_enhance).to be false
121
+ expect(config.binarization_method).to eq 'bradley'
122
+ expect(config.invert_colors).to be true
123
+ end
124
+ end
125
+
126
+ describe 'equality' do
127
+ it 'compares configs by value' do
128
+ config1 = described_class.new(
129
+ target_dpi: 300,
130
+ binarization_method: 'otsu',
131
+ denoise: true
132
+ )
133
+ config2 = described_class.new(
134
+ target_dpi: 300,
135
+ binarization_method: 'otsu',
136
+ denoise: true
137
+ )
138
+
139
+ expect(config1.target_dpi).to eq config2.target_dpi
140
+ expect(config1.binarization_method).to eq config2.binarization_method
141
+ expect(config1.denoise).to eq config2.denoise
142
+ end
143
+
144
+ it 'detects differences in target_dpi' do
145
+ config1 = described_class.new(target_dpi: 300)
146
+ config2 = described_class.new(target_dpi: 600)
147
+
148
+ expect(config1.target_dpi).not_to eq config2.target_dpi
149
+ end
150
+
151
+ it 'detects differences in binarization_method' do
152
+ config1 = described_class.new(binarization_method: 'otsu')
153
+ config2 = described_class.new(binarization_method: 'sauvola')
154
+
155
+ expect(config1.binarization_method).not_to eq config2.binarization_method
156
+ end
157
+ end
158
+
159
+ describe 'nested config integration' do
160
+ it 'can be nested in Extraction config' do
161
+ preprocessing = described_class.new(target_dpi: 600, denoise: true)
162
+ extraction = Kreuzberg::Config::Extraction.new(image_preprocessing: preprocessing)
163
+
164
+ expect(extraction.image_preprocessing).to be_a described_class
165
+ expect(extraction.image_preprocessing.target_dpi).to eq 600
166
+ expect(extraction.image_preprocessing.denoise).to be true
167
+ end
168
+
169
+ it 'accepts hash in Extraction config' do
170
+ extraction = Kreuzberg::Config::Extraction.new(
171
+ image_preprocessing: { target_dpi: 600, binarization_method: 'sauvola' }
172
+ )
173
+
174
+ expect(extraction.image_preprocessing).to be_a described_class
175
+ expect(extraction.image_preprocessing.target_dpi).to eq 600
176
+ expect(extraction.image_preprocessing.binarization_method).to eq 'sauvola'
177
+ end
178
+
179
+ it 'can be nested in Tesseract config' do
180
+ preprocessing = described_class.new(denoise: true)
181
+ tesseract = Kreuzberg::Config::Tesseract.new(preprocessing: preprocessing)
182
+
183
+ expect(tesseract.options[:preprocessing]).to be_a described_class
184
+ expect(tesseract.options[:preprocessing].denoise).to be true
185
+ end
186
+ end
187
+
188
+ describe 'symbol vs string key handling' do
189
+ it 'converts symbol binarization method to string' do
190
+ config = described_class.new(binarization_method: :bradley)
191
+
192
+ expect(config.binarization_method).to eq 'bradley'
193
+ expect(config.binarization_method).to be_a String
194
+ end
195
+
196
+ it 'converts string target_dpi to integer' do
197
+ config = described_class.new(target_dpi: '600')
198
+
199
+ expect(config.target_dpi).to eq 600
200
+ expect(config.target_dpi).to be_a Integer
201
+ end
202
+ end
203
+
204
+ describe 'boolean conversion' do
205
+ it 'converts truthy values to boolean' do
206
+ config = described_class.new(
207
+ auto_rotate: 1,
208
+ deskew: 'yes',
209
+ denoise: true
210
+ )
211
+
212
+ expect(config.auto_rotate).to be true
213
+ expect(config.deskew).to be true
214
+ expect(config.denoise).to be true
215
+ end
216
+
217
+ it 'converts false values to boolean' do
218
+ config = described_class.new(
219
+ auto_rotate: false,
220
+ deskew: false,
221
+ denoise: false
222
+ )
223
+
224
+ expect(config.auto_rotate).to be false
225
+ expect(config.deskew).to be false
226
+ expect(config.denoise).to be false
227
+ end
228
+ end
229
+
230
+ describe 'DPI configuration' do
231
+ it 'accepts realistic DPI values' do
232
+ config = described_class.new(target_dpi: 300)
233
+
234
+ expect(config.target_dpi).to eq 300
235
+ end
236
+
237
+ it 'accepts high DPI values' do
238
+ config = described_class.new(target_dpi: 1200)
239
+
240
+ expect(config.target_dpi).to eq 1200
241
+ end
242
+
243
+ it 'accepts low DPI values' do
244
+ config = described_class.new(target_dpi: 72)
245
+
246
+ expect(config.target_dpi).to eq 72
247
+ end
248
+ end
249
+ end
@@ -0,0 +1,229 @@
1
+ # frozen_string_literal: true
2
+
3
+ RSpec.describe Kreuzberg::Config::Keywords do
4
+ describe '#initialize' do
5
+ it 'creates config with default values' do
6
+ config = described_class.new
7
+
8
+ expect(config.algorithm).to be_nil
9
+ expect(config.max_keywords).to be_nil
10
+ expect(config.min_score).to be_nil
11
+ expect(config.ngram_range).to be_nil
12
+ expect(config.language).to be_nil
13
+ expect(config.yake_params).to be_nil
14
+ expect(config.rake_params).to be_nil
15
+ end
16
+
17
+ it 'creates config with custom values' do
18
+ config = described_class.new(
19
+ algorithm: 'yake',
20
+ max_keywords: 10,
21
+ min_score: 0.5,
22
+ ngram_range: [1, 3],
23
+ language: 'en'
24
+ )
25
+
26
+ expect(config.algorithm).to eq 'yake'
27
+ expect(config.max_keywords).to eq 10
28
+ expect(config.min_score).to eq 0.5
29
+ expect(config.ngram_range).to eq [1, 3]
30
+ expect(config.language).to eq 'en'
31
+ end
32
+
33
+ it 'accepts yake_params as instance' do
34
+ yake_params = Kreuzberg::Config::KeywordYakeParams.new(window_size: 3)
35
+ config = described_class.new(yake_params: yake_params)
36
+
37
+ expect(config.yake_params).to be_a Kreuzberg::Config::KeywordYakeParams
38
+ expect(config.yake_params.window_size).to eq 3
39
+ end
40
+
41
+ it 'converts yake_params hash to instance' do
42
+ config = described_class.new(yake_params: { window_size: 2 })
43
+
44
+ expect(config.yake_params).to be_a Kreuzberg::Config::KeywordYakeParams
45
+ expect(config.yake_params.window_size).to eq 2
46
+ end
47
+
48
+ it 'accepts rake_params as instance' do
49
+ rake_params = Kreuzberg::Config::KeywordRakeParams.new(min_word_length: 3)
50
+ config = described_class.new(rake_params: rake_params)
51
+
52
+ expect(config.rake_params).to be_a Kreuzberg::Config::KeywordRakeParams
53
+ end
54
+
55
+ it 'converts rake_params hash to instance' do
56
+ config = described_class.new(rake_params: { min_word_length: 2 })
57
+
58
+ expect(config.rake_params).to be_a Kreuzberg::Config::KeywordRakeParams
59
+ expect(config.rake_params.min_word_length).to eq 2
60
+ end
61
+ end
62
+
63
+ describe '#to_h' do
64
+ it 'serializes to hash' do
65
+ config = described_class.new(algorithm: 'yake', max_keywords: 10)
66
+ hash = config.to_h
67
+
68
+ expect(hash).to be_a Hash
69
+ expect(hash[:algorithm]).to eq 'yake'
70
+ expect(hash[:max_keywords]).to eq 10
71
+ end
72
+
73
+ it 'includes nested params in hash' do
74
+ config = described_class.new(
75
+ algorithm: 'yake',
76
+ yake_params: { window_size: 3 }
77
+ )
78
+ hash = config.to_h
79
+
80
+ expect(hash[:yake_params]).to be_a Hash
81
+ expect(hash[:yake_params][:window_size]).to eq 3
82
+ end
83
+
84
+ it 'compacts nil values from hash' do
85
+ config = described_class.new(algorithm: 'rake')
86
+ hash = config.to_h
87
+
88
+ expect(hash.key?(:max_keywords)).to be false
89
+ expect(hash.key?(:yake_params)).to be false
90
+ end
91
+ end
92
+
93
+ describe 'validation' do
94
+ it 'accepts valid algorithm names' do
95
+ expect do
96
+ described_class.new(algorithm: 'yake')
97
+ end.not_to raise_error
98
+ end
99
+
100
+ it 'accepts valid max_keywords' do
101
+ expect do
102
+ described_class.new(max_keywords: 20)
103
+ end.not_to raise_error
104
+ end
105
+
106
+ it 'raises error for invalid yake_params type' do
107
+ expect do
108
+ described_class.new(yake_params: 'invalid')
109
+ end.to raise_error ArgumentError, /Expected.*KeywordYakeParams.*Hash.*nil/
110
+ end
111
+
112
+ it 'raises error for invalid rake_params type' do
113
+ expect do
114
+ described_class.new(rake_params: 'invalid')
115
+ end.to raise_error ArgumentError, /Expected.*KeywordRakeParams.*Hash.*nil/
116
+ end
117
+ end
118
+
119
+ describe 'keyword arguments' do
120
+ it 'accepts all keyword arguments' do
121
+ config = described_class.new(
122
+ algorithm: 'yake',
123
+ max_keywords: 15,
124
+ min_score: 0.7,
125
+ ngram_range: [1, 2],
126
+ language: 'fr',
127
+ yake_params: { window_size: 3 }
128
+ )
129
+
130
+ expect(config.algorithm).to eq 'yake'
131
+ expect(config.max_keywords).to eq 15
132
+ expect(config.min_score).to eq 0.7
133
+ expect(config.ngram_range).to eq [1, 2]
134
+ expect(config.language).to eq 'fr'
135
+ expect(config.yake_params).to be_a Kreuzberg::Config::KeywordYakeParams
136
+ end
137
+ end
138
+
139
+ describe 'equality' do
140
+ it 'compares configs by value' do
141
+ config1 = described_class.new(algorithm: 'yake', max_keywords: 10)
142
+ config2 = described_class.new(algorithm: 'yake', max_keywords: 10)
143
+
144
+ expect(config1.algorithm).to eq config2.algorithm
145
+ expect(config1.max_keywords).to eq config2.max_keywords
146
+ end
147
+
148
+ it 'detects differences in algorithm' do
149
+ config1 = described_class.new(algorithm: 'yake')
150
+ config2 = described_class.new(algorithm: 'rake')
151
+
152
+ expect(config1.algorithm).not_to eq config2.algorithm
153
+ end
154
+
155
+ it 'detects differences in max_keywords' do
156
+ config1 = described_class.new(max_keywords: 10)
157
+ config2 = described_class.new(max_keywords: 20)
158
+
159
+ expect(config1.max_keywords).not_to eq config2.max_keywords
160
+ end
161
+ end
162
+
163
+ describe 'nested config integration' do
164
+ it 'can be nested in Extraction config' do
165
+ keywords = described_class.new(algorithm: 'yake', max_keywords: 15)
166
+ extraction = Kreuzberg::Config::Extraction.new(keywords: keywords)
167
+
168
+ expect(extraction.keywords).to be_a described_class
169
+ expect(extraction.keywords.algorithm).to eq 'yake'
170
+ expect(extraction.keywords.max_keywords).to eq 15
171
+ end
172
+
173
+ it 'accepts hash in Extraction config' do
174
+ extraction = Kreuzberg::Config::Extraction.new(
175
+ keywords: { algorithm: 'rake', max_keywords: 10 }
176
+ )
177
+
178
+ expect(extraction.keywords).to be_a described_class
179
+ expect(extraction.keywords.algorithm).to eq 'rake'
180
+ expect(extraction.keywords.max_keywords).to eq 10
181
+ end
182
+ end
183
+
184
+ describe 'symbol vs string key handling' do
185
+ it 'converts symbol algorithm to string' do
186
+ config = described_class.new(algorithm: :yake)
187
+
188
+ expect(config.algorithm).to eq 'yake'
189
+ expect(config.algorithm).to be_a String
190
+ end
191
+
192
+ it 'converts symbol language to string' do
193
+ config = described_class.new(language: :eng)
194
+
195
+ expect(config.language).to eq 'eng'
196
+ expect(config.language).to be_a String
197
+ end
198
+
199
+ it 'converts ngram_range values to integers' do
200
+ config = described_class.new(ngram_range: %w[1 3])
201
+
202
+ expect(config.ngram_range).to eq [1, 3]
203
+ expect(config.ngram_range.all?(Integer)).to be true
204
+ end
205
+ end
206
+
207
+ describe 'parameter conversions' do
208
+ it 'converts max_keywords to integer' do
209
+ config = described_class.new(max_keywords: '20')
210
+
211
+ expect(config.max_keywords).to eq 20
212
+ expect(config.max_keywords).to be_a Integer
213
+ end
214
+
215
+ it 'converts min_score to float' do
216
+ config = described_class.new(min_score: '0.75')
217
+
218
+ expect(config.min_score).to eq 0.75
219
+ expect(config.min_score).to be_a Float
220
+ end
221
+
222
+ it 'converts ngram_range to array of integers' do
223
+ config = described_class.new(ngram_range: [1, 2])
224
+
225
+ expect(config.ngram_range).to eq [1, 2]
226
+ expect(config.ngram_range.all?(Integer)).to be true
227
+ end
228
+ end
229
+ end