kreuzberg 4.0.0.rc2 → 4.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (446) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +14 -14
  3. data/.rspec +3 -3
  4. data/.rubocop.yaml +1 -1
  5. data/.rubocop.yml +543 -538
  6. data/Gemfile +8 -8
  7. data/Gemfile.lock +194 -6
  8. data/README.md +391 -426
  9. data/Rakefile +34 -25
  10. data/Steepfile +51 -47
  11. data/examples/async_patterns.rb +283 -341
  12. data/ext/kreuzberg_rb/extconf.rb +65 -45
  13. data/ext/kreuzberg_rb/native/.cargo/config.toml +23 -0
  14. data/ext/kreuzberg_rb/native/Cargo.lock +7619 -6535
  15. data/ext/kreuzberg_rb/native/Cargo.toml +75 -44
  16. data/ext/kreuzberg_rb/native/README.md +425 -425
  17. data/ext/kreuzberg_rb/native/build.rs +15 -15
  18. data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -11
  19. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -14
  20. data/ext/kreuzberg_rb/native/include/strings.h +20 -20
  21. data/ext/kreuzberg_rb/native/include/unistd.h +47 -47
  22. data/ext/kreuzberg_rb/native/src/lib.rs +3802 -2998
  23. data/extconf.rb +60 -28
  24. data/kreuzberg.gemspec +199 -148
  25. data/lib/kreuzberg/api_proxy.rb +126 -142
  26. data/lib/kreuzberg/cache_api.rb +67 -46
  27. data/lib/kreuzberg/cli.rb +47 -55
  28. data/lib/kreuzberg/cli_proxy.rb +117 -127
  29. data/lib/kreuzberg/config.rb +936 -691
  30. data/lib/kreuzberg/error_context.rb +136 -32
  31. data/lib/kreuzberg/errors.rb +116 -118
  32. data/lib/kreuzberg/extraction_api.rb +313 -85
  33. data/lib/kreuzberg/mcp_proxy.rb +177 -186
  34. data/lib/kreuzberg/ocr_backend_protocol.rb +40 -113
  35. data/lib/kreuzberg/post_processor_protocol.rb +15 -86
  36. data/lib/kreuzberg/result.rb +334 -216
  37. data/lib/kreuzberg/setup_lib_path.rb +99 -80
  38. data/lib/kreuzberg/types.rb +170 -0
  39. data/lib/kreuzberg/validator_protocol.rb +16 -89
  40. data/lib/kreuzberg/version.rb +5 -5
  41. data/lib/kreuzberg.rb +96 -103
  42. data/lib/libpdfium.so +0 -0
  43. data/sig/kreuzberg/internal.rbs +184 -184
  44. data/sig/kreuzberg.rbs +561 -520
  45. data/spec/binding/async_operations_spec.rb +473 -0
  46. data/spec/binding/batch_operations_spec.rb +595 -0
  47. data/spec/binding/batch_spec.rb +359 -0
  48. data/spec/binding/cache_spec.rb +227 -227
  49. data/spec/binding/cli_proxy_spec.rb +85 -85
  50. data/spec/binding/cli_spec.rb +55 -55
  51. data/spec/binding/config_result_spec.rb +377 -0
  52. data/spec/binding/config_spec.rb +419 -345
  53. data/spec/binding/config_validation_spec.rb +377 -283
  54. data/spec/binding/embeddings_spec.rb +816 -0
  55. data/spec/binding/error_handling_spec.rb +399 -213
  56. data/spec/binding/error_recovery_spec.rb +488 -0
  57. data/spec/binding/errors_spec.rb +66 -66
  58. data/spec/binding/font_config_spec.rb +220 -0
  59. data/spec/binding/images_spec.rb +738 -0
  60. data/spec/binding/keywords_extraction_spec.rb +600 -0
  61. data/spec/binding/metadata_types_spec.rb +1228 -0
  62. data/spec/binding/pages_extraction_spec.rb +471 -0
  63. data/spec/binding/plugins/ocr_backend_spec.rb +307 -307
  64. data/spec/binding/plugins/postprocessor_spec.rb +269 -269
  65. data/spec/binding/plugins/validator_spec.rb +273 -274
  66. data/spec/binding/tables_spec.rb +641 -0
  67. data/spec/fixtures/config.toml +38 -39
  68. data/spec/fixtures/config.yaml +41 -41
  69. data/spec/fixtures/invalid_config.toml +3 -4
  70. data/spec/smoke/package_spec.rb +177 -178
  71. data/spec/spec_helper.rb +40 -42
  72. data/spec/unit/config/chunking_config_spec.rb +213 -0
  73. data/spec/unit/config/embedding_config_spec.rb +343 -0
  74. data/spec/unit/config/extraction_config_spec.rb +438 -0
  75. data/spec/unit/config/font_config_spec.rb +285 -0
  76. data/spec/unit/config/hierarchy_config_spec.rb +314 -0
  77. data/spec/unit/config/image_extraction_config_spec.rb +209 -0
  78. data/spec/unit/config/image_preprocessing_config_spec.rb +249 -0
  79. data/spec/unit/config/keyword_config_spec.rb +229 -0
  80. data/spec/unit/config/language_detection_config_spec.rb +258 -0
  81. data/spec/unit/config/ocr_config_spec.rb +171 -0
  82. data/spec/unit/config/page_config_spec.rb +221 -0
  83. data/spec/unit/config/pdf_config_spec.rb +267 -0
  84. data/spec/unit/config/postprocessor_config_spec.rb +290 -0
  85. data/spec/unit/config/tesseract_config_spec.rb +181 -0
  86. data/spec/unit/config/token_reduction_config_spec.rb +251 -0
  87. data/test/metadata_types_test.rb +959 -0
  88. data/vendor/Cargo.toml +61 -0
  89. data/vendor/kreuzberg/Cargo.toml +259 -204
  90. data/vendor/kreuzberg/README.md +263 -175
  91. data/vendor/kreuzberg/build.rs +782 -474
  92. data/vendor/kreuzberg/examples/bench_fixes.rs +71 -0
  93. data/vendor/kreuzberg/examples/test_pdfium_fork.rs +62 -0
  94. data/vendor/kreuzberg/src/api/error.rs +81 -81
  95. data/vendor/kreuzberg/src/api/handlers.rs +320 -199
  96. data/vendor/kreuzberg/src/api/mod.rs +94 -79
  97. data/vendor/kreuzberg/src/api/server.rs +518 -353
  98. data/vendor/kreuzberg/src/api/types.rs +206 -170
  99. data/vendor/kreuzberg/src/cache/mod.rs +1167 -1167
  100. data/vendor/kreuzberg/src/chunking/mod.rs +2303 -677
  101. data/vendor/kreuzberg/src/chunking/processor.rs +219 -0
  102. data/vendor/kreuzberg/src/core/batch_mode.rs +95 -95
  103. data/vendor/kreuzberg/src/core/batch_optimizations.rs +385 -0
  104. data/vendor/kreuzberg/src/core/config.rs +1914 -1032
  105. data/vendor/kreuzberg/src/core/config_validation.rs +949 -0
  106. data/vendor/kreuzberg/src/core/extractor.rs +1200 -1024
  107. data/vendor/kreuzberg/src/core/formats.rs +235 -0
  108. data/vendor/kreuzberg/src/core/io.rs +329 -329
  109. data/vendor/kreuzberg/src/core/mime.rs +605 -605
  110. data/vendor/kreuzberg/src/core/mod.rs +61 -45
  111. data/vendor/kreuzberg/src/core/pipeline.rs +1223 -984
  112. data/vendor/kreuzberg/src/core/server_config.rs +1220 -0
  113. data/vendor/kreuzberg/src/embeddings.rs +471 -432
  114. data/vendor/kreuzberg/src/error.rs +431 -431
  115. data/vendor/kreuzberg/src/extraction/archive.rs +959 -954
  116. data/vendor/kreuzberg/src/extraction/capacity.rs +263 -0
  117. data/vendor/kreuzberg/src/extraction/docx.rs +404 -40
  118. data/vendor/kreuzberg/src/extraction/email.rs +855 -854
  119. data/vendor/kreuzberg/src/extraction/excel.rs +697 -688
  120. data/vendor/kreuzberg/src/extraction/html.rs +1830 -553
  121. data/vendor/kreuzberg/src/extraction/image.rs +492 -368
  122. data/vendor/kreuzberg/src/extraction/libreoffice.rs +574 -563
  123. data/vendor/kreuzberg/src/extraction/markdown.rs +216 -213
  124. data/vendor/kreuzberg/src/extraction/mod.rs +93 -81
  125. data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -398
  126. data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -247
  127. data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -240
  128. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -130
  129. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +284 -287
  130. data/vendor/kreuzberg/src/extraction/pptx.rs +3102 -3000
  131. data/vendor/kreuzberg/src/extraction/structured.rs +491 -490
  132. data/vendor/kreuzberg/src/extraction/table.rs +329 -328
  133. data/vendor/kreuzberg/src/extraction/text.rs +277 -269
  134. data/vendor/kreuzberg/src/extraction/xml.rs +333 -333
  135. data/vendor/kreuzberg/src/extractors/archive.rs +447 -446
  136. data/vendor/kreuzberg/src/extractors/bibtex.rs +470 -469
  137. data/vendor/kreuzberg/src/extractors/docbook.rs +504 -502
  138. data/vendor/kreuzberg/src/extractors/docx.rs +400 -367
  139. data/vendor/kreuzberg/src/extractors/email.rs +157 -143
  140. data/vendor/kreuzberg/src/extractors/epub.rs +696 -707
  141. data/vendor/kreuzberg/src/extractors/excel.rs +385 -343
  142. data/vendor/kreuzberg/src/extractors/fictionbook.rs +492 -491
  143. data/vendor/kreuzberg/src/extractors/html.rs +419 -393
  144. data/vendor/kreuzberg/src/extractors/image.rs +219 -198
  145. data/vendor/kreuzberg/src/extractors/jats.rs +1054 -1051
  146. data/vendor/kreuzberg/src/extractors/jupyter.rs +368 -367
  147. data/vendor/kreuzberg/src/extractors/latex.rs +653 -652
  148. data/vendor/kreuzberg/src/extractors/markdown.rs +701 -700
  149. data/vendor/kreuzberg/src/extractors/mod.rs +429 -365
  150. data/vendor/kreuzberg/src/extractors/odt.rs +628 -628
  151. data/vendor/kreuzberg/src/extractors/opml.rs +635 -634
  152. data/vendor/kreuzberg/src/extractors/orgmode.rs +529 -528
  153. data/vendor/kreuzberg/src/extractors/pdf.rs +761 -493
  154. data/vendor/kreuzberg/src/extractors/pptx.rs +279 -248
  155. data/vendor/kreuzberg/src/extractors/rst.rs +577 -576
  156. data/vendor/kreuzberg/src/extractors/rtf.rs +809 -810
  157. data/vendor/kreuzberg/src/extractors/security.rs +484 -484
  158. data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -367
  159. data/vendor/kreuzberg/src/extractors/structured.rs +142 -140
  160. data/vendor/kreuzberg/src/extractors/text.rs +265 -260
  161. data/vendor/kreuzberg/src/extractors/typst.rs +651 -650
  162. data/vendor/kreuzberg/src/extractors/xml.rs +147 -135
  163. data/vendor/kreuzberg/src/image/dpi.rs +164 -164
  164. data/vendor/kreuzberg/src/image/mod.rs +6 -6
  165. data/vendor/kreuzberg/src/image/preprocessing.rs +417 -417
  166. data/vendor/kreuzberg/src/image/resize.rs +89 -89
  167. data/vendor/kreuzberg/src/keywords/config.rs +154 -154
  168. data/vendor/kreuzberg/src/keywords/mod.rs +237 -237
  169. data/vendor/kreuzberg/src/keywords/processor.rs +275 -267
  170. data/vendor/kreuzberg/src/keywords/rake.rs +293 -293
  171. data/vendor/kreuzberg/src/keywords/types.rs +68 -68
  172. data/vendor/kreuzberg/src/keywords/yake.rs +163 -163
  173. data/vendor/kreuzberg/src/language_detection/mod.rs +985 -942
  174. data/vendor/kreuzberg/src/language_detection/processor.rs +218 -0
  175. data/vendor/kreuzberg/src/lib.rs +114 -105
  176. data/vendor/kreuzberg/src/mcp/mod.rs +35 -32
  177. data/vendor/kreuzberg/src/mcp/server.rs +2090 -1968
  178. data/vendor/kreuzberg/src/ocr/cache.rs +469 -469
  179. data/vendor/kreuzberg/src/ocr/error.rs +37 -37
  180. data/vendor/kreuzberg/src/ocr/hocr.rs +216 -216
  181. data/vendor/kreuzberg/src/ocr/language_registry.rs +520 -0
  182. data/vendor/kreuzberg/src/ocr/mod.rs +60 -58
  183. data/vendor/kreuzberg/src/ocr/processor.rs +858 -863
  184. data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -4
  185. data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -144
  186. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +456 -450
  187. data/vendor/kreuzberg/src/ocr/types.rs +393 -393
  188. data/vendor/kreuzberg/src/ocr/utils.rs +47 -47
  189. data/vendor/kreuzberg/src/ocr/validation.rs +206 -206
  190. data/vendor/kreuzberg/src/panic_context.rs +154 -154
  191. data/vendor/kreuzberg/src/pdf/bindings.rs +306 -0
  192. data/vendor/kreuzberg/src/pdf/bundled.rs +408 -0
  193. data/vendor/kreuzberg/src/pdf/error.rs +214 -122
  194. data/vendor/kreuzberg/src/pdf/fonts.rs +358 -0
  195. data/vendor/kreuzberg/src/pdf/hierarchy.rs +903 -0
  196. data/vendor/kreuzberg/src/pdf/images.rs +139 -139
  197. data/vendor/kreuzberg/src/pdf/metadata.rs +509 -346
  198. data/vendor/kreuzberg/src/pdf/mod.rs +81 -50
  199. data/vendor/kreuzberg/src/pdf/rendering.rs +369 -369
  200. data/vendor/kreuzberg/src/pdf/table.rs +417 -393
  201. data/vendor/kreuzberg/src/pdf/text.rs +553 -158
  202. data/vendor/kreuzberg/src/plugins/extractor.rs +1042 -1013
  203. data/vendor/kreuzberg/src/plugins/mod.rs +212 -209
  204. data/vendor/kreuzberg/src/plugins/ocr.rs +637 -620
  205. data/vendor/kreuzberg/src/plugins/processor.rs +650 -642
  206. data/vendor/kreuzberg/src/plugins/registry.rs +1339 -1337
  207. data/vendor/kreuzberg/src/plugins/traits.rs +258 -258
  208. data/vendor/kreuzberg/src/plugins/validator.rs +967 -956
  209. data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -1470
  210. data/vendor/kreuzberg/src/text/mod.rs +27 -19
  211. data/vendor/kreuzberg/src/text/quality.rs +710 -697
  212. data/vendor/kreuzberg/src/text/quality_processor.rs +231 -0
  213. data/vendor/kreuzberg/src/text/string_utils.rs +229 -217
  214. data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -164
  215. data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -100
  216. data/vendor/kreuzberg/src/text/token_reduction/core.rs +832 -796
  217. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +923 -902
  218. data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -160
  219. data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -619
  220. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +148 -147
  221. data/vendor/kreuzberg/src/text/utf8_validation.rs +193 -0
  222. data/vendor/kreuzberg/src/types.rs +1713 -903
  223. data/vendor/kreuzberg/src/utils/mod.rs +31 -17
  224. data/vendor/kreuzberg/src/utils/pool.rs +503 -0
  225. data/vendor/kreuzberg/src/utils/pool_sizing.rs +364 -0
  226. data/vendor/kreuzberg/src/utils/quality.rs +968 -959
  227. data/vendor/kreuzberg/src/utils/string_pool.rs +761 -0
  228. data/vendor/kreuzberg/src/utils/string_utils.rs +381 -381
  229. data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -53
  230. data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -482
  231. data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -261
  232. data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -400
  233. data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -1205
  234. data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -280
  235. data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -425
  236. data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -172
  237. data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -622
  238. data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -849
  239. data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -1300
  240. data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -175
  241. data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -734
  242. data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -37
  243. data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -100
  244. data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -801
  245. data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -849
  246. data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -693
  247. data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -111
  248. data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -162
  249. data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -226
  250. data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -41
  251. data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -196
  252. data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -227
  253. data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -181
  254. data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -791
  255. data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -47
  256. data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -760
  257. data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -634
  258. data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -136
  259. data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -84
  260. data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -681
  261. data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -64
  262. data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -51
  263. data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -476
  264. data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -163
  265. data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -1
  266. data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -101
  267. data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -477
  268. data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -490
  269. data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -415
  270. data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -223
  271. data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -331
  272. data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -562
  273. data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -436
  274. data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -561
  275. data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -193
  276. data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -420
  277. data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -448
  278. data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -32
  279. data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -33
  280. data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -420
  281. data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -76
  282. data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -129
  283. data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -54
  284. data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -118
  285. data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -149
  286. data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -506
  287. data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -75
  288. data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -519
  289. data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -647
  290. data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -62
  291. data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -796
  292. data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -31
  293. data/vendor/kreuzberg/tests/api_embed.rs +360 -0
  294. data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -52
  295. data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +471 -0
  296. data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +289 -0
  297. data/vendor/kreuzberg/tests/api_tests.rs +1472 -966
  298. data/vendor/kreuzberg/tests/archive_integration.rs +545 -543
  299. data/vendor/kreuzberg/tests/batch_orchestration.rs +587 -556
  300. data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +154 -0
  301. data/vendor/kreuzberg/tests/batch_processing.rs +328 -316
  302. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -421
  303. data/vendor/kreuzberg/tests/concurrency_stress.rs +541 -525
  304. data/vendor/kreuzberg/tests/config_features.rs +612 -598
  305. data/vendor/kreuzberg/tests/config_integration_test.rs +753 -0
  306. data/vendor/kreuzberg/tests/config_loading_tests.rs +416 -415
  307. data/vendor/kreuzberg/tests/core_integration.rs +519 -510
  308. data/vendor/kreuzberg/tests/csv_integration.rs +414 -414
  309. data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +294 -0
  310. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +500 -498
  311. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -122
  312. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -370
  313. data/vendor/kreuzberg/tests/email_integration.rs +327 -325
  314. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -275
  315. data/vendor/kreuzberg/tests/error_handling.rs +402 -393
  316. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -228
  317. data/vendor/kreuzberg/tests/format_integration.rs +165 -159
  318. data/vendor/kreuzberg/tests/helpers/mod.rs +202 -142
  319. data/vendor/kreuzberg/tests/html_table_test.rs +551 -551
  320. data/vendor/kreuzberg/tests/image_integration.rs +255 -253
  321. data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -139
  322. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -639
  323. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -704
  324. data/vendor/kreuzberg/tests/keywords_integration.rs +479 -479
  325. data/vendor/kreuzberg/tests/keywords_quality.rs +509 -509
  326. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -496
  327. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -490
  328. data/vendor/kreuzberg/tests/mime_detection.rs +429 -428
  329. data/vendor/kreuzberg/tests/ocr_configuration.rs +514 -510
  330. data/vendor/kreuzberg/tests/ocr_errors.rs +698 -676
  331. data/vendor/kreuzberg/tests/ocr_language_registry.rs +191 -0
  332. data/vendor/kreuzberg/tests/ocr_quality.rs +629 -627
  333. data/vendor/kreuzberg/tests/ocr_stress.rs +469 -469
  334. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +674 -695
  335. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -616
  336. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -822
  337. data/vendor/kreuzberg/tests/page_markers.rs +297 -0
  338. data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +301 -0
  339. data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +589 -0
  340. data/vendor/kreuzberg/tests/pdf_integration.rs +45 -43
  341. data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +301 -0
  342. data/vendor/kreuzberg/tests/pdf_text_merging.rs +475 -0
  343. data/vendor/kreuzberg/tests/pdfium_linking.rs +340 -0
  344. data/vendor/kreuzberg/tests/pipeline_integration.rs +1446 -1411
  345. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +776 -771
  346. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +577 -560
  347. data/vendor/kreuzberg/tests/plugin_system.rs +927 -921
  348. data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -783
  349. data/vendor/kreuzberg/tests/registry_integration_tests.rs +587 -586
  350. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +694 -692
  351. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +775 -776
  352. data/vendor/kreuzberg/tests/security_validation.rs +416 -415
  353. data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -888
  354. data/vendor/kreuzberg/tests/test_fastembed.rs +631 -609
  355. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1260 -1259
  356. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +648 -647
  357. data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -87
  358. data/vendor/kreuzberg-ffi/Cargo.toml +67 -0
  359. data/vendor/kreuzberg-ffi/README.md +851 -0
  360. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +227 -0
  361. data/vendor/kreuzberg-ffi/build.rs +168 -0
  362. data/vendor/kreuzberg-ffi/cbindgen.toml +37 -0
  363. data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +12 -0
  364. data/vendor/kreuzberg-ffi/kreuzberg.h +3012 -0
  365. data/vendor/kreuzberg-ffi/src/batch_streaming.rs +588 -0
  366. data/vendor/kreuzberg-ffi/src/config.rs +1341 -0
  367. data/vendor/kreuzberg-ffi/src/error.rs +901 -0
  368. data/vendor/kreuzberg-ffi/src/extraction.rs +555 -0
  369. data/vendor/kreuzberg-ffi/src/helpers.rs +879 -0
  370. data/vendor/kreuzberg-ffi/src/lib.rs +977 -0
  371. data/vendor/kreuzberg-ffi/src/memory.rs +493 -0
  372. data/vendor/kreuzberg-ffi/src/mime.rs +329 -0
  373. data/vendor/kreuzberg-ffi/src/panic_shield.rs +265 -0
  374. data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +442 -0
  375. data/vendor/kreuzberg-ffi/src/plugins/mod.rs +14 -0
  376. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +628 -0
  377. data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +438 -0
  378. data/vendor/kreuzberg-ffi/src/plugins/validator.rs +329 -0
  379. data/vendor/kreuzberg-ffi/src/result.rs +510 -0
  380. data/vendor/kreuzberg-ffi/src/result_pool.rs +639 -0
  381. data/vendor/kreuzberg-ffi/src/result_view.rs +773 -0
  382. data/vendor/kreuzberg-ffi/src/string_intern.rs +568 -0
  383. data/vendor/kreuzberg-ffi/src/types.rs +363 -0
  384. data/vendor/kreuzberg-ffi/src/util.rs +210 -0
  385. data/vendor/kreuzberg-ffi/src/validation.rs +848 -0
  386. data/vendor/kreuzberg-ffi/tests.disabled/README.md +48 -0
  387. data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +299 -0
  388. data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +346 -0
  389. data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +232 -0
  390. data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +470 -0
  391. data/vendor/kreuzberg-tesseract/.commitlintrc.json +13 -0
  392. data/vendor/kreuzberg-tesseract/.crate-ignore +2 -0
  393. data/vendor/kreuzberg-tesseract/Cargo.lock +2933 -0
  394. data/vendor/kreuzberg-tesseract/Cargo.toml +57 -0
  395. data/vendor/{rb-sys/LICENSE-MIT → kreuzberg-tesseract/LICENSE} +22 -21
  396. data/vendor/kreuzberg-tesseract/README.md +399 -0
  397. data/vendor/kreuzberg-tesseract/build.rs +1127 -0
  398. data/vendor/kreuzberg-tesseract/patches/README.md +71 -0
  399. data/vendor/kreuzberg-tesseract/patches/tesseract.diff +199 -0
  400. data/vendor/kreuzberg-tesseract/src/api.rs +1371 -0
  401. data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +77 -0
  402. data/vendor/kreuzberg-tesseract/src/enums.rs +297 -0
  403. data/vendor/kreuzberg-tesseract/src/error.rs +81 -0
  404. data/vendor/kreuzberg-tesseract/src/lib.rs +145 -0
  405. data/vendor/kreuzberg-tesseract/src/monitor.rs +57 -0
  406. data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +197 -0
  407. data/vendor/kreuzberg-tesseract/src/page_iterator.rs +253 -0
  408. data/vendor/kreuzberg-tesseract/src/result_iterator.rs +286 -0
  409. data/vendor/kreuzberg-tesseract/src/result_renderer.rs +183 -0
  410. data/vendor/kreuzberg-tesseract/tests/integration_test.rs +211 -0
  411. metadata +196 -45
  412. data/vendor/kreuzberg/benches/otel_overhead.rs +0 -48
  413. data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +0 -738
  414. data/vendor/rb-sys/.cargo-ok +0 -1
  415. data/vendor/rb-sys/.cargo_vcs_info.json +0 -6
  416. data/vendor/rb-sys/Cargo.lock +0 -393
  417. data/vendor/rb-sys/Cargo.toml +0 -70
  418. data/vendor/rb-sys/Cargo.toml.orig +0 -57
  419. data/vendor/rb-sys/LICENSE-APACHE +0 -190
  420. data/vendor/rb-sys/bin/release.sh +0 -21
  421. data/vendor/rb-sys/build/features.rs +0 -108
  422. data/vendor/rb-sys/build/main.rs +0 -246
  423. data/vendor/rb-sys/build/stable_api_config.rs +0 -153
  424. data/vendor/rb-sys/build/version.rs +0 -48
  425. data/vendor/rb-sys/readme.md +0 -36
  426. data/vendor/rb-sys/src/bindings.rs +0 -21
  427. data/vendor/rb-sys/src/hidden.rs +0 -11
  428. data/vendor/rb-sys/src/lib.rs +0 -34
  429. data/vendor/rb-sys/src/macros.rs +0 -371
  430. data/vendor/rb-sys/src/memory.rs +0 -53
  431. data/vendor/rb-sys/src/ruby_abi_version.rs +0 -38
  432. data/vendor/rb-sys/src/special_consts.rs +0 -31
  433. data/vendor/rb-sys/src/stable_api/compiled.c +0 -179
  434. data/vendor/rb-sys/src/stable_api/compiled.rs +0 -257
  435. data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +0 -316
  436. data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +0 -316
  437. data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +0 -324
  438. data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +0 -317
  439. data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +0 -315
  440. data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +0 -326
  441. data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +0 -327
  442. data/vendor/rb-sys/src/stable_api.rs +0 -261
  443. data/vendor/rb-sys/src/symbol.rs +0 -31
  444. data/vendor/rb-sys/src/tracking_allocator.rs +0 -332
  445. data/vendor/rb-sys/src/utils.rs +0 -89
  446. data/vendor/rb-sys/src/value_type.rs +0 -7
@@ -1,691 +1,936 @@
1
- # frozen_string_literal: true
2
-
3
- module Kreuzberg
4
- module Config
5
- # OCR configuration
6
- #
7
- # @example
8
- # ocr = OCR.new(backend: "tesseract", language: "eng")
9
- #
10
- class OCR
11
- attr_reader :backend, :language, :tesseract_config
12
-
13
- def initialize(
14
- backend: 'tesseract',
15
- language: 'eng',
16
- tesseract_config: nil
17
- )
18
- @backend = backend.to_s
19
- @language = language.to_s
20
- @tesseract_config = normalize_tesseract_config(tesseract_config)
21
- end
22
-
23
- def to_h
24
- {
25
- backend: @backend,
26
- language: @language,
27
- tesseract_config: @tesseract_config&.to_h
28
- }.compact
29
- end
30
-
31
- private
32
-
33
- def normalize_tesseract_config(value)
34
- return nil if value.nil?
35
- return value if value.is_a?(Tesseract)
36
- return Tesseract.new(**value.transform_keys(&:to_sym)) if value.is_a?(Hash)
37
-
38
- raise ArgumentError, "Expected #{Tesseract}, Hash, or nil, got #{value.class}"
39
- end
40
- end
41
-
42
- # Tesseract OCR engine configuration
43
- class Tesseract
44
- attr_reader :options
45
-
46
- def initialize(**options)
47
- @options = options.transform_keys(&:to_sym)
48
- normalize_nested_preprocessing!
49
- end
50
-
51
- def to_h
52
- @options.dup
53
- end
54
-
55
- private
56
-
57
- def normalize_nested_preprocessing!
58
- preprocessing = @options[:preprocessing]
59
- return if preprocessing.nil?
60
- return if preprocessing.is_a?(ImagePreprocessing)
61
- return @options[:preprocessing] = ImagePreprocessing.new(**preprocessing.transform_keys(&:to_sym)) if
62
- preprocessing.is_a?(Hash)
63
-
64
- raise ArgumentError, "preprocessing must be #{ImagePreprocessing} or Hash"
65
- end
66
- end
67
-
68
- # Chunking configuration
69
- #
70
- # @example
71
- # chunking = Chunking.new(max_chars: 1000, max_overlap: 200)
72
- #
73
- class Chunking
74
- attr_reader :max_chars, :max_overlap, :preset, :embedding, :enabled
75
-
76
- def initialize(
77
- max_chars: nil,
78
- max_overlap: nil,
79
- preset: nil,
80
- embedding: nil,
81
- chunk_size: nil,
82
- chunk_overlap: nil,
83
- enabled: true
84
- )
85
- resolved_size = chunk_size || max_chars || 1000
86
- resolved_overlap = chunk_overlap || max_overlap || 200
87
-
88
- @max_chars = resolved_size.to_i
89
- @max_overlap = resolved_overlap.to_i
90
- @preset = preset&.to_s
91
- @embedding = normalize_embedding(embedding)
92
- @enabled = boolean_or_nil(enabled)
93
- end
94
-
95
- def to_h
96
- config = {
97
- max_chars: @max_chars,
98
- max_overlap: @max_overlap,
99
- preset: @preset,
100
- embedding: @embedding&.to_h
101
- }.compact
102
- # @type var config: Hash[Symbol, untyped]
103
- config[:enabled] = @enabled unless @enabled.nil?
104
- config
105
- end
106
-
107
- private
108
-
109
- def normalize_embedding(value)
110
- return nil if value.nil?
111
- return value if value.is_a?(Embedding)
112
- return Embedding.new(**value.transform_keys(&:to_sym)) if value.is_a?(Hash)
113
-
114
- raise ArgumentError, "Expected #{Embedding}, Hash, or nil, got #{value.class}"
115
- end
116
-
117
- def boolean_or_nil(value)
118
- return nil if value.nil?
119
-
120
- value ? true : false
121
- end
122
- end
123
-
124
- # Embedding model configuration for document chunking
125
- class Embedding
126
- attr_reader :model, :normalize, :batch_size, :show_download_progress, :cache_dir
127
-
128
- def initialize(
129
- model: { type: :preset, name: 'balanced' },
130
- normalize: true,
131
- batch_size: 32,
132
- show_download_progress: false,
133
- cache_dir: nil
134
- )
135
- @model = normalize_model(model)
136
- @normalize = boolean_or_nil(normalize)
137
- @batch_size = batch_size&.to_i
138
- @show_download_progress = boolean_or_nil(show_download_progress)
139
- @cache_dir = cache_dir&.to_s
140
- end
141
-
142
- def to_h
143
- {
144
- model: @model,
145
- normalize: @normalize,
146
- batch_size: @batch_size,
147
- show_download_progress: @show_download_progress,
148
- cache_dir: @cache_dir
149
- }.compact
150
- end
151
-
152
- private
153
-
154
- def normalize_model(model)
155
- normalized = if model.respond_to?(:to_h)
156
- model.to_h
157
- else
158
- model
159
- end
160
- raise ArgumentError, 'model must be a Hash describing the embedding model' unless normalized.is_a?(Hash)
161
-
162
- normalized.transform_keys(&:to_sym)
163
- end
164
-
165
- def boolean_or_nil(value)
166
- return nil if value.nil?
167
-
168
- value ? true : false
169
- end
170
- end
171
-
172
- # Language detection configuration
173
- #
174
- # @example
175
- # lang = LanguageDetection.new(enabled: true, min_confidence: 0.8)
176
- #
177
- class LanguageDetection
178
- attr_reader :enabled, :min_confidence, :detect_multiple
179
-
180
- def initialize(enabled: false, min_confidence: 0.5, detect_multiple: false)
181
- @enabled = enabled ? true : false
182
- @min_confidence = min_confidence.to_f
183
- @detect_multiple = detect_multiple ? true : false
184
- end
185
-
186
- def to_h
187
- {
188
- enabled: @enabled,
189
- min_confidence: @min_confidence,
190
- detect_multiple: @detect_multiple
191
- }
192
- end
193
- end
194
-
195
- # PDF-specific options
196
- #
197
- # @example
198
- # pdf = PDF.new(extract_images: true, passwords: ["secret", "backup"])
199
- #
200
- class PDF
201
- attr_reader :extract_images, :passwords, :extract_metadata
202
-
203
- def initialize(
204
- extract_images: false,
205
- passwords: nil,
206
- extract_metadata: true
207
- )
208
- @extract_images = extract_images ? true : false
209
- @passwords = if passwords.is_a?(Array)
210
- passwords.map(&:to_s)
211
- else
212
- (passwords ? [passwords.to_s] : nil)
213
- end
214
- @extract_metadata = extract_metadata ? true : false
215
- end
216
-
217
- def to_h
218
- {
219
- extract_images: @extract_images,
220
- passwords: @passwords,
221
- extract_metadata: @extract_metadata
222
- }.compact
223
- end
224
- end
225
-
226
- # Image extraction configuration
227
- #
228
- # @example
229
- # image = ImageExtraction.new(extract_images: true, target_dpi: 300)
230
- #
231
- # @example With auto-adjust DPI
232
- # image = ImageExtraction.new(
233
- # extract_images: true,
234
- # auto_adjust_dpi: true,
235
- # min_dpi: 150,
236
- # max_dpi: 600
237
- # )
238
- #
239
- class ImageExtraction
240
- attr_reader :extract_images, :target_dpi, :max_image_dimension,
241
- :auto_adjust_dpi, :min_dpi, :max_dpi
242
-
243
- def initialize(
244
- extract_images: true,
245
- target_dpi: 300,
246
- max_image_dimension: 2000,
247
- auto_adjust_dpi: true,
248
- min_dpi: 150,
249
- max_dpi: 600
250
- )
251
- @extract_images = extract_images ? true : false
252
- @target_dpi = target_dpi.to_i
253
- @max_image_dimension = max_image_dimension.to_i
254
- @auto_adjust_dpi = auto_adjust_dpi ? true : false
255
- @min_dpi = min_dpi.to_i
256
- @max_dpi = max_dpi.to_i
257
- end
258
-
259
- def to_h
260
- {
261
- extract_images: @extract_images,
262
- target_dpi: @target_dpi,
263
- max_image_dimension: @max_image_dimension,
264
- auto_adjust_dpi: @auto_adjust_dpi,
265
- min_dpi: @min_dpi,
266
- max_dpi: @max_dpi
267
- }
268
- end
269
- end
270
-
271
- # Image preprocessing configuration for OCR
272
- #
273
- # @example Basic preprocessing
274
- # preprocessing = ImagePreprocessing.new(
275
- # binarization_method: "otsu",
276
- # denoise: true
277
- # )
278
- #
279
- # @example Advanced preprocessing
280
- # preprocessing = ImagePreprocessing.new(
281
- # target_dpi: 600,
282
- # auto_rotate: true,
283
- # deskew: true,
284
- # denoise: true,
285
- # contrast_enhance: true,
286
- # binarization_method: "sauvola",
287
- # invert_colors: false
288
- # )
289
- #
290
- class ImagePreprocessing
291
- attr_reader :target_dpi, :auto_rotate, :deskew, :denoise,
292
- :contrast_enhance, :binarization_method, :invert_colors
293
-
294
- def initialize(
295
- target_dpi: 300,
296
- auto_rotate: true,
297
- deskew: true,
298
- denoise: false,
299
- contrast_enhance: true,
300
- binarization_method: 'otsu',
301
- invert_colors: false
302
- )
303
- @target_dpi = target_dpi.to_i
304
- @auto_rotate = auto_rotate ? true : false
305
- @deskew = deskew ? true : false
306
- @denoise = denoise ? true : false
307
- @contrast_enhance = contrast_enhance ? true : false
308
- @binarization_method = binarization_method.to_s
309
- @invert_colors = invert_colors ? true : false
310
-
311
- valid_methods = %w[otsu sauvola adaptive]
312
- return if valid_methods.include?(@binarization_method)
313
-
314
- raise ArgumentError, "binarization_method must be one of: #{valid_methods.join(', ')}"
315
- end
316
-
317
- def to_h
318
- {
319
- target_dpi: @target_dpi,
320
- auto_rotate: @auto_rotate,
321
- deskew: @deskew,
322
- denoise: @denoise,
323
- contrast_enhance: @contrast_enhance,
324
- binarization_method: @binarization_method,
325
- invert_colors: @invert_colors
326
- }
327
- end
328
- end
329
-
330
- # Token reduction configuration
331
- #
332
- # @example Disable token reduction
333
- # token = TokenReduction.new(mode: "off")
334
- #
335
- # @example Light reduction
336
- # token = TokenReduction.new(mode: "light", preserve_important_words: true)
337
- #
338
- # @example Aggressive reduction
339
- # token = TokenReduction.new(mode: "aggressive", preserve_important_words: false)
340
- #
341
- class TokenReduction
342
- attr_reader :mode, :preserve_important_words
343
-
344
- def initialize(mode: 'off', preserve_important_words: true)
345
- @mode = mode.to_s
346
- @preserve_important_words = preserve_important_words ? true : false
347
-
348
- valid_modes = %w[off light moderate aggressive maximum]
349
- return if valid_modes.include?(@mode)
350
-
351
- raise ArgumentError, "mode must be one of: #{valid_modes.join(', ')}"
352
- end
353
-
354
- def to_h
355
- {
356
- mode: @mode,
357
- preserve_important_words: @preserve_important_words
358
- }
359
- end
360
- end
361
-
362
- # HTML preprocessing configuration for content extraction
363
- class HtmlPreprocessing
364
- attr_reader :enabled, :preset, :remove_navigation, :remove_forms
365
-
366
- def initialize(enabled: nil, preset: nil, remove_navigation: nil, remove_forms: nil)
367
- @enabled = boolean_or_nil(enabled)
368
- @preset = preset&.to_sym
369
- @remove_navigation = boolean_or_nil(remove_navigation)
370
- @remove_forms = boolean_or_nil(remove_forms)
371
- end
372
-
373
- def to_h
374
- {
375
- enabled: @enabled,
376
- preset: @preset,
377
- remove_navigation: @remove_navigation,
378
- remove_forms: @remove_forms
379
- }.compact
380
- end
381
-
382
- private
383
-
384
- def boolean_or_nil(value)
385
- return nil if value.nil?
386
-
387
- value ? true : false
388
- end
389
- end
390
-
391
- # HTML rendering options for document conversion
392
- class HtmlOptions
393
- attr_reader :options
394
-
395
- def initialize(**options)
396
- normalized = options.transform_keys(&:to_sym)
397
- symbol_keys = %i[
398
- heading_style
399
- code_block_style
400
- highlight_style
401
- list_indent_type
402
- newline_style
403
- whitespace_mode
404
- ]
405
- symbol_keys.each do |key|
406
- normalized[key] = normalized[key]&.to_sym if normalized.key?(key)
407
- end
408
- if normalized[:preprocessing].is_a?(Hash)
409
- normalized[:preprocessing] = HtmlPreprocessing.new(**normalized[:preprocessing])
410
- end
411
- @options = normalized
412
- end
413
-
414
- def to_h
415
- @options.transform_values { |value| value.respond_to?(:to_h) ? value.to_h : value }
416
- end
417
- end
418
-
419
- # YAKE keyword extraction parameters
420
- class KeywordYakeParams
421
- attr_reader :window_size
422
-
423
- def initialize(window_size: 2)
424
- @window_size = window_size.to_i
425
- end
426
-
427
- def to_h
428
- { window_size: @window_size }
429
- end
430
- end
431
-
432
- # RAKE keyword extraction parameters
433
- class KeywordRakeParams
434
- attr_reader :min_word_length, :max_words_per_phrase
435
-
436
- def initialize(min_word_length: 1, max_words_per_phrase: 3)
437
- @min_word_length = min_word_length.to_i
438
- @max_words_per_phrase = max_words_per_phrase.to_i
439
- end
440
-
441
- def to_h
442
- {
443
- min_word_length: @min_word_length,
444
- max_words_per_phrase: @max_words_per_phrase
445
- }
446
- end
447
- end
448
-
449
- # Keyword extraction configuration for document analysis
450
- class Keywords
451
- attr_reader :algorithm, :max_keywords, :min_score, :ngram_range,
452
- :language, :yake_params, :rake_params
453
-
454
- def initialize(
455
- algorithm: nil,
456
- max_keywords: nil,
457
- min_score: nil,
458
- ngram_range: nil,
459
- language: nil,
460
- yake_params: nil,
461
- rake_params: nil
462
- )
463
- @algorithm = algorithm&.to_s
464
- @max_keywords = max_keywords&.to_i
465
- @min_score = min_score&.to_f
466
- @ngram_range = ngram_range&.map(&:to_i)
467
- @language = language&.to_s
468
- @yake_params = normalize_nested(yake_params, KeywordYakeParams)
469
- @rake_params = normalize_nested(rake_params, KeywordRakeParams)
470
- end
471
-
472
- def to_h
473
- {
474
- algorithm: @algorithm,
475
- max_keywords: @max_keywords,
476
- min_score: @min_score,
477
- ngram_range: @ngram_range,
478
- language: @language,
479
- yake_params: @yake_params&.to_h,
480
- rake_params: @rake_params&.to_h
481
- }.compact
482
- end
483
-
484
- private
485
-
486
- def normalize_nested(value, klass)
487
- return nil if value.nil?
488
- return value if value.is_a?(klass)
489
- return klass.new(**value.transform_keys(&:to_sym)) if value.is_a?(Hash)
490
-
491
- raise ArgumentError, "Expected #{klass}, Hash, or nil, got #{value.class}"
492
- end
493
- end
494
-
495
- # Post-processor configuration
496
- #
497
- # @example Enable all post-processors
498
- # postprocessor = PostProcessor.new(enabled: true)
499
- #
500
- # @example Enable specific processors
501
- # postprocessor = PostProcessor.new(
502
- # enabled: true,
503
- # enabled_processors: ["quality", "formatting"]
504
- # )
505
- #
506
- # @example Disable specific processors
507
- # postprocessor = PostProcessor.new(
508
- # enabled: true,
509
- # disabled_processors: ["token_reduction"]
510
- # )
511
- #
512
- class PostProcessor
513
- attr_reader :enabled, :enabled_processors, :disabled_processors
514
-
515
- def initialize(
516
- enabled: true,
517
- enabled_processors: nil,
518
- disabled_processors: nil
519
- )
520
- @enabled = enabled ? true : false
521
- @enabled_processors = enabled_processors&.map(&:to_s)
522
- @disabled_processors = disabled_processors&.map(&:to_s)
523
- end
524
-
525
- def to_h
526
- {
527
- enabled: @enabled,
528
- enabled_processors: @enabled_processors,
529
- disabled_processors: @disabled_processors
530
- }.compact
531
- end
532
- end
533
-
534
- # Main extraction configuration
535
- #
536
- # @example Basic usage
537
- # config = Extraction.new(use_cache: true, force_ocr: true)
538
- #
539
- # @example With OCR
540
- # ocr = Config::OCR.new(backend: "tesseract", language: "eng")
541
- # config = Extraction.new(ocr: ocr)
542
- #
543
- # @example With image extraction
544
- # image = Config::ImageExtraction.new(extract_images: true, target_dpi: 600)
545
- # config = Extraction.new(image_extraction: image)
546
- #
547
- # @example With preprocessing
548
- # preprocessing = Config::ImagePreprocessing.new(
549
- # binarization_method: "sauvola",
550
- # denoise: true
551
- # )
552
- # config = Extraction.new(image_preprocessing: preprocessing)
553
- #
554
- # @example With post-processing
555
- # postprocessor = Config::PostProcessor.new(
556
- # enabled: true,
557
- # enabled_processors: ["quality"]
558
- # )
559
- # config = Extraction.new(postprocessor: postprocessor)
560
- #
561
- # @example With all options
562
- # config = Extraction.new(
563
- # use_cache: true,
564
- # enable_quality_processing: true,
565
- # force_ocr: false,
566
- # ocr: Config::OCR.new(language: "deu"),
567
- # chunking: Config::Chunking.new(max_chars: 500),
568
- # language_detection: Config::LanguageDetection.new(enabled: true),
569
- # pdf_options: Config::PDF.new(extract_images: true, passwords: ["secret"]),
570
- # image_extraction: Config::ImageExtraction.new(target_dpi: 600),
571
- # image_preprocessing: Config::ImagePreprocessing.new(denoise: true),
572
- # postprocessor: Config::PostProcessor.new(enabled: true)
573
- # )
574
- #
575
- class Extraction
576
- attr_reader :use_cache, :enable_quality_processing, :force_ocr,
577
- :ocr, :chunking, :language_detection, :pdf_options,
578
- :image_extraction, :image_preprocessing, :postprocessor,
579
- :token_reduction, :keywords, :html_options,
580
- :max_concurrent_extractions
581
-
582
- # Load configuration from a file.
583
- #
584
- # Detects the file format from the extension (.toml, .yaml, .json)
585
- # and loads the configuration accordingly.
586
- #
587
- # @param path [String] Path to the configuration file
588
- # @return [Kreuzberg::Config::Extraction] Loaded configuration object
589
- #
590
- # @example Load from TOML
591
- # config = Kreuzberg::Config::Extraction.from_file("config.toml")
592
- #
593
- # @example Load from YAML
594
- # config = Kreuzberg::Config::Extraction.from_file("config.yaml")
595
- #
596
- def self.from_file(path)
597
- hash = Kreuzberg._config_from_file_native(path)
598
- # Convert string keys to symbols for keyword arguments
599
- new(**hash.transform_keys(&:to_sym))
600
- end
601
-
602
- # Discover configuration file in current or parent directories.
603
- #
604
- # Searches for kreuzberg.toml, kreuzberg.yaml, or kreuzberg.json in the current
605
- # directory and parent directories.
606
- #
607
- # @return [Kreuzberg::Config::Extraction, nil] Loaded configuration object or nil if not found
608
- #
609
- # @example
610
- # config = Kreuzberg::Config::Extraction.discover
611
- # if config
612
- # # Use discovered config
613
- # end
614
- #
615
- def self.discover
616
- hash = Kreuzberg._config_discover_native
617
- return nil if hash.nil?
618
-
619
- # Convert string keys to symbols for keyword arguments
620
- new(**hash.transform_keys(&:to_sym))
621
- end
622
-
623
- def initialize(
624
- use_cache: true,
625
- enable_quality_processing: false,
626
- force_ocr: false,
627
- ocr: nil,
628
- chunking: nil,
629
- language_detection: nil,
630
- pdf_options: nil,
631
- image_extraction: nil,
632
- image_preprocessing: nil,
633
- postprocessor: nil,
634
- token_reduction: nil,
635
- keywords: nil,
636
- html_options: nil,
637
- max_concurrent_extractions: nil
638
- )
639
- @use_cache = use_cache ? true : false
640
- @enable_quality_processing = enable_quality_processing ? true : false
641
- @force_ocr = force_ocr ? true : false
642
- @ocr = normalize_config(ocr, OCR)
643
- @chunking = normalize_config(chunking, Chunking)
644
- @language_detection = normalize_config(language_detection, LanguageDetection)
645
- @pdf_options = normalize_config(pdf_options, PDF)
646
- @image_extraction = normalize_config(image_extraction, ImageExtraction)
647
- @image_preprocessing = normalize_config(image_preprocessing, ImagePreprocessing)
648
- @postprocessor = normalize_config(postprocessor, PostProcessor)
649
- @token_reduction = normalize_config(token_reduction, TokenReduction)
650
- @keywords = normalize_config(keywords, Keywords)
651
- @html_options = normalize_config(html_options, HtmlOptions)
652
- @max_concurrent_extractions = max_concurrent_extractions&.to_i
653
- end
654
-
655
- # rubocop:disable Metrics/CyclomaticComplexity
656
- def to_h
657
- {
658
- use_cache: @use_cache,
659
- enable_quality_processing: @enable_quality_processing,
660
- force_ocr: @force_ocr,
661
- ocr: @ocr&.to_h,
662
- chunking: @chunking&.to_h,
663
- language_detection: @language_detection&.to_h,
664
- pdf_options: @pdf_options&.to_h,
665
- image_extraction: @image_extraction&.to_h,
666
- image_preprocessing: @image_preprocessing&.to_h,
667
- postprocessor: @postprocessor&.to_h,
668
- token_reduction: @token_reduction&.to_h,
669
- keywords: @keywords&.to_h,
670
- html_options: @html_options&.to_h,
671
- max_concurrent_extractions: @max_concurrent_extractions
672
- }.compact
673
- end
674
- # rubocop:enable Metrics/CyclomaticComplexity
675
-
676
- private
677
-
678
- def normalize_config(value, klass)
679
- return nil if value.nil?
680
- return value if value.is_a?(klass)
681
- # Convert string keys to symbols for keyword arguments
682
- return klass.new(**value.transform_keys(&:to_sym)) if value.is_a?(Hash)
683
-
684
- raise ArgumentError, "Expected #{klass}, Hash, or nil, got #{value.class}"
685
- end
686
- end
687
-
688
- # Backwards compatibility aliases
689
- Ocr = OCR
690
- end
691
- end
1
+ # frozen_string_literal: true
2
+
3
+ require 'json'
4
+
5
+ module Kreuzberg
6
+ module Config
7
# OCR configuration: backend name, language code and optional nested
# Tesseract engine settings.
#
# @example
#   ocr = OCR.new(backend: 'tesseract', language: 'eng')
#
class OCR
  attr_reader :backend, :language, :tesseract_config

  # @param backend [#to_s] backend name; stored as a String
  # @param language [#to_s] language code; stored as a String
  # @param tesseract_config [Tesseract, Hash, nil] engine settings; a Hash is
  #   wrapped in a Tesseract instance after its keys are symbolized
  # @raise [ArgumentError] when tesseract_config is any other type
  def initialize(
    backend: 'tesseract',
    language: 'eng',
    tesseract_config: nil
  )
    @backend = backend.to_s
    @language = language.to_s
    @tesseract_config = normalize_tesseract_config(tesseract_config)
  end

  # @return [Hash{Symbol => Object}] the settings, with nil entries dropped
  def to_h
    serialized = {
      backend: @backend,
      language: @language,
      tesseract_config: @tesseract_config&.to_h
    }
    serialized.reject { |_key, entry| entry.nil? }
  end

  private

  # Coerces the nested engine settings into a Tesseract instance (or nil).
  def normalize_tesseract_config(value)
    return nil if value.nil?

    case value
    when Tesseract
      value
    when Hash
      Tesseract.new(**value.transform_keys(&:to_sym))
    else
      raise ArgumentError, "Expected #{Tesseract}, Hash, or nil, got #{value.class}"
    end
  end
end
39
+
40
# Tesseract OCR engine configuration.
#
# Accepts arbitrary engine options as keyword arguments. Keys are symbolized;
# a nested `preprocessing` Hash is wrapped in an ImagePreprocessing instance.
#
# @example
#   tesseract = Tesseract.new(psm: 6, preprocessing: { denoise: true })
#
class Tesseract
  attr_reader :options

  # @param options [Hash] engine options; `:preprocessing` may be an
  #   ImagePreprocessing, a Hash, or nil
  # @raise [ArgumentError] when `:preprocessing` is any other type
  def initialize(**options)
    @options = options.transform_keys(&:to_sym)
    normalize_nested_preprocessing!
  end

  # Serializes the options to a plain Hash.
  #
  # The nested preprocessing object is converted with its own `to_h` so the
  # result contains only plain values, matching how the other config classes
  # in this file serialize their nested configs.
  #
  # @return [Hash{Symbol => Object}] a new Hash; the stored options are not mutated
  def to_h
    preprocessing = @options[:preprocessing]
    return @options.dup if preprocessing.nil?

    # merge keeps the original key order while replacing the nested object.
    @options.merge(preprocessing: preprocessing.to_h)
  end

  private

  # Replaces a Hash under `:preprocessing` with an ImagePreprocessing instance.
  def normalize_nested_preprocessing!
    preprocessing = @options[:preprocessing]
    return if preprocessing.nil?
    return if preprocessing.is_a?(ImagePreprocessing)

    unless preprocessing.is_a?(Hash)
      raise ArgumentError, "preprocessing must be #{ImagePreprocessing} or Hash"
    end

    @options[:preprocessing] = ImagePreprocessing.new(**preprocessing.transform_keys(&:to_sym))
  end
end
65
+
66
# Chunking configuration
#
# `chunk_size` / `chunk_overlap` are accepted as alternative names and win
# over `max_chars` / `max_overlap` when both are given.
#
# NOTE(review): the guards below only reject negative numbers, so zero is
# accepted even though the error text says "positive".
#
# @example
#   chunking = Chunking.new(max_chars: 1000, max_overlap: 200)
#
class Chunking
  attr_reader :max_chars, :max_overlap, :preset, :embedding, :enabled

  # @param max_chars [#to_i, nil] chunk size (default 1000)
  # @param max_overlap [#to_i, nil] overlap between chunks (default 200)
  # @param preset [#to_s, nil] preset name, stored as a String
  # @param embedding [Embedding, Hash, nil] nested embedding settings
  # @param chunk_size [#to_i, nil] alternative name for max_chars
  # @param chunk_overlap [#to_i, nil] alternative name for max_overlap
  # @param enabled [Object, nil] coerced to true/false; nil is kept as nil
  # @raise [ArgumentError] on a negative size/overlap or an unsupported embedding type
  # rubocop:disable Metrics/CyclomaticComplexity
  def initialize(
    max_chars: nil,
    max_overlap: nil,
    preset: nil,
    embedding: nil,
    chunk_size: nil,
    chunk_overlap: nil,
    enabled: true
  )
    # rubocop:enable Metrics/CyclomaticComplexity
    @max_chars = (chunk_size || max_chars || 1000).to_i
    @max_overlap = (chunk_overlap || max_overlap || 200).to_i

    if @max_chars.negative?
      raise ArgumentError, "max_chars must be a positive integer, got #{@max_chars}"
    end
    if @max_overlap.negative?
      raise ArgumentError, "max_overlap must be a positive integer, got #{@max_overlap}"
    end

    @preset = preset&.to_s
    @embedding = normalize_embedding(embedding)
    @enabled = boolean_or_nil(enabled)
  end

  # @return [Hash{Symbol => Object}] settings with nil entries omitted
  def to_h
    entries = [
      [:max_chars, @max_chars],
      [:max_overlap, @max_overlap],
      [:preset, @preset],
      [:embedding, @embedding&.to_h],
      [:enabled, @enabled]
    ]
    entries.reject { |_name, entry| entry.nil? }.to_h
  end

  private

  # Coerces the nested embedding settings into an Embedding instance (or nil).
  def normalize_embedding(value)
    return nil if value.nil?

    case value
    when Embedding
      value
    when Hash
      Embedding.new(**value.transform_keys(&:to_sym))
    else
      raise ArgumentError, "Expected #{Embedding}, Hash, or nil, got #{value.class}"
    end
  end

  # Maps nil to nil and everything else to a strict true/false.
  def boolean_or_nil(value)
    return if value.nil?

    value ? true : false
  end
end
128
+
129
# Embedding model configuration for document chunking
#
# @example
#   embedding = Embedding.new(model: { type: :preset, name: 'balanced' }, batch_size: 16)
#
class Embedding
  attr_reader :model, :normalize, :batch_size, :show_download_progress, :cache_dir

  # @param model [Hash, #to_h] model description; keys are symbolized
  # @param normalize [Object, nil] coerced to true/false; nil is kept
  # @param batch_size [#to_i, nil] stored as an Integer when given
  # @param show_download_progress [Object, nil] coerced to true/false; nil is kept
  # @param cache_dir [#to_s, nil] stored as a String when given
  # @raise [ArgumentError] when model cannot be turned into a Hash
  def initialize(
    model: { type: :preset, name: 'balanced' },
    normalize: true,
    batch_size: 32,
    show_download_progress: false,
    cache_dir: nil
  )
    @model = normalize_model(model)
    @normalize = boolean_or_nil(normalize)
    @batch_size = batch_size&.to_i
    @show_download_progress = boolean_or_nil(show_download_progress)
    @cache_dir = cache_dir&.to_s
  end

  # @return [Hash{Symbol => Object}] settings with nil entries omitted
  def to_h
    serialized = {
      model: @model,
      normalize: @normalize,
      batch_size: @batch_size,
      show_download_progress: @show_download_progress,
      cache_dir: @cache_dir
    }
    serialized.reject { |_name, entry| entry.nil? }
  end

  private

  # Converts the model description to a Hash with Symbol keys.
  def normalize_model(model)
    candidate = model.respond_to?(:to_h) ? model.to_h : model
    unless candidate.is_a?(Hash)
      raise ArgumentError, 'model must be a Hash describing the embedding model'
    end

    candidate.transform_keys { |key| key.to_sym }
  end

  # Maps nil to nil and everything else to a strict true/false.
  def boolean_or_nil(value)
    return if value.nil?

    value ? true : false
  end
end
176
+
177
# Language detection configuration
#
# @example
#   lang = LanguageDetection.new(enabled: true, min_confidence: 0.8)
#
class LanguageDetection
  attr_reader :enabled, :min_confidence, :detect_multiple

  # @param enabled [Object] coerced to true/false
  # @param min_confidence [#to_f] stored as a Float
  # @param detect_multiple [Object] coerced to true/false
  def initialize(enabled: false, min_confidence: 0.5, detect_multiple: false)
    @enabled, @detect_multiple = [enabled, detect_multiple].map { |flag| flag ? true : false }
    @min_confidence = min_confidence.to_f
  end

  # @return [Hash{Symbol => Object}] all three settings
  def to_h
    serialized = {}
    serialized[:enabled] = @enabled
    serialized[:min_confidence] = @min_confidence
    serialized[:detect_multiple] = @detect_multiple
    serialized
  end
end
199
+
200
# Font configuration for PDF rendering
#
# Both attributes stay writable. The writers apply the same coercion as the
# constructor, so `to_h` always emits a strict boolean and an Array of
# Strings (or omits the directories when unset).
#
# @example
#   font_config = FontConfig.new(enabled: true, custom_font_dirs: ["/usr/share/fonts"])
#
class FontConfig
  attr_reader :enabled, :custom_font_dirs

  # @param enabled [Object] coerced to true/false
  # @param custom_font_dirs [Array<#to_s>, #to_s, nil] one directory or a list
  #   of directories; entries are stored as Strings
  def initialize(enabled: true, custom_font_dirs: nil)
    self.enabled = enabled
    self.custom_font_dirs = custom_font_dirs
  end

  # @param value [Object] coerced to true/false
  def enabled=(value)
    @enabled = value ? true : false
  end

  # Accepts a single path or a list, mirroring how PDF coerces `passwords`.
  #
  # @param value [Array<#to_s>, #to_s, nil]
  def custom_font_dirs=(value)
    @custom_font_dirs =
      if value.is_a?(Array)
        value.map(&:to_s)
      elsif value
        [value.to_s]
      end
  end

  # @return [Hash{Symbol => Object}] settings; `custom_font_dirs` is omitted when nil
  def to_h
    {
      enabled: @enabled,
      custom_font_dirs: @custom_font_dirs
    }.compact
  end
end
220
+
221
# Hierarchy detection configuration
#
# @example
#   hierarchy = Hierarchy.new(enabled: true, k_clusters: 6, include_bbox: true)
#
class Hierarchy
  attr_reader :enabled, :k_clusters, :include_bbox, :ocr_coverage_threshold

  # Builds an instance from loosely typed input.
  #
  # @param hash [Hierarchy, Hash, nil] an existing instance is returned as-is;
  #   a Hash is used as keyword arguments after its keys are symbolized
  # @return [Hierarchy, nil] nil for nil and for any other unsupported input
  def self.from_h(hash)
    return nil if hash.nil?

    case hash
    when self then hash
    when Hash then new(**hash.transform_keys(&:to_sym))
    end
  end

  # @param enabled [Object] coerced to true/false
  # @param k_clusters [#to_i, nil] stored as an Integer; nil falls back to 6
  # @param include_bbox [Object] coerced to true/false
  # @param ocr_coverage_threshold [#to_f, nil] stored as a Float when given
  def initialize(
    enabled: true,
    k_clusters: 6,
    include_bbox: true,
    ocr_coverage_threshold: nil
  )
    @enabled, @include_bbox = [enabled, include_bbox].map { |flag| flag ? true : false }
    @k_clusters = k_clusters&.to_i || 6
    @ocr_coverage_threshold = ocr_coverage_threshold&.to_f
  end

  # @return [Hash{Symbol => Object}] settings with nil entries omitted
  def to_h
    serialized = {
      enabled: @enabled,
      k_clusters: @k_clusters,
      include_bbox: @include_bbox,
      ocr_coverage_threshold: @ocr_coverage_threshold
    }
    serialized.reject { |_name, entry| entry.nil? }
  end
end
257
+
258
# PDF-specific options
#
# @example
#   pdf = PDF.new(extract_images: true, passwords: ["secret", "backup"])
#
# @example With font configuration
#   font_config = FontConfig.new(enabled: true, custom_font_dirs: ["/usr/share/fonts"])
#   pdf = PDF.new(extract_images: true, font_config: font_config)
#
# @example With hierarchy configuration
#   hierarchy = Hierarchy.new(enabled: true, k_clusters: 6)
#   pdf = PDF.new(extract_images: true, hierarchy: hierarchy)
#
class PDF
  attr_reader :extract_images, :passwords, :extract_metadata, :font_config, :hierarchy

  # @param extract_images [Object] coerced to true/false
  # @param passwords [Array<#to_s>, #to_s, nil] one password or a list; stored
  #   as an Array of Strings, or nil when not given
  # @param extract_metadata [Object] coerced to true/false
  # @param font_config [FontConfig, Hash, nil] nested font settings
  # @param hierarchy [Hierarchy, Hash, nil] nested hierarchy settings
  # @raise [ArgumentError] when a nested setting has an unsupported type
  def initialize(
    extract_images: false,
    passwords: nil,
    extract_metadata: true,
    font_config: nil,
    hierarchy: nil
  )
    @extract_images = extract_images ? true : false
    @passwords =
      case passwords
      when Array then passwords.map(&:to_s)
      when nil, false then nil
      else [passwords.to_s]
      end
    @extract_metadata = extract_metadata ? true : false
    self.font_config = font_config
    self.hierarchy = hierarchy
  end

  # @return [Hash{Symbol => Object}] settings with nil entries omitted and
  #   nested configs serialized through their own `to_h`
  def to_h
    serialized = {
      extract_images: @extract_images,
      passwords: @passwords,
      extract_metadata: @extract_metadata,
      font_config: @font_config&.to_h,
      hierarchy: @hierarchy&.to_h
    }
    serialized.reject { |_name, entry| entry.nil? }
  end

  # @param value [FontConfig, Hash, nil]
  # @raise [ArgumentError] for any other type
  def font_config=(value)
    @font_config = normalize_font_config(value)
  end

  # @param value [Hierarchy, Hash, nil]
  # @raise [ArgumentError] for any other type
  def hierarchy=(value)
    @hierarchy = normalize_hierarchy(value)
  end

  private

  # Coerces the nested font settings into a FontConfig instance (or nil).
  def normalize_font_config(value)
    return nil if value.nil?

    case value
    when FontConfig
      value
    when Hash
      FontConfig.new(**value.transform_keys(&:to_sym))
    else
      raise ArgumentError, "Expected #{FontConfig}, Hash, or nil, got #{value.class}"
    end
  end

  # Coerces the nested hierarchy settings into a Hierarchy instance (or nil).
  def normalize_hierarchy(value)
    return nil if value.nil?

    case value
    when Hierarchy
      value
    when Hash
      Hierarchy.new(**value.transform_keys(&:to_sym))
    else
      raise ArgumentError, "Expected #{Hierarchy}, Hash, or nil, got #{value.class}"
    end
  end
end
328
+
329
# Image extraction configuration
#
# @example
#   image = ImageExtraction.new(extract_images: true, target_dpi: 300)
#
# @example With auto-adjust DPI
#   image = ImageExtraction.new(
#     extract_images: true,
#     auto_adjust_dpi: true,
#     min_dpi: 150,
#     max_dpi: 600
#   )
#
class ImageExtraction
  attr_reader :extract_images, :target_dpi, :max_image_dimension,
              :auto_adjust_dpi, :min_dpi, :max_dpi

  # Boolean-like arguments are coerced to true/false; numeric arguments are
  # stored as Integers via `to_i`.
  def initialize(
    extract_images: true,
    target_dpi: 300,
    max_image_dimension: 2000,
    auto_adjust_dpi: true,
    min_dpi: 150,
    max_dpi: 600
  )
    @extract_images = extract_images ? true : false
    @auto_adjust_dpi = auto_adjust_dpi ? true : false
    @target_dpi, @max_image_dimension, @min_dpi, @max_dpi =
      [target_dpi, max_image_dimension, min_dpi, max_dpi].map(&:to_i)
  end

  # @return [Hash{Symbol => Object}] all six settings
  def to_h
    serialized = { extract_images: @extract_images }
    serialized[:target_dpi] = @target_dpi
    serialized[:max_image_dimension] = @max_image_dimension
    serialized[:auto_adjust_dpi] = @auto_adjust_dpi
    serialized[:min_dpi] = @min_dpi
    serialized[:max_dpi] = @max_dpi
    serialized
  end
end
373
+
374
# Image preprocessing configuration for OCR
#
# @example Basic preprocessing
#   preprocessing = ImagePreprocessing.new(
#     binarization_method: "otsu",
#     denoise: true
#   )
#
# @example Advanced preprocessing
#   preprocessing = ImagePreprocessing.new(
#     target_dpi: 600,
#     auto_rotate: true,
#     deskew: true,
#     denoise: true,
#     contrast_enhance: true,
#     binarization_method: "sauvola",
#     invert_colors: false
#   )
#
class ImagePreprocessing
  attr_reader :target_dpi, :auto_rotate, :deskew, :denoise,
              :contrast_enhance, :binarization_method, :invert_colors

  # Names accepted for `binarization_method`.
  VALID_BINARIZATION_METHODS = %w[otsu sauvola niblack wolf bradley adaptive].freeze

  # Flags are coerced to true/false, `target_dpi` to an Integer and
  # `binarization_method` to a String before it is validated.
  #
  # @raise [ArgumentError] when binarization_method is not one of
  #   VALID_BINARIZATION_METHODS
  def initialize(
    target_dpi: 300,
    auto_rotate: true,
    deskew: true,
    denoise: false,
    contrast_enhance: true,
    binarization_method: 'otsu',
    invert_colors: false
  )
    @target_dpi = target_dpi.to_i
    @auto_rotate, @deskew, @denoise, @contrast_enhance =
      [auto_rotate, deskew, denoise, contrast_enhance].map { |flag| flag ? true : false }
    @binarization_method = binarization_method.to_s
    @invert_colors = invert_colors ? true : false

    unless VALID_BINARIZATION_METHODS.include?(@binarization_method)
      valid_methods = VALID_BINARIZATION_METHODS.join(', ')
      raise ArgumentError,
            "Invalid binarization_method: #{@binarization_method}. Valid methods are: #{valid_methods}"
    end
  end

  # @return [Hash{Symbol => Object}] all seven settings
  def to_h
    serialized = { target_dpi: @target_dpi }
    serialized[:auto_rotate] = @auto_rotate
    serialized[:deskew] = @deskew
    serialized[:denoise] = @denoise
    serialized[:contrast_enhance] = @contrast_enhance
    serialized[:binarization_method] = @binarization_method
    serialized[:invert_colors] = @invert_colors
    serialized
  end
end
436
+
437
# Token reduction configuration
#
# @example Disable token reduction
#   token = TokenReduction.new(mode: "off")
#
# @example Light reduction
#   token = TokenReduction.new(mode: "light", preserve_important_words: true)
#
# @example Aggressive reduction
#   token = TokenReduction.new(mode: "aggressive", preserve_important_words: false)
#
class TokenReduction
  attr_reader :mode, :preserve_important_words

  # Names accepted for `mode`.
  VALID_MODES = %w[off light moderate aggressive maximum].freeze

  # @param mode [#to_s] reduction mode; stored as a String and validated
  # @param preserve_important_words [Object] coerced to true/false
  # @raise [ArgumentError] when mode is not one of VALID_MODES
  def initialize(mode: 'off', preserve_important_words: true)
    @mode = mode.to_s
    @preserve_important_words = preserve_important_words ? true : false

    unless VALID_MODES.include?(@mode)
      raise ArgumentError, "Invalid token reduction mode: #{@mode}. Valid modes are: #{VALID_MODES.join(', ')}"
    end
  end

  # @return [Hash{Symbol => Object}] both settings
  def to_h
    serialized = { mode: @mode }
    serialized[:preserve_important_words] = @preserve_important_words
    serialized
  end
end
470
+
471
# HTML preprocessing configuration for content extraction
#
# Every setting is optional; settings left as nil are omitted from `to_h`.
#
# @example
#   preprocessing = HtmlPreprocessing.new(enabled: true, remove_navigation: true)
#
class HtmlPreprocessing
  attr_reader :enabled, :preset, :remove_navigation, :remove_forms

  # @param enabled [Object, nil] coerced to true/false; nil is kept
  # @param preset [#to_sym, nil] stored as a Symbol when given
  # @param remove_navigation [Object, nil] coerced to true/false; nil is kept
  # @param remove_forms [Object, nil] coerced to true/false; nil is kept
  def initialize(enabled: nil, preset: nil, remove_navigation: nil, remove_forms: nil)
    @enabled, @remove_navigation, @remove_forms =
      [enabled, remove_navigation, remove_forms].map { |flag| boolean_or_nil(flag) }
    @preset = preset&.to_sym
  end

  # @return [Hash{Symbol => Object}] the settings that are not nil
  def to_h
    serialized = {
      enabled: @enabled,
      preset: @preset,
      remove_navigation: @remove_navigation,
      remove_forms: @remove_forms
    }
    serialized.reject { |_name, entry| entry.nil? }
  end

  private

  # Maps nil to nil and everything else to a strict true/false.
  def boolean_or_nil(value)
    return if value.nil?

    value ? true : false
  end
end
499
+
500
# HTML rendering options for document conversion
#
# Accepts arbitrary options as keyword arguments. Keys are symbolized, the
# style-like options listed in SYMBOL_KEYS are stored as Symbols, and a
# nested `preprocessing` Hash is wrapped in an HtmlPreprocessing instance.
#
# @example
#   html = HtmlOptions.new(heading_style: 'atx', preprocessing: { enabled: true })
#
class HtmlOptions
  # Options whose values are converted to Symbols when present.
  SYMBOL_KEYS = %i[
    heading_style
    code_block_style
    highlight_style
    list_indent_type
    newline_style
    whitespace_mode
  ].freeze
  private_constant :SYMBOL_KEYS

  attr_reader :options

  # @param options [Hash] rendering options; `:preprocessing` may be a Hash
  #   with String or Symbol keys
  def initialize(**options)
    normalized = options.transform_keys(&:to_sym)
    SYMBOL_KEYS.each do |key|
      normalized[key] = normalized[key]&.to_sym if normalized.key?(key)
    end

    nested = normalized[:preprocessing]
    if nested.is_a?(Hash)
      # Symbolize the nested keys first: splatting String keys into a
      # keyword-only initializer raises an unknown-keyword ArgumentError.
      normalized[:preprocessing] = HtmlPreprocessing.new(**nested.transform_keys(&:to_sym))
    end
    @options = normalized
  end

  # Serializes the options to a plain Hash.
  #
  # @return [Hash{Symbol => Object}] options with nested config objects
  #   converted through their own `to_h`
  def to_h
    @options.transform_values { |value| serialize_value(value) }
  end

  private

  # nil, Hash and Array values pass through untouched: nil.to_h would turn an
  # unset option into {}, and Array#to_h raises unless every element is a pair.
  def serialize_value(value)
    case value
    when nil, Hash, Array
      value
    else
      value.respond_to?(:to_h) ? value.to_h : value
    end
  end
end
527
+
528
# YAKE keyword extraction parameters
#
# @example
#   yake = KeywordYakeParams.new(window_size: 3)
#
class KeywordYakeParams
  attr_reader :window_size

  # @param window_size [#to_i] stored as an Integer
  def initialize(window_size: 2)
    @window_size = window_size.to_i
  end

  # @return [Hash{Symbol => Integer}] the single `window_size` entry
  def to_h
    serialized = {}
    serialized[:window_size] = @window_size
    serialized
  end
end
540
+
541
# RAKE keyword extraction parameters
#
# @example
#   rake = KeywordRakeParams.new(min_word_length: 2, max_words_per_phrase: 4)
#
class KeywordRakeParams
  attr_reader :min_word_length, :max_words_per_phrase

  # @param min_word_length [#to_i] stored as an Integer
  # @param max_words_per_phrase [#to_i] stored as an Integer
  def initialize(min_word_length: 1, max_words_per_phrase: 3)
    @min_word_length, @max_words_per_phrase =
      [min_word_length, max_words_per_phrase].map(&:to_i)
  end

  # @return [Hash{Symbol => Integer}] both parameters
  def to_h
    serialized = { min_word_length: @min_word_length }
    serialized[:max_words_per_phrase] = @max_words_per_phrase
    serialized
  end
end
557
+
558
# Keyword extraction configuration for document analysis
#
# Every setting is optional; settings left as nil are omitted from `to_h`.
#
# @example
#   keywords = Keywords.new(algorithm: 'yake', max_keywords: 10, yake_params: { window_size: 3 })
#
class Keywords
  attr_reader :algorithm, :max_keywords, :min_score, :ngram_range,
              :language, :yake_params, :rake_params

  # @param algorithm [#to_s, nil] stored as a String when given
  # @param max_keywords [#to_i, nil] stored as an Integer when given
  # @param min_score [#to_f, nil] stored as a Float when given
  # @param ngram_range [Array<#to_i>, nil] each element is converted with `to_i`
  # @param language [#to_s, nil] stored as a String when given
  # @param yake_params [KeywordYakeParams, Hash, nil] nested YAKE parameters
  # @param rake_params [KeywordRakeParams, Hash, nil] nested RAKE parameters
  # @raise [ArgumentError] when a nested parameter set has an unsupported type
  def initialize(
    algorithm: nil,
    max_keywords: nil,
    min_score: nil,
    ngram_range: nil,
    language: nil,
    yake_params: nil,
    rake_params: nil
  )
    @algorithm = algorithm&.to_s
    @max_keywords = max_keywords&.to_i
    @min_score = min_score&.to_f
    @ngram_range = ngram_range&.map(&:to_i)
    @language = language&.to_s
    @yake_params = normalize_nested(yake_params, KeywordYakeParams)
    @rake_params = normalize_nested(rake_params, KeywordRakeParams)
  end

  # @return [Hash{Symbol => Object}] the settings that are not nil, with
  #   nested parameter objects serialized through their own `to_h`
  def to_h
    serialized = {
      algorithm: @algorithm,
      max_keywords: @max_keywords,
      min_score: @min_score,
      ngram_range: @ngram_range,
      language: @language,
      yake_params: @yake_params&.to_h,
      rake_params: @rake_params&.to_h
    }
    serialized.reject { |_name, entry| entry.nil? }
  end

  private

  # Coerces a nested parameter set into an instance of klass (or nil).
  def normalize_nested(value, klass)
    return nil if value.nil?

    case value
    when klass
      value
    when Hash
      klass.new(**value.transform_keys(&:to_sym))
    else
      raise ArgumentError, "Expected #{klass}, Hash, or nil, got #{value.class}"
    end
  end
end
603
+
604
# Page tracking configuration for multi-page documents
#
# The two switches are stored as strict booleans and the marker format as
# a String.
#
# @example Enable page extraction
#   pages = PageConfig.new(extract_pages: true)
#
# @example Enable page markers in content
#   pages = PageConfig.new(insert_page_markers: true, marker_format: "--- PAGE {page_num} ---")
#
class PageConfig
  attr_reader :extract_pages, :insert_page_markers, :marker_format

  # @param extract_pages [Object] truthiness decides the stored boolean
  # @param insert_page_markers [Object] truthiness decides the stored boolean
  # @param marker_format [#to_s] marker template, stored via +to_s+
  def initialize(
    extract_pages: false,
    insert_page_markers: false,
    marker_format: "\n\n<!-- PAGE {page_num} -->\n\n"
  )
    @extract_pages = flag(extract_pages)
    @insert_page_markers = flag(insert_page_markers)
    @marker_format = marker_format.to_s
  end

  # @return [Hash{Symbol => Object}] all three settings, always present
  def to_h
    {
      extract_pages: extract_pages,
      insert_page_markers: insert_page_markers,
      marker_format: marker_format
    }
  end

  private

  # Collapse any truthy/falsy value into a strict true/false.
  def flag(value)
    value ? true : false
  end
end
633
+
634
# Post-processor configuration
#
# +enabled+ is stored as a strict boolean; the two processor lists are
# optional and their entries are stored as Strings.
#
# @example Enable all post-processors
#   postprocessor = PostProcessor.new(enabled: true)
#
# @example Enable specific processors
#   postprocessor = PostProcessor.new(
#     enabled: true,
#     enabled_processors: ["quality", "formatting"]
#   )
#
# @example Disable specific processors
#   postprocessor = PostProcessor.new(
#     enabled: true,
#     disabled_processors: ["token_reduction"]
#   )
#
class PostProcessor
  attr_reader :enabled, :enabled_processors, :disabled_processors

  # @param enabled [Object] truthiness decides the stored boolean (default: true)
  # @param enabled_processors [Array<#to_s>, nil] names stored as Strings
  # @param disabled_processors [Array<#to_s>, nil] names stored as Strings
  def initialize(
    enabled: true,
    enabled_processors: nil,
    disabled_processors: nil
  )
    @enabled = enabled ? true : false
    @enabled_processors = stringify(enabled_processors)
    @disabled_processors = stringify(disabled_processors)
  end

  # @return [Hash{Symbol => Object}] the settings, with unset (nil) lists omitted
  def to_h
    serialized = {
      enabled: enabled,
      enabled_processors: enabled_processors,
      disabled_processors: disabled_processors
    }
    serialized.reject { |_name, setting| setting.nil? }
  end

  private

  # Map every entry of an optional list to a String; nil stays nil.
  def stringify(names)
    names&.map(&:to_s)
  end
end
672
+
673
+ # Main extraction configuration
674
+ #
675
+ # @example Basic usage
676
+ # config = Extraction.new(use_cache: true, force_ocr: true)
677
+ #
678
+ # @example With OCR
679
+ # ocr = Config::OCR.new(backend: "tesseract", language: "eng")
680
+ # config = Extraction.new(ocr: ocr)
681
+ #
682
+ # @example With image extraction
683
+ # image = Config::ImageExtraction.new(extract_images: true, target_dpi: 600)
684
+ # config = Extraction.new(image_extraction: image)
685
+ #
686
+ # @example With preprocessing
687
+ # preprocessing = Config::ImagePreprocessing.new(
688
+ # binarization_method: "sauvola",
689
+ # denoise: true
690
+ # )
691
+ # config = Extraction.new(image_preprocessing: preprocessing)
692
+ #
693
+ # @example With post-processing
694
+ # postprocessor = Config::PostProcessor.new(
695
+ # enabled: true,
696
+ # enabled_processors: ["quality"]
697
+ # )
698
+ # config = Extraction.new(postprocessor: postprocessor)
699
+ #
700
+ # @example With all options
701
+ # config = Extraction.new(
702
+ # use_cache: true,
703
+ # enable_quality_processing: true,
704
+ # force_ocr: false,
705
+ # ocr: Config::OCR.new(language: "deu"),
706
+ # chunking: Config::Chunking.new(max_chars: 500),
707
+ # language_detection: Config::LanguageDetection.new(enabled: true),
708
+ # pdf_options: Config::PDF.new(extract_images: true, passwords: ["secret"]),
709
+ # image_extraction: Config::ImageExtraction.new(target_dpi: 600),
710
+ # image_preprocessing: Config::ImagePreprocessing.new(denoise: true),
711
+ # postprocessor: Config::PostProcessor.new(enabled: true)
712
+ # )
713
+ #
714
class Extraction
  attr_reader :use_cache, :enable_quality_processing, :force_ocr,
              :ocr, :chunking, :language_detection, :pdf_options,
              :image_extraction, :image_preprocessing, :postprocessor,
              :token_reduction, :keywords, :html_options, :pages,
              :max_concurrent_extractions

  # Load configuration from a file.
  #
  # Detects the file format from the extension (.toml, .yaml, .json)
  # and loads the configuration accordingly.
  #
  # @param path [String] Path to the configuration file
  # @return [Kreuzberg::Config::Extraction] Loaded configuration object
  #
  # @example Load from TOML
  #   config = Kreuzberg::Config::Extraction.from_file("config.toml")
  #
  # @example Load from YAML
  #   config = Kreuzberg::Config::Extraction.from_file("config.yaml")
  #
  def self.from_file(path)
    hash = Kreuzberg._config_from_file_native(path)
    new(**hash.transform_keys(&:to_sym))
  end

  # Discover configuration file in current or parent directories.
  #
  # Searches for kreuzberg.toml, kreuzberg.yaml, or kreuzberg.json in the current
  # directory and parent directories.
  #
  # @return [Kreuzberg::Config::Extraction, nil] Loaded configuration object or nil if not found
  #
  # @example
  #   config = Kreuzberg::Config::Extraction.discover
  #   if config
  #     # Use discovered config
  #   end
  #
  def self.discover
    hash = Kreuzberg._config_discover_native
    return nil if hash.nil?

    new(**hash.transform_keys(&:to_sym))
  end

  # Build an extraction configuration.
  #
  # The three flags are stored as strict booleans. Every nested section
  # accepts an instance of its config class, a Hash (String or Symbol keys)
  # that is splatted into that class, or nil.
  #
  # @raise [ArgumentError] if a nested section is neither its class, a Hash, nor nil
  def initialize(
    use_cache: true,
    enable_quality_processing: false,
    force_ocr: false,
    ocr: nil,
    chunking: nil,
    language_detection: nil,
    pdf_options: nil,
    image_extraction: nil,
    image_preprocessing: nil,
    postprocessor: nil,
    token_reduction: nil,
    keywords: nil,
    html_options: nil,
    pages: nil,
    max_concurrent_extractions: nil
  )
    @use_cache = use_cache ? true : false
    @enable_quality_processing = enable_quality_processing ? true : false
    @force_ocr = force_ocr ? true : false
    @ocr = normalize_config(ocr, OCR)
    @chunking = normalize_config(chunking, Chunking)
    @language_detection = normalize_config(language_detection, LanguageDetection)
    @pdf_options = normalize_config(pdf_options, PDF)
    @image_extraction = normalize_config(image_extraction, ImageExtraction)
    @image_preprocessing = normalize_config(image_preprocessing, ImagePreprocessing)
    @postprocessor = normalize_config(postprocessor, PostProcessor)
    @token_reduction = normalize_config(token_reduction, TokenReduction)
    @keywords = normalize_config(keywords, Keywords)
    @html_options = normalize_config(html_options, HtmlOptions)
    @pages = normalize_config(pages, PageConfig)
    @max_concurrent_extractions = max_concurrent_extractions&.to_i
  end

  # Serialize to a Hash. Nested sections are converted with their own
  # +to_h+ and unset (nil) entries are dropped; the three boolean flags
  # are therefore always present.
  #
  # @return [Hash{Symbol => Object}]
  #
  # rubocop:disable Metrics/CyclomaticComplexity
  def to_h
    {
      use_cache: @use_cache,
      enable_quality_processing: @enable_quality_processing,
      force_ocr: @force_ocr,
      ocr: @ocr&.to_h,
      chunking: @chunking&.to_h,
      language_detection: @language_detection&.to_h,
      pdf_options: @pdf_options&.to_h,
      image_extraction: @image_extraction&.to_h,
      image_preprocessing: @image_preprocessing&.to_h,
      postprocessor: @postprocessor&.to_h,
      token_reduction: @token_reduction&.to_h,
      keywords: @keywords&.to_h,
      html_options: @html_options&.to_h,
      pages: @pages&.to_h,
      max_concurrent_extractions: @max_concurrent_extractions
    }.compact
  end
  # rubocop:enable Metrics/CyclomaticComplexity

  # Serialize configuration to JSON string
  #
  # @return [String] JSON representation of the configuration
  #
  # @example
  #   config = Extraction.new(use_cache: true)
  #   json = config.to_json
  #   puts json # => "{\"use_cache\":true,...}"
  #
  def to_json(*_args)
    # Convert to JSON directly - the native function has issues
    JSON.generate(to_h)
  end

  # Get a field from the configuration
  #
  # Supports dot notation for nested fields (e.g., "ocr.backend").
  # The lookup runs against #to_h, so unset (nil) sections are absent.
  #
  # @param field_name [String, Symbol] Field name to retrieve
  # @return [Object, nil] Field value, or nil if the field doesn't exist
  #   (including an empty field name)
  #
  # @example Get a top-level field
  #   config = Extraction.new(use_cache: true)
  #   config.get_field("use_cache") # => true
  #
  # @example Get a nested field
  #   config = Extraction.new(ocr: OCR.new(backend: "tesseract"))
  #   config.get_field("ocr.backend") # => "tesseract"
  #
  def get_field(field_name)
    field_path = field_name.to_s.split('.')
    # An empty path would make the reduce below return the whole hash.
    return nil if field_path.empty?

    field_path.reduce(to_h) do |current, key|
      # Descending into a scalar (or an already-missing value) yields nil.
      next nil unless current.is_a?(Hash)

      # Check both symbol and string keys, prefer symbol if exists
      if current.key?(key.to_sym)
        current[key.to_sym]
      elsif current.key?(key.to_s)
        current[key.to_s]
      end
    end
  end

  # Merge another configuration into this one
  #
  # Returns a new configuration with fields from the other config overriding
  # fields from this config (shallow merge).
  #
  # When +other+ is an Extraction, every field present in its #to_h wins;
  # that always includes its three boolean flags. When +other+ is a Hash,
  # only the keys actually given override this config, so flags the Hash
  # does not mention keep their current value. Hash keys may be Strings
  # or Symbols.
  #
  # @param other [Extraction, Hash] Configuration to merge
  # @return [Extraction] New merged configuration
  # @raise [ArgumentError] if +other+ is neither an Extraction nor a Hash,
  #   or if a Hash contains an unknown option
  #
  # @example
  #   base = Extraction.new(use_cache: true, force_ocr: false)
  #   override = Extraction.new(force_ocr: true)
  #   merged = base.merge(override)
  #   merged.use_cache # => true
  #   merged.force_ocr # => true
  #
  # @example Merging a Hash only touches the given keys
  #   base = Extraction.new(use_cache: false)
  #   base.merge({ force_ocr: true }).use_cache # => false
  #
  def merge(other)
    Extraction.new(**to_h.merge(overrides_from(other)))
  end

  # Merge another configuration into this one (mutating)
  #
  # Modifies this configuration in-place by merging fields from another config.
  # Follows the same override rules as #merge.
  #
  # @param other [Extraction, Hash] Configuration to merge
  # @return [self]
  # @raise [ArgumentError] see #merge
  #
  # @example
  #   base = Extraction.new(use_cache: true, force_ocr: false)
  #   override = Extraction.new(force_ocr: true)
  #   base.merge!(override)
  #   base.use_cache # => true
  #   base.force_ocr # => true
  #
  def merge!(other)
    update_from_merged(merge(other))
    self
  end

  private

  # Turn +value+ into an instance of +klass+: nil and existing instances
  # pass through, a Hash is symbolized and splatted into +klass.new+.
  def normalize_config(value, klass)
    return nil if value.nil?
    return value if value.is_a?(klass)
    return klass.new(**value.transform_keys(&:to_sym)) if value.is_a?(Hash)

    raise ArgumentError, "Expected #{klass}, Hash, or nil, got #{value.class}"
  end

  # Compute the serialized fields that +other+ contributes to a merge.
  #
  # A Hash is first run through Extraction.new so its values are validated
  # and coerced like any other config, then narrowed back down to the keys
  # the caller supplied so constructor defaults do not leak into the merge.
  def overrides_from(other)
    return other.to_h if other.is_a?(Extraction)

    unless other.is_a?(Hash)
      raise ArgumentError, "Expected #{Extraction} or Hash, got #{other.class}"
    end

    provided = other.transform_keys(&:to_sym)
    Extraction.new(**provided).to_h.slice(*provided.keys)
  end

  # Copy every field from +merged+ onto this instance.
  def update_from_merged(merged)
    @use_cache = merged.use_cache
    @enable_quality_processing = merged.enable_quality_processing
    @force_ocr = merged.force_ocr
    @ocr = merged.ocr
    @chunking = merged.chunking
    @language_detection = merged.language_detection
    @pdf_options = merged.pdf_options
    @image_extraction = merged.image_extraction
    @image_preprocessing = merged.image_preprocessing
    @postprocessor = merged.postprocessor
    @token_reduction = merged.token_reduction
    @keywords = merged.keywords
    @html_options = merged.html_options
    @pages = merged.pages
    @max_concurrent_extractions = merged.max_concurrent_extractions
  end
end
935
+ end
936
+ end