kreuzberg 4.0.0.pre.rc.13 → 4.0.0.pre.rc.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (369) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +14 -14
  3. data/.rspec +3 -3
  4. data/.rubocop.yaml +1 -1
  5. data/.rubocop.yml +538 -538
  6. data/Gemfile +8 -8
  7. data/Gemfile.lock +105 -2
  8. data/README.md +454 -454
  9. data/Rakefile +33 -25
  10. data/Steepfile +47 -47
  11. data/examples/async_patterns.rb +341 -341
  12. data/ext/kreuzberg_rb/extconf.rb +45 -45
  13. data/ext/kreuzberg_rb/native/.cargo/config.toml +2 -2
  14. data/ext/kreuzberg_rb/native/Cargo.lock +6940 -6941
  15. data/ext/kreuzberg_rb/native/Cargo.toml +54 -54
  16. data/ext/kreuzberg_rb/native/README.md +425 -425
  17. data/ext/kreuzberg_rb/native/build.rs +15 -15
  18. data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -11
  19. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -14
  20. data/ext/kreuzberg_rb/native/include/strings.h +20 -20
  21. data/ext/kreuzberg_rb/native/include/unistd.h +47 -47
  22. data/ext/kreuzberg_rb/native/src/lib.rs +3158 -3158
  23. data/extconf.rb +28 -28
  24. data/kreuzberg.gemspec +214 -214
  25. data/lib/kreuzberg/api_proxy.rb +142 -142
  26. data/lib/kreuzberg/cache_api.rb +81 -81
  27. data/lib/kreuzberg/cli.rb +55 -55
  28. data/lib/kreuzberg/cli_proxy.rb +127 -127
  29. data/lib/kreuzberg/config.rb +724 -724
  30. data/lib/kreuzberg/error_context.rb +80 -80
  31. data/lib/kreuzberg/errors.rb +118 -118
  32. data/lib/kreuzberg/extraction_api.rb +340 -340
  33. data/lib/kreuzberg/mcp_proxy.rb +186 -186
  34. data/lib/kreuzberg/ocr_backend_protocol.rb +113 -113
  35. data/lib/kreuzberg/post_processor_protocol.rb +86 -86
  36. data/lib/kreuzberg/result.rb +279 -279
  37. data/lib/kreuzberg/setup_lib_path.rb +80 -80
  38. data/lib/kreuzberg/validator_protocol.rb +89 -89
  39. data/lib/kreuzberg/version.rb +5 -5
  40. data/lib/kreuzberg.rb +109 -109
  41. data/lib/{pdfium.dll → libpdfium.dylib} +0 -0
  42. data/sig/kreuzberg/internal.rbs +184 -184
  43. data/sig/kreuzberg.rbs +546 -546
  44. data/spec/binding/cache_spec.rb +227 -227
  45. data/spec/binding/cli_proxy_spec.rb +85 -85
  46. data/spec/binding/cli_spec.rb +55 -55
  47. data/spec/binding/config_spec.rb +345 -345
  48. data/spec/binding/config_validation_spec.rb +283 -283
  49. data/spec/binding/error_handling_spec.rb +213 -213
  50. data/spec/binding/errors_spec.rb +66 -66
  51. data/spec/binding/plugins/ocr_backend_spec.rb +307 -307
  52. data/spec/binding/plugins/postprocessor_spec.rb +269 -269
  53. data/spec/binding/plugins/validator_spec.rb +274 -274
  54. data/spec/fixtures/config.toml +39 -39
  55. data/spec/fixtures/config.yaml +41 -41
  56. data/spec/fixtures/invalid_config.toml +4 -4
  57. data/spec/smoke/package_spec.rb +178 -178
  58. data/spec/spec_helper.rb +42 -42
  59. data/vendor/Cargo.toml +1 -1
  60. data/vendor/kreuzberg/Cargo.toml +5 -5
  61. data/vendor/kreuzberg/README.md +230 -230
  62. data/vendor/kreuzberg/benches/otel_overhead.rs +48 -48
  63. data/vendor/kreuzberg/build.rs +843 -843
  64. data/vendor/kreuzberg/src/api/error.rs +81 -81
  65. data/vendor/kreuzberg/src/api/handlers.rs +199 -199
  66. data/vendor/kreuzberg/src/api/mod.rs +79 -79
  67. data/vendor/kreuzberg/src/api/server.rs +353 -353
  68. data/vendor/kreuzberg/src/api/types.rs +170 -170
  69. data/vendor/kreuzberg/src/cache/mod.rs +1167 -1167
  70. data/vendor/kreuzberg/src/chunking/mod.rs +1877 -1877
  71. data/vendor/kreuzberg/src/chunking/processor.rs +220 -220
  72. data/vendor/kreuzberg/src/core/batch_mode.rs +95 -95
  73. data/vendor/kreuzberg/src/core/config.rs +1080 -1080
  74. data/vendor/kreuzberg/src/core/extractor.rs +1156 -1156
  75. data/vendor/kreuzberg/src/core/io.rs +329 -329
  76. data/vendor/kreuzberg/src/core/mime.rs +605 -605
  77. data/vendor/kreuzberg/src/core/mod.rs +47 -47
  78. data/vendor/kreuzberg/src/core/pipeline.rs +1184 -1184
  79. data/vendor/kreuzberg/src/embeddings.rs +500 -500
  80. data/vendor/kreuzberg/src/error.rs +431 -431
  81. data/vendor/kreuzberg/src/extraction/archive.rs +954 -954
  82. data/vendor/kreuzberg/src/extraction/docx.rs +398 -398
  83. data/vendor/kreuzberg/src/extraction/email.rs +854 -854
  84. data/vendor/kreuzberg/src/extraction/excel.rs +688 -688
  85. data/vendor/kreuzberg/src/extraction/html.rs +601 -601
  86. data/vendor/kreuzberg/src/extraction/image.rs +491 -491
  87. data/vendor/kreuzberg/src/extraction/libreoffice.rs +574 -574
  88. data/vendor/kreuzberg/src/extraction/markdown.rs +213 -213
  89. data/vendor/kreuzberg/src/extraction/mod.rs +81 -81
  90. data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -398
  91. data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -247
  92. data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -240
  93. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -130
  94. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +284 -284
  95. data/vendor/kreuzberg/src/extraction/pptx.rs +3100 -3100
  96. data/vendor/kreuzberg/src/extraction/structured.rs +490 -490
  97. data/vendor/kreuzberg/src/extraction/table.rs +328 -328
  98. data/vendor/kreuzberg/src/extraction/text.rs +269 -269
  99. data/vendor/kreuzberg/src/extraction/xml.rs +333 -333
  100. data/vendor/kreuzberg/src/extractors/archive.rs +447 -447
  101. data/vendor/kreuzberg/src/extractors/bibtex.rs +470 -470
  102. data/vendor/kreuzberg/src/extractors/docbook.rs +504 -504
  103. data/vendor/kreuzberg/src/extractors/docx.rs +400 -400
  104. data/vendor/kreuzberg/src/extractors/email.rs +157 -157
  105. data/vendor/kreuzberg/src/extractors/epub.rs +708 -708
  106. data/vendor/kreuzberg/src/extractors/excel.rs +345 -345
  107. data/vendor/kreuzberg/src/extractors/fictionbook.rs +492 -492
  108. data/vendor/kreuzberg/src/extractors/html.rs +407 -407
  109. data/vendor/kreuzberg/src/extractors/image.rs +219 -219
  110. data/vendor/kreuzberg/src/extractors/jats.rs +1054 -1054
  111. data/vendor/kreuzberg/src/extractors/jupyter.rs +368 -368
  112. data/vendor/kreuzberg/src/extractors/latex.rs +653 -653
  113. data/vendor/kreuzberg/src/extractors/markdown.rs +701 -701
  114. data/vendor/kreuzberg/src/extractors/mod.rs +429 -429
  115. data/vendor/kreuzberg/src/extractors/odt.rs +628 -628
  116. data/vendor/kreuzberg/src/extractors/opml.rs +635 -635
  117. data/vendor/kreuzberg/src/extractors/orgmode.rs +529 -529
  118. data/vendor/kreuzberg/src/extractors/pdf.rs +749 -749
  119. data/vendor/kreuzberg/src/extractors/pptx.rs +267 -267
  120. data/vendor/kreuzberg/src/extractors/rst.rs +577 -577
  121. data/vendor/kreuzberg/src/extractors/rtf.rs +809 -809
  122. data/vendor/kreuzberg/src/extractors/security.rs +484 -484
  123. data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -367
  124. data/vendor/kreuzberg/src/extractors/structured.rs +142 -142
  125. data/vendor/kreuzberg/src/extractors/text.rs +265 -265
  126. data/vendor/kreuzberg/src/extractors/typst.rs +651 -651
  127. data/vendor/kreuzberg/src/extractors/xml.rs +147 -147
  128. data/vendor/kreuzberg/src/image/dpi.rs +164 -164
  129. data/vendor/kreuzberg/src/image/mod.rs +6 -6
  130. data/vendor/kreuzberg/src/image/preprocessing.rs +417 -417
  131. data/vendor/kreuzberg/src/image/resize.rs +89 -89
  132. data/vendor/kreuzberg/src/keywords/config.rs +154 -154
  133. data/vendor/kreuzberg/src/keywords/mod.rs +237 -237
  134. data/vendor/kreuzberg/src/keywords/processor.rs +275 -275
  135. data/vendor/kreuzberg/src/keywords/rake.rs +293 -293
  136. data/vendor/kreuzberg/src/keywords/types.rs +68 -68
  137. data/vendor/kreuzberg/src/keywords/yake.rs +163 -163
  138. data/vendor/kreuzberg/src/language_detection/mod.rs +985 -985
  139. data/vendor/kreuzberg/src/language_detection/processor.rs +219 -219
  140. data/vendor/kreuzberg/src/lib.rs +113 -113
  141. data/vendor/kreuzberg/src/mcp/mod.rs +35 -35
  142. data/vendor/kreuzberg/src/mcp/server.rs +2076 -2076
  143. data/vendor/kreuzberg/src/ocr/cache.rs +469 -469
  144. data/vendor/kreuzberg/src/ocr/error.rs +37 -37
  145. data/vendor/kreuzberg/src/ocr/hocr.rs +216 -216
  146. data/vendor/kreuzberg/src/ocr/mod.rs +58 -58
  147. data/vendor/kreuzberg/src/ocr/processor.rs +863 -863
  148. data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -4
  149. data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -144
  150. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +452 -452
  151. data/vendor/kreuzberg/src/ocr/types.rs +393 -393
  152. data/vendor/kreuzberg/src/ocr/utils.rs +47 -47
  153. data/vendor/kreuzberg/src/ocr/validation.rs +206 -206
  154. data/vendor/kreuzberg/src/panic_context.rs +154 -154
  155. data/vendor/kreuzberg/src/pdf/bindings.rs +44 -44
  156. data/vendor/kreuzberg/src/pdf/bundled.rs +346 -346
  157. data/vendor/kreuzberg/src/pdf/error.rs +130 -130
  158. data/vendor/kreuzberg/src/pdf/images.rs +139 -139
  159. data/vendor/kreuzberg/src/pdf/metadata.rs +489 -489
  160. data/vendor/kreuzberg/src/pdf/mod.rs +68 -68
  161. data/vendor/kreuzberg/src/pdf/rendering.rs +368 -368
  162. data/vendor/kreuzberg/src/pdf/table.rs +420 -420
  163. data/vendor/kreuzberg/src/pdf/text.rs +240 -240
  164. data/vendor/kreuzberg/src/plugins/extractor.rs +1044 -1044
  165. data/vendor/kreuzberg/src/plugins/mod.rs +212 -212
  166. data/vendor/kreuzberg/src/plugins/ocr.rs +639 -639
  167. data/vendor/kreuzberg/src/plugins/processor.rs +650 -650
  168. data/vendor/kreuzberg/src/plugins/registry.rs +1339 -1339
  169. data/vendor/kreuzberg/src/plugins/traits.rs +258 -258
  170. data/vendor/kreuzberg/src/plugins/validator.rs +967 -967
  171. data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -1470
  172. data/vendor/kreuzberg/src/text/mod.rs +25 -25
  173. data/vendor/kreuzberg/src/text/quality.rs +697 -697
  174. data/vendor/kreuzberg/src/text/quality_processor.rs +219 -219
  175. data/vendor/kreuzberg/src/text/string_utils.rs +217 -217
  176. data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -164
  177. data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -100
  178. data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -796
  179. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -902
  180. data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -160
  181. data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -619
  182. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -147
  183. data/vendor/kreuzberg/src/types.rs +1055 -1055
  184. data/vendor/kreuzberg/src/utils/mod.rs +17 -17
  185. data/vendor/kreuzberg/src/utils/quality.rs +959 -959
  186. data/vendor/kreuzberg/src/utils/string_utils.rs +381 -381
  187. data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -53
  188. data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -482
  189. data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -261
  190. data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -400
  191. data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -1205
  192. data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -280
  193. data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -425
  194. data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -172
  195. data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -622
  196. data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -849
  197. data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -1300
  198. data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -175
  199. data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -734
  200. data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -37
  201. data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -100
  202. data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -801
  203. data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -849
  204. data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -693
  205. data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -111
  206. data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -162
  207. data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -226
  208. data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -41
  209. data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -196
  210. data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -227
  211. data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -181
  212. data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -791
  213. data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -47
  214. data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -760
  215. data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -634
  216. data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -136
  217. data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -84
  218. data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -681
  219. data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -64
  220. data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -51
  221. data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -476
  222. data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -163
  223. data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -1
  224. data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -101
  225. data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -477
  226. data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -490
  227. data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -415
  228. data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -223
  229. data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -331
  230. data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -562
  231. data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -436
  232. data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -561
  233. data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -193
  234. data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -420
  235. data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -448
  236. data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -32
  237. data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -33
  238. data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -420
  239. data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -76
  240. data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -129
  241. data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -54
  242. data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -118
  243. data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -149
  244. data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -506
  245. data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -75
  246. data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -519
  247. data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -647
  248. data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -62
  249. data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -796
  250. data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -31
  251. data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -52
  252. data/vendor/kreuzberg/tests/api_tests.rs +966 -966
  253. data/vendor/kreuzberg/tests/archive_integration.rs +545 -545
  254. data/vendor/kreuzberg/tests/batch_orchestration.rs +556 -556
  255. data/vendor/kreuzberg/tests/batch_processing.rs +318 -318
  256. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -421
  257. data/vendor/kreuzberg/tests/concurrency_stress.rs +533 -533
  258. data/vendor/kreuzberg/tests/config_features.rs +612 -612
  259. data/vendor/kreuzberg/tests/config_loading_tests.rs +416 -416
  260. data/vendor/kreuzberg/tests/core_integration.rs +510 -510
  261. data/vendor/kreuzberg/tests/csv_integration.rs +414 -414
  262. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +500 -500
  263. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -122
  264. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -370
  265. data/vendor/kreuzberg/tests/email_integration.rs +327 -327
  266. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -275
  267. data/vendor/kreuzberg/tests/error_handling.rs +402 -402
  268. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -228
  269. data/vendor/kreuzberg/tests/format_integration.rs +164 -164
  270. data/vendor/kreuzberg/tests/helpers/mod.rs +142 -142
  271. data/vendor/kreuzberg/tests/html_table_test.rs +551 -551
  272. data/vendor/kreuzberg/tests/image_integration.rs +255 -255
  273. data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -139
  274. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -639
  275. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -704
  276. data/vendor/kreuzberg/tests/keywords_integration.rs +479 -479
  277. data/vendor/kreuzberg/tests/keywords_quality.rs +509 -509
  278. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -496
  279. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -490
  280. data/vendor/kreuzberg/tests/mime_detection.rs +429 -429
  281. data/vendor/kreuzberg/tests/ocr_configuration.rs +514 -514
  282. data/vendor/kreuzberg/tests/ocr_errors.rs +698 -698
  283. data/vendor/kreuzberg/tests/ocr_quality.rs +629 -629
  284. data/vendor/kreuzberg/tests/ocr_stress.rs +469 -469
  285. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +674 -674
  286. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -616
  287. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -822
  288. data/vendor/kreuzberg/tests/pdf_integration.rs +45 -45
  289. data/vendor/kreuzberg/tests/pdfium_linking.rs +374 -374
  290. data/vendor/kreuzberg/tests/pipeline_integration.rs +1436 -1436
  291. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +776 -776
  292. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +560 -560
  293. data/vendor/kreuzberg/tests/plugin_system.rs +927 -927
  294. data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -783
  295. data/vendor/kreuzberg/tests/registry_integration_tests.rs +587 -587
  296. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +694 -694
  297. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +775 -775
  298. data/vendor/kreuzberg/tests/security_validation.rs +416 -416
  299. data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -888
  300. data/vendor/kreuzberg/tests/test_fastembed.rs +631 -631
  301. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1260 -1260
  302. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +648 -648
  303. data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -87
  304. data/vendor/kreuzberg-ffi/Cargo.toml +1 -1
  305. data/vendor/kreuzberg-ffi/README.md +851 -851
  306. data/vendor/kreuzberg-ffi/build.rs +176 -176
  307. data/vendor/kreuzberg-ffi/cbindgen.toml +27 -27
  308. data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +12 -12
  309. data/vendor/kreuzberg-ffi/kreuzberg.h +1087 -1087
  310. data/vendor/kreuzberg-ffi/src/lib.rs +3616 -3616
  311. data/vendor/kreuzberg-ffi/src/panic_shield.rs +247 -247
  312. data/vendor/kreuzberg-ffi/tests.disabled/README.md +48 -48
  313. data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +299 -299
  314. data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +346 -346
  315. data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +232 -232
  316. data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +470 -470
  317. data/vendor/kreuzberg-tesseract/.commitlintrc.json +13 -13
  318. data/vendor/kreuzberg-tesseract/.crate-ignore +2 -2
  319. data/vendor/kreuzberg-tesseract/Cargo.lock +2933 -2933
  320. data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
  321. data/vendor/kreuzberg-tesseract/LICENSE +22 -22
  322. data/vendor/kreuzberg-tesseract/README.md +399 -399
  323. data/vendor/kreuzberg-tesseract/build.rs +1354 -1354
  324. data/vendor/kreuzberg-tesseract/patches/README.md +71 -71
  325. data/vendor/kreuzberg-tesseract/patches/tesseract.diff +199 -199
  326. data/vendor/kreuzberg-tesseract/src/api.rs +1371 -1371
  327. data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +77 -77
  328. data/vendor/kreuzberg-tesseract/src/enums.rs +297 -297
  329. data/vendor/kreuzberg-tesseract/src/error.rs +81 -81
  330. data/vendor/kreuzberg-tesseract/src/lib.rs +145 -145
  331. data/vendor/kreuzberg-tesseract/src/monitor.rs +57 -57
  332. data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +197 -197
  333. data/vendor/kreuzberg-tesseract/src/page_iterator.rs +253 -253
  334. data/vendor/kreuzberg-tesseract/src/result_iterator.rs +286 -286
  335. data/vendor/kreuzberg-tesseract/src/result_renderer.rs +183 -183
  336. data/vendor/kreuzberg-tesseract/tests/integration_test.rs +211 -211
  337. data/vendor/rb-sys/.cargo_vcs_info.json +5 -5
  338. data/vendor/rb-sys/Cargo.lock +393 -393
  339. data/vendor/rb-sys/Cargo.toml +70 -70
  340. data/vendor/rb-sys/Cargo.toml.orig +57 -57
  341. data/vendor/rb-sys/LICENSE-APACHE +190 -190
  342. data/vendor/rb-sys/LICENSE-MIT +21 -21
  343. data/vendor/rb-sys/build/features.rs +111 -111
  344. data/vendor/rb-sys/build/main.rs +286 -286
  345. data/vendor/rb-sys/build/stable_api_config.rs +155 -155
  346. data/vendor/rb-sys/build/version.rs +50 -50
  347. data/vendor/rb-sys/readme.md +36 -36
  348. data/vendor/rb-sys/src/bindings.rs +21 -21
  349. data/vendor/rb-sys/src/hidden.rs +11 -11
  350. data/vendor/rb-sys/src/lib.rs +35 -35
  351. data/vendor/rb-sys/src/macros.rs +371 -371
  352. data/vendor/rb-sys/src/memory.rs +53 -53
  353. data/vendor/rb-sys/src/ruby_abi_version.rs +38 -38
  354. data/vendor/rb-sys/src/special_consts.rs +31 -31
  355. data/vendor/rb-sys/src/stable_api/compiled.c +179 -179
  356. data/vendor/rb-sys/src/stable_api/compiled.rs +257 -257
  357. data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +324 -324
  358. data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +332 -332
  359. data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +325 -325
  360. data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +323 -323
  361. data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +339 -339
  362. data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +339 -339
  363. data/vendor/rb-sys/src/stable_api.rs +260 -260
  364. data/vendor/rb-sys/src/symbol.rs +31 -31
  365. data/vendor/rb-sys/src/tracking_allocator.rs +330 -330
  366. data/vendor/rb-sys/src/utils.rs +89 -89
  367. data/vendor/rb-sys/src/value_type.rs +7 -7
  368. metadata +73 -4
  369. data/vendor/kreuzberg-ffi/kreuzberg-ffi-install.pc +0 -12
@@ -1,724 +1,724 @@
1
- # frozen_string_literal: true
2
-
3
- module Kreuzberg
4
- module Config
5
- # OCR configuration
6
- #
7
- # @example
8
- # ocr = OCR.new(backend: "tesseract", language: "eng")
9
- #
10
- class OCR
11
- attr_reader :backend, :language, :tesseract_config
12
-
13
- def initialize(
14
- backend: 'tesseract',
15
- language: 'eng',
16
- tesseract_config: nil
17
- )
18
- @backend = backend.to_s
19
- @language = language.to_s
20
- @tesseract_config = normalize_tesseract_config(tesseract_config)
21
- end
22
-
23
- def to_h
24
- {
25
- backend: @backend,
26
- language: @language,
27
- tesseract_config: @tesseract_config&.to_h
28
- }.compact
29
- end
30
-
31
- private
32
-
33
- def normalize_tesseract_config(value)
34
- return nil if value.nil?
35
- return value if value.is_a?(Tesseract)
36
- return Tesseract.new(**value.transform_keys(&:to_sym)) if value.is_a?(Hash)
37
-
38
- raise ArgumentError, "Expected #{Tesseract}, Hash, or nil, got #{value.class}"
39
- end
40
- end
41
-
42
- # Tesseract OCR engine configuration
43
- class Tesseract
44
- attr_reader :options
45
-
46
- def initialize(**options)
47
- @options = options.transform_keys(&:to_sym)
48
- normalize_nested_preprocessing!
49
- end
50
-
51
- def to_h
52
- @options.dup
53
- end
54
-
55
- private
56
-
57
- def normalize_nested_preprocessing!
58
- preprocessing = @options[:preprocessing]
59
- return if preprocessing.nil?
60
- return if preprocessing.is_a?(ImagePreprocessing)
61
- return @options[:preprocessing] = ImagePreprocessing.new(**preprocessing.transform_keys(&:to_sym)) if
62
- preprocessing.is_a?(Hash)
63
-
64
- raise ArgumentError, "preprocessing must be #{ImagePreprocessing} or Hash"
65
- end
66
- end
67
-
68
- # Chunking configuration
69
- #
70
- # @example
71
- # chunking = Chunking.new(max_chars: 1000, max_overlap: 200)
72
- #
73
- class Chunking
74
- attr_reader :max_chars, :max_overlap, :preset, :embedding, :enabled
75
-
76
- def initialize(
77
- max_chars: nil,
78
- max_overlap: nil,
79
- preset: nil,
80
- embedding: nil,
81
- chunk_size: nil,
82
- chunk_overlap: nil,
83
- enabled: true
84
- )
85
- resolved_size = chunk_size || max_chars || 1000
86
- resolved_overlap = chunk_overlap || max_overlap || 200
87
-
88
- @max_chars = resolved_size.to_i
89
- @max_overlap = resolved_overlap.to_i
90
- @preset = preset&.to_s
91
- @embedding = normalize_embedding(embedding)
92
- @enabled = boolean_or_nil(enabled)
93
- end
94
-
95
- def to_h
96
- config = {
97
- max_chars: @max_chars,
98
- max_overlap: @max_overlap,
99
- preset: @preset,
100
- embedding: @embedding&.to_h
101
- }.compact
102
- # @type var config: Hash[Symbol, untyped]
103
- config[:enabled] = @enabled unless @enabled.nil?
104
- config
105
- end
106
-
107
- private
108
-
109
- def normalize_embedding(value)
110
- return nil if value.nil?
111
- return value if value.is_a?(Embedding)
112
- return Embedding.new(**value.transform_keys(&:to_sym)) if value.is_a?(Hash)
113
-
114
- raise ArgumentError, "Expected #{Embedding}, Hash, or nil, got #{value.class}"
115
- end
116
-
117
- def boolean_or_nil(value)
118
- return nil if value.nil?
119
-
120
- value ? true : false
121
- end
122
- end
123
-
124
- # Embedding model configuration for document chunking
125
- class Embedding
126
- attr_reader :model, :normalize, :batch_size, :show_download_progress, :cache_dir
127
-
128
- def initialize(
129
- model: { type: :preset, name: 'balanced' },
130
- normalize: true,
131
- batch_size: 32,
132
- show_download_progress: false,
133
- cache_dir: nil
134
- )
135
- @model = normalize_model(model)
136
- @normalize = boolean_or_nil(normalize)
137
- @batch_size = batch_size&.to_i
138
- @show_download_progress = boolean_or_nil(show_download_progress)
139
- @cache_dir = cache_dir&.to_s
140
- end
141
-
142
- def to_h
143
- {
144
- model: @model,
145
- normalize: @normalize,
146
- batch_size: @batch_size,
147
- show_download_progress: @show_download_progress,
148
- cache_dir: @cache_dir
149
- }.compact
150
- end
151
-
152
- private
153
-
154
- def normalize_model(model)
155
- normalized = if model.respond_to?(:to_h)
156
- model.to_h
157
- else
158
- model
159
- end
160
- raise ArgumentError, 'model must be a Hash describing the embedding model' unless normalized.is_a?(Hash)
161
-
162
- normalized.transform_keys(&:to_sym)
163
- end
164
-
165
- def boolean_or_nil(value)
166
- return nil if value.nil?
167
-
168
- value ? true : false
169
- end
170
- end
171
-
172
- # Language detection configuration
173
- #
174
- # @example
175
- # lang = LanguageDetection.new(enabled: true, min_confidence: 0.8)
176
- #
177
- class LanguageDetection
178
- attr_reader :enabled, :min_confidence, :detect_multiple
179
-
180
- def initialize(enabled: false, min_confidence: 0.5, detect_multiple: false)
181
- @enabled = enabled ? true : false
182
- @min_confidence = min_confidence.to_f
183
- @detect_multiple = detect_multiple ? true : false
184
- end
185
-
186
- def to_h
187
- {
188
- enabled: @enabled,
189
- min_confidence: @min_confidence,
190
- detect_multiple: @detect_multiple
191
- }
192
- end
193
- end
194
-
195
- # PDF-specific options
196
- #
197
- # @example
198
- # pdf = PDF.new(extract_images: true, passwords: ["secret", "backup"])
199
- #
200
- class PDF
201
- attr_reader :extract_images, :passwords, :extract_metadata
202
-
203
- def initialize(
204
- extract_images: false,
205
- passwords: nil,
206
- extract_metadata: true
207
- )
208
- @extract_images = extract_images ? true : false
209
- @passwords = if passwords.is_a?(Array)
210
- passwords.map(&:to_s)
211
- else
212
- (passwords ? [passwords.to_s] : nil)
213
- end
214
- @extract_metadata = extract_metadata ? true : false
215
- end
216
-
217
- def to_h
218
- {
219
- extract_images: @extract_images,
220
- passwords: @passwords,
221
- extract_metadata: @extract_metadata
222
- }.compact
223
- end
224
- end
225
-
226
- # Image extraction configuration
227
- #
228
- # @example
229
- # image = ImageExtraction.new(extract_images: true, target_dpi: 300)
230
- #
231
- # @example With auto-adjust DPI
232
- # image = ImageExtraction.new(
233
- # extract_images: true,
234
- # auto_adjust_dpi: true,
235
- # min_dpi: 150,
236
- # max_dpi: 600
237
- # )
238
- #
239
- class ImageExtraction
240
- attr_reader :extract_images, :target_dpi, :max_image_dimension,
241
- :auto_adjust_dpi, :min_dpi, :max_dpi
242
-
243
- def initialize(
244
- extract_images: true,
245
- target_dpi: 300,
246
- max_image_dimension: 2000,
247
- auto_adjust_dpi: true,
248
- min_dpi: 150,
249
- max_dpi: 600
250
- )
251
- @extract_images = extract_images ? true : false
252
- @target_dpi = target_dpi.to_i
253
- @max_image_dimension = max_image_dimension.to_i
254
- @auto_adjust_dpi = auto_adjust_dpi ? true : false
255
- @min_dpi = min_dpi.to_i
256
- @max_dpi = max_dpi.to_i
257
- end
258
-
259
- def to_h
260
- {
261
- extract_images: @extract_images,
262
- target_dpi: @target_dpi,
263
- max_image_dimension: @max_image_dimension,
264
- auto_adjust_dpi: @auto_adjust_dpi,
265
- min_dpi: @min_dpi,
266
- max_dpi: @max_dpi
267
- }
268
- end
269
- end
270
-
271
- # Image preprocessing configuration for OCR
272
- #
273
- # @example Basic preprocessing
274
- # preprocessing = ImagePreprocessing.new(
275
- # binarization_method: "otsu",
276
- # denoise: true
277
- # )
278
- #
279
- # @example Advanced preprocessing
280
- # preprocessing = ImagePreprocessing.new(
281
- # target_dpi: 600,
282
- # auto_rotate: true,
283
- # deskew: true,
284
- # denoise: true,
285
- # contrast_enhance: true,
286
- # binarization_method: "sauvola",
287
- # invert_colors: false
288
- # )
289
- #
290
- class ImagePreprocessing
291
- attr_reader :target_dpi, :auto_rotate, :deskew, :denoise,
292
- :contrast_enhance, :binarization_method, :invert_colors
293
-
294
- def initialize(
295
- target_dpi: 300,
296
- auto_rotate: true,
297
- deskew: true,
298
- denoise: false,
299
- contrast_enhance: true,
300
- binarization_method: 'otsu',
301
- invert_colors: false
302
- )
303
- @target_dpi = target_dpi.to_i
304
- @auto_rotate = auto_rotate ? true : false
305
- @deskew = deskew ? true : false
306
- @denoise = denoise ? true : false
307
- @contrast_enhance = contrast_enhance ? true : false
308
- @binarization_method = binarization_method.to_s
309
- @invert_colors = invert_colors ? true : false
310
-
311
- valid_methods = %w[otsu sauvola adaptive]
312
- return if valid_methods.include?(@binarization_method)
313
-
314
- raise ArgumentError, "binarization_method must be one of: #{valid_methods.join(', ')}"
315
- end
316
-
317
- def to_h
318
- {
319
- target_dpi: @target_dpi,
320
- auto_rotate: @auto_rotate,
321
- deskew: @deskew,
322
- denoise: @denoise,
323
- contrast_enhance: @contrast_enhance,
324
- binarization_method: @binarization_method,
325
- invert_colors: @invert_colors
326
- }
327
- end
328
- end
329
-
330
- # Token reduction configuration
331
- #
332
- # @example Disable token reduction
333
- # token = TokenReduction.new(mode: "off")
334
- #
335
- # @example Light reduction
336
- # token = TokenReduction.new(mode: "light", preserve_important_words: true)
337
- #
338
- # @example Aggressive reduction
339
- # token = TokenReduction.new(mode: "aggressive", preserve_important_words: false)
340
- #
341
- class TokenReduction
342
- attr_reader :mode, :preserve_important_words
343
-
344
- def initialize(mode: 'off', preserve_important_words: true)
345
- @mode = mode.to_s
346
- @preserve_important_words = preserve_important_words ? true : false
347
-
348
- valid_modes = %w[off light moderate aggressive maximum]
349
- return if valid_modes.include?(@mode)
350
-
351
- raise ArgumentError, "mode must be one of: #{valid_modes.join(', ')}"
352
- end
353
-
354
- def to_h
355
- {
356
- mode: @mode,
357
- preserve_important_words: @preserve_important_words
358
- }
359
- end
360
- end
361
-
362
- # HTML preprocessing configuration for content extraction
363
- class HtmlPreprocessing
364
- attr_reader :enabled, :preset, :remove_navigation, :remove_forms
365
-
366
- def initialize(enabled: nil, preset: nil, remove_navigation: nil, remove_forms: nil)
367
- @enabled = boolean_or_nil(enabled)
368
- @preset = preset&.to_sym
369
- @remove_navigation = boolean_or_nil(remove_navigation)
370
- @remove_forms = boolean_or_nil(remove_forms)
371
- end
372
-
373
- def to_h
374
- {
375
- enabled: @enabled,
376
- preset: @preset,
377
- remove_navigation: @remove_navigation,
378
- remove_forms: @remove_forms
379
- }.compact
380
- end
381
-
382
- private
383
-
384
- def boolean_or_nil(value)
385
- return nil if value.nil?
386
-
387
- value ? true : false
388
- end
389
- end
390
-
391
- # HTML rendering options for document conversion
392
- class HtmlOptions
393
- attr_reader :options
394
-
395
- def initialize(**options)
396
- normalized = options.transform_keys(&:to_sym)
397
- symbol_keys = %i[
398
- heading_style
399
- code_block_style
400
- highlight_style
401
- list_indent_type
402
- newline_style
403
- whitespace_mode
404
- ]
405
- symbol_keys.each do |key|
406
- normalized[key] = normalized[key]&.to_sym if normalized.key?(key)
407
- end
408
- if normalized[:preprocessing].is_a?(Hash)
409
- normalized[:preprocessing] = HtmlPreprocessing.new(**normalized[:preprocessing])
410
- end
411
- @options = normalized
412
- end
413
-
414
- def to_h
415
- @options.transform_values { |value| value.respond_to?(:to_h) ? value.to_h : value }
416
- end
417
- end
418
-
419
- # YAKE keyword extraction parameters
420
- class KeywordYakeParams
421
- attr_reader :window_size
422
-
423
- def initialize(window_size: 2)
424
- @window_size = window_size.to_i
425
- end
426
-
427
- def to_h
428
- { window_size: @window_size }
429
- end
430
- end
431
-
432
- # RAKE keyword extraction parameters
433
- class KeywordRakeParams
434
- attr_reader :min_word_length, :max_words_per_phrase
435
-
436
- def initialize(min_word_length: 1, max_words_per_phrase: 3)
437
- @min_word_length = min_word_length.to_i
438
- @max_words_per_phrase = max_words_per_phrase.to_i
439
- end
440
-
441
- def to_h
442
- {
443
- min_word_length: @min_word_length,
444
- max_words_per_phrase: @max_words_per_phrase
445
- }
446
- end
447
- end
448
-
449
- # Keyword extraction configuration for document analysis
450
- class Keywords
451
- attr_reader :algorithm, :max_keywords, :min_score, :ngram_range,
452
- :language, :yake_params, :rake_params
453
-
454
- def initialize(
455
- algorithm: nil,
456
- max_keywords: nil,
457
- min_score: nil,
458
- ngram_range: nil,
459
- language: nil,
460
- yake_params: nil,
461
- rake_params: nil
462
- )
463
- @algorithm = algorithm&.to_s
464
- @max_keywords = max_keywords&.to_i
465
- @min_score = min_score&.to_f
466
- @ngram_range = ngram_range&.map(&:to_i)
467
- @language = language&.to_s
468
- @yake_params = normalize_nested(yake_params, KeywordYakeParams)
469
- @rake_params = normalize_nested(rake_params, KeywordRakeParams)
470
- end
471
-
472
- def to_h
473
- {
474
- algorithm: @algorithm,
475
- max_keywords: @max_keywords,
476
- min_score: @min_score,
477
- ngram_range: @ngram_range,
478
- language: @language,
479
- yake_params: @yake_params&.to_h,
480
- rake_params: @rake_params&.to_h
481
- }.compact
482
- end
483
-
484
- private
485
-
486
- def normalize_nested(value, klass)
487
- return nil if value.nil?
488
- return value if value.is_a?(klass)
489
- return klass.new(**value.transform_keys(&:to_sym)) if value.is_a?(Hash)
490
-
491
- raise ArgumentError, "Expected #{klass}, Hash, or nil, got #{value.class}"
492
- end
493
- end
494
-
495
- # Page tracking configuration for multi-page documents
496
- #
497
- # @example Enable page extraction
498
- # pages = PageConfig.new(extract_pages: true)
499
- #
500
- # @example Enable page markers in content
501
- # pages = PageConfig.new(insert_page_markers: true, marker_format: "--- PAGE {page_num} ---")
502
- #
503
- class PageConfig
504
- attr_reader :extract_pages, :insert_page_markers, :marker_format
505
-
506
- def initialize(
507
- extract_pages: false,
508
- insert_page_markers: false,
509
- marker_format: "\n\n<!-- PAGE {page_num} -->\n\n"
510
- )
511
- @extract_pages = extract_pages ? true : false
512
- @insert_page_markers = insert_page_markers ? true : false
513
- @marker_format = marker_format.to_s
514
- end
515
-
516
- def to_h
517
- {
518
- extract_pages: @extract_pages,
519
- insert_page_markers: @insert_page_markers,
520
- marker_format: @marker_format
521
- }
522
- end
523
- end
524
-
525
- # Post-processor configuration
526
- #
527
- # @example Enable all post-processors
528
- # postprocessor = PostProcessor.new(enabled: true)
529
- #
530
- # @example Enable specific processors
531
- # postprocessor = PostProcessor.new(
532
- # enabled: true,
533
- # enabled_processors: ["quality", "formatting"]
534
- # )
535
- #
536
- # @example Disable specific processors
537
- # postprocessor = PostProcessor.new(
538
- # enabled: true,
539
- # disabled_processors: ["token_reduction"]
540
- # )
541
- #
542
- class PostProcessor
543
- attr_reader :enabled, :enabled_processors, :disabled_processors
544
-
545
- def initialize(
546
- enabled: true,
547
- enabled_processors: nil,
548
- disabled_processors: nil
549
- )
550
- @enabled = enabled ? true : false
551
- @enabled_processors = enabled_processors&.map(&:to_s)
552
- @disabled_processors = disabled_processors&.map(&:to_s)
553
- end
554
-
555
- def to_h
556
- {
557
- enabled: @enabled,
558
- enabled_processors: @enabled_processors,
559
- disabled_processors: @disabled_processors
560
- }.compact
561
- end
562
- end
563
-
564
- # Main extraction configuration
565
- #
566
- # @example Basic usage
567
- # config = Extraction.new(use_cache: true, force_ocr: true)
568
- #
569
- # @example With OCR
570
- # ocr = Config::OCR.new(backend: "tesseract", language: "eng")
571
- # config = Extraction.new(ocr: ocr)
572
- #
573
- # @example With image extraction
574
- # image = Config::ImageExtraction.new(extract_images: true, target_dpi: 600)
575
- # config = Extraction.new(image_extraction: image)
576
- #
577
- # @example With preprocessing
578
- # preprocessing = Config::ImagePreprocessing.new(
579
- # binarization_method: "sauvola",
580
- # denoise: true
581
- # )
582
- # config = Extraction.new(image_preprocessing: preprocessing)
583
- #
584
- # @example With post-processing
585
- # postprocessor = Config::PostProcessor.new(
586
- # enabled: true,
587
- # enabled_processors: ["quality"]
588
- # )
589
- # config = Extraction.new(postprocessor: postprocessor)
590
- #
591
- # @example With all options
592
- # config = Extraction.new(
593
- # use_cache: true,
594
- # enable_quality_processing: true,
595
- # force_ocr: false,
596
- # ocr: Config::OCR.new(language: "deu"),
597
- # chunking: Config::Chunking.new(max_chars: 500),
598
- # language_detection: Config::LanguageDetection.new(enabled: true),
599
- # pdf_options: Config::PDF.new(extract_images: true, passwords: ["secret"]),
600
- # image_extraction: Config::ImageExtraction.new(target_dpi: 600),
601
- # image_preprocessing: Config::ImagePreprocessing.new(denoise: true),
602
- # postprocessor: Config::PostProcessor.new(enabled: true)
603
- # )
604
- #
605
- class Extraction
606
- attr_reader :use_cache, :enable_quality_processing, :force_ocr,
607
- :ocr, :chunking, :language_detection, :pdf_options,
608
- :image_extraction, :image_preprocessing, :postprocessor,
609
- :token_reduction, :keywords, :html_options, :pages,
610
- :max_concurrent_extractions
611
-
612
- # Load configuration from a file.
613
- #
614
- # Detects the file format from the extension (.toml, .yaml, .json)
615
- # and loads the configuration accordingly.
616
- #
617
- # @param path [String] Path to the configuration file
618
- # @return [Kreuzberg::Config::Extraction] Loaded configuration object
619
- #
620
- # @example Load from TOML
621
- # config = Kreuzberg::Config::Extraction.from_file("config.toml")
622
- #
623
- # @example Load from YAML
624
- # config = Kreuzberg::Config::Extraction.from_file("config.yaml")
625
- #
626
- def self.from_file(path)
627
- hash = Kreuzberg._config_from_file_native(path)
628
- # Convert string keys to symbols for keyword arguments
629
- new(**hash.transform_keys(&:to_sym))
630
- end
631
-
632
- # Discover configuration file in current or parent directories.
633
- #
634
- # Searches for kreuzberg.toml, kreuzberg.yaml, or kreuzberg.json in the current
635
- # directory and parent directories.
636
- #
637
- # @return [Kreuzberg::Config::Extraction, nil] Loaded configuration object or nil if not found
638
- #
639
- # @example
640
- # config = Kreuzberg::Config::Extraction.discover
641
- # if config
642
- # # Use discovered config
643
- # end
644
- #
645
- def self.discover
646
- hash = Kreuzberg._config_discover_native
647
- return nil if hash.nil?
648
-
649
- # Convert string keys to symbols for keyword arguments
650
- new(**hash.transform_keys(&:to_sym))
651
- end
652
-
653
- def initialize(
654
- use_cache: true,
655
- enable_quality_processing: false,
656
- force_ocr: false,
657
- ocr: nil,
658
- chunking: nil,
659
- language_detection: nil,
660
- pdf_options: nil,
661
- image_extraction: nil,
662
- image_preprocessing: nil,
663
- postprocessor: nil,
664
- token_reduction: nil,
665
- keywords: nil,
666
- html_options: nil,
667
- pages: nil,
668
- max_concurrent_extractions: nil
669
- )
670
- @use_cache = use_cache ? true : false
671
- @enable_quality_processing = enable_quality_processing ? true : false
672
- @force_ocr = force_ocr ? true : false
673
- @ocr = normalize_config(ocr, OCR)
674
- @chunking = normalize_config(chunking, Chunking)
675
- @language_detection = normalize_config(language_detection, LanguageDetection)
676
- @pdf_options = normalize_config(pdf_options, PDF)
677
- @image_extraction = normalize_config(image_extraction, ImageExtraction)
678
- @image_preprocessing = normalize_config(image_preprocessing, ImagePreprocessing)
679
- @postprocessor = normalize_config(postprocessor, PostProcessor)
680
- @token_reduction = normalize_config(token_reduction, TokenReduction)
681
- @keywords = normalize_config(keywords, Keywords)
682
- @html_options = normalize_config(html_options, HtmlOptions)
683
- @pages = normalize_config(pages, PageConfig)
684
- @max_concurrent_extractions = max_concurrent_extractions&.to_i
685
- end
686
-
687
- # rubocop:disable Metrics/CyclomaticComplexity
688
- def to_h
689
- {
690
- use_cache: @use_cache,
691
- enable_quality_processing: @enable_quality_processing,
692
- force_ocr: @force_ocr,
693
- ocr: @ocr&.to_h,
694
- chunking: @chunking&.to_h,
695
- language_detection: @language_detection&.to_h,
696
- pdf_options: @pdf_options&.to_h,
697
- image_extraction: @image_extraction&.to_h,
698
- image_preprocessing: @image_preprocessing&.to_h,
699
- postprocessor: @postprocessor&.to_h,
700
- token_reduction: @token_reduction&.to_h,
701
- keywords: @keywords&.to_h,
702
- html_options: @html_options&.to_h,
703
- pages: @pages&.to_h,
704
- max_concurrent_extractions: @max_concurrent_extractions
705
- }.compact
706
- end
707
- # rubocop:enable Metrics/CyclomaticComplexity
708
-
709
- private
710
-
711
- def normalize_config(value, klass)
712
- return nil if value.nil?
713
- return value if value.is_a?(klass)
714
- # Convert string keys to symbols for keyword arguments
715
- return klass.new(**value.transform_keys(&:to_sym)) if value.is_a?(Hash)
716
-
717
- raise ArgumentError, "Expected #{klass}, Hash, or nil, got #{value.class}"
718
- end
719
- end
720
-
721
- # Backwards compatibility aliases
722
- Ocr = OCR
723
- end
724
- end
1
+ # frozen_string_literal: true
2
+
3
+ module Kreuzberg
4
+ module Config
5
# OCR configuration.
#
# Stores the OCR backend name, the recognition language and an optional
# nested Tesseract configuration.
#
# @example
#   ocr = OCR.new(backend: "tesseract", language: "eng")
#
class OCR
  attr_reader :backend, :language, :tesseract_config

  # @param backend [#to_s] backend name, stored as a String
  # @param language [#to_s] language code, stored as a String
  # @param tesseract_config [Tesseract, Hash, nil] nested engine settings
  # @raise [ArgumentError] if tesseract_config is none of Tesseract, Hash, nil
  def initialize(backend: 'tesseract', language: 'eng', tesseract_config: nil)
    @backend = backend.to_s
    @language = language.to_s
    @tesseract_config = normalize_tesseract_config(tesseract_config)
  end

  # @return [Hash{Symbol => Object}] the settings, with nil entries dropped
  def to_h
    serialized = {
      backend: @backend,
      language: @language,
      tesseract_config: @tesseract_config&.to_h
    }
    serialized.reject { |_key, entry| entry.nil? }
  end

  private

  # Passes nil and Tesseract instances through; builds a Tesseract from a
  # Hash after symbolizing its keys; rejects everything else.
  def normalize_tesseract_config(value)
    case value
    when nil then nil
    when Tesseract then value
    when Hash then Tesseract.new(**value.transform_keys(&:to_sym))
    else raise ArgumentError, "Expected #{Tesseract}, Hash, or nil, got #{value.class}"
    end
  end
end
41
+
42
# Tesseract OCR engine configuration.
#
# A free-form option bag: keys are symbolized, and a Hash given under
# :preprocessing is wrapped in an ImagePreprocessing instance.
class Tesseract
  attr_reader :options

  # @param options [Hash] arbitrary engine options
  # @raise [ArgumentError] if :preprocessing is neither nil, an
  #   ImagePreprocessing nor a Hash
  def initialize(**options)
    @options = options.transform_keys(&:to_sym)
    coerce_preprocessing!
  end

  # @return [Hash{Symbol => Object}] shallow copy of the stored options
  def to_h
    @options.dup
  end

  private

  # Replaces a Hash stored under :preprocessing with an ImagePreprocessing
  # built from it; nil and ready-made instances are left alone.
  def coerce_preprocessing!
    raw = @options[:preprocessing]
    case raw
    when nil, ImagePreprocessing
      nil
    when Hash
      @options[:preprocessing] = ImagePreprocessing.new(**raw.transform_keys(&:to_sym))
    else
      raise ArgumentError, "preprocessing must be #{ImagePreprocessing} or Hash"
    end
  end
end
67
+
68
# Chunking configuration.
#
# The size and overlap can each be given under two names: chunk_size takes
# precedence over max_chars, and chunk_overlap over max_overlap.
#
# @example
#   chunking = Chunking.new(max_chars: 1000, max_overlap: 200)
#
class Chunking
  attr_reader :max_chars, :max_overlap, :preset, :embedding, :enabled

  # @param max_chars [#to_i, nil] chunk size, used when chunk_size is not given
  # @param max_overlap [#to_i, nil] overlap, used when chunk_overlap is not given
  # @param preset [#to_s, nil] preset name
  # @param embedding [Embedding, Hash, nil] nested embedding settings
  # @param chunk_size [#to_i, nil] chunk size (falls back to max_chars, then 1000)
  # @param chunk_overlap [#to_i, nil] overlap (falls back to max_overlap, then 200)
  # @param enabled [Object, nil] nil is kept as nil, anything else becomes true/false
  # @raise [ArgumentError] if embedding is none of Embedding, Hash, nil
  def initialize(max_chars: nil, max_overlap: nil, preset: nil, embedding: nil,
                 chunk_size: nil, chunk_overlap: nil, enabled: true)
    @max_chars = (chunk_size || max_chars || 1000).to_i
    @max_overlap = (chunk_overlap || max_overlap || 200).to_i
    @preset = preset.nil? ? nil : preset.to_s
    @embedding = normalize_embedding(embedding)
    @enabled = tri_state(enabled)
  end

  # @return [Hash{Symbol => Object}] settings without nil entries; :enabled
  #   is appended last and only when it is not nil
  def to_h
    payload = {
      max_chars: @max_chars,
      max_overlap: @max_overlap,
      preset: @preset,
      embedding: @embedding&.to_h
    }.compact
    # @type var payload: Hash[Symbol, untyped]
    payload[:enabled] = @enabled unless @enabled.nil?
    payload
  end

  private

  # Passes nil and Embedding instances through; builds an Embedding from a
  # Hash after symbolizing its keys; rejects everything else.
  def normalize_embedding(value)
    case value
    when nil then nil
    when Embedding then value
    when Hash then Embedding.new(**value.transform_keys(&:to_sym))
    else raise ArgumentError, "Expected #{Embedding}, Hash, or nil, got #{value.class}"
    end
  end

  # Keeps nil as nil and maps any other value onto true/false.
  def tri_state(flag)
    return nil if flag.nil?

    flag ? true : false
  end
end
123
+
124
# Embedding model configuration for document chunking.
class Embedding
  attr_reader :model, :normalize, :batch_size, :show_download_progress, :cache_dir

  # @param model [#to_h] description of the embedding model; must convert to a Hash
  # @param normalize [Object, nil] nil is kept, anything else becomes true/false
  # @param batch_size [#to_i, nil] batch size
  # @param show_download_progress [Object, nil] nil is kept, anything else becomes true/false
  # @param cache_dir [#to_s, nil] cache directory
  # @raise [ArgumentError] if model does not end up as a Hash
  def initialize(model: { type: :preset, name: 'balanced' }, normalize: true, batch_size: 32,
                 show_download_progress: false, cache_dir: nil)
    @model = normalize_model(model)
    @normalize = tri_state(normalize)
    @batch_size = batch_size.nil? ? nil : batch_size.to_i
    @show_download_progress = tri_state(show_download_progress)
    @cache_dir = cache_dir.nil? ? nil : cache_dir.to_s
  end

  # @return [Hash{Symbol => Object}] settings with nil entries dropped
  def to_h
    serialized = {
      model: @model,
      normalize: @normalize,
      batch_size: @batch_size,
      show_download_progress: @show_download_progress,
      cache_dir: @cache_dir
    }
    serialized.compact
  end

  private

  # Converts the model via to_h when it offers one, insists on a Hash and
  # returns it with symbolized keys.
  def normalize_model(model)
    candidate = model.respond_to?(:to_h) ? model.to_h : model
    unless candidate.is_a?(Hash)
      raise ArgumentError, 'model must be a Hash describing the embedding model'
    end

    candidate.transform_keys(&:to_sym)
  end

  # Keeps nil as nil and maps any other value onto true/false.
  def tri_state(flag)
    return nil if flag.nil?

    flag ? true : false
  end
end
171
+
172
# Language detection configuration.
#
# @example
#   lang = LanguageDetection.new(enabled: true, min_confidence: 0.8)
#
class LanguageDetection
  attr_reader :enabled, :min_confidence, :detect_multiple

  # @param enabled [Object] truthiness is stored as true/false
  # @param min_confidence [#to_f] stored as a Float
  # @param detect_multiple [Object] truthiness is stored as true/false
  def initialize(enabled: false, min_confidence: 0.5, detect_multiple: false)
    @enabled = to_flag(enabled)
    @min_confidence = min_confidence.to_f
    @detect_multiple = to_flag(detect_multiple)
  end

  # @return [Hash{Symbol => Object}] all three settings
  def to_h
    { enabled: @enabled, min_confidence: @min_confidence, detect_multiple: @detect_multiple }
  end

  private

  # Collapses any value to a strict boolean.
  def to_flag(value)
    value ? true : false
  end
end
194
+
195
# PDF-specific options.
#
# @example
#   pdf = PDF.new(extract_images: true, passwords: ["secret", "backup"])
#
class PDF
  attr_reader :extract_images, :passwords, :extract_metadata

  # @param extract_images [Object] truthiness is stored as true/false
  # @param passwords [Array<#to_s>, #to_s, nil] one password or a list;
  #   a single value is wrapped in an Array, nil/false yields nil
  # @param extract_metadata [Object] truthiness is stored as true/false
  def initialize(extract_images: false, passwords: nil, extract_metadata: true)
    @extract_images = to_flag(extract_images)
    @passwords = case passwords
                 when Array then passwords.map(&:to_s)
                 when nil, false then nil
                 else [passwords.to_s]
                 end
    @extract_metadata = to_flag(extract_metadata)
  end

  # @return [Hash{Symbol => Object}] settings with nil entries dropped
  def to_h
    serialized = {
      extract_images: @extract_images,
      passwords: @passwords,
      extract_metadata: @extract_metadata
    }
    serialized.compact
  end

  private

  # Collapses any value to a strict boolean.
  def to_flag(value)
    value ? true : false
  end
end
225
+
226
# Image extraction configuration.
#
# @example
#   image = ImageExtraction.new(extract_images: true, target_dpi: 300)
#
# @example With auto-adjust DPI
#   image = ImageExtraction.new(
#     extract_images: true,
#     auto_adjust_dpi: true,
#     min_dpi: 150,
#     max_dpi: 600
#   )
#
class ImageExtraction
  attr_reader :extract_images, :target_dpi, :max_image_dimension,
              :auto_adjust_dpi, :min_dpi, :max_dpi

  # Boolean options are reduced to true/false; numeric options are
  # converted with to_i.
  #
  # @param extract_images [Object]
  # @param target_dpi [#to_i]
  # @param max_image_dimension [#to_i]
  # @param auto_adjust_dpi [Object]
  # @param min_dpi [#to_i]
  # @param max_dpi [#to_i]
  def initialize(extract_images: true, target_dpi: 300, max_image_dimension: 2000,
                 auto_adjust_dpi: true, min_dpi: 150, max_dpi: 600)
    @extract_images = to_flag(extract_images)
    @target_dpi = target_dpi.to_i
    @max_image_dimension = max_image_dimension.to_i
    @auto_adjust_dpi = to_flag(auto_adjust_dpi)
    @min_dpi = min_dpi.to_i
    @max_dpi = max_dpi.to_i
  end

  # @return [Hash{Symbol => Object}] all six settings
  def to_h
    fields = %i[extract_images target_dpi max_image_dimension auto_adjust_dpi min_dpi max_dpi]
    fields.each_with_object({}) do |field, serialized|
      serialized[field] = instance_variable_get(:"@#{field}")
    end
  end

  private

  # Collapses any value to a strict boolean.
  def to_flag(value)
    value ? true : false
  end
end
270
+
271
# Image preprocessing configuration for OCR.
#
# @example Basic preprocessing
#   preprocessing = ImagePreprocessing.new(
#     binarization_method: "otsu",
#     denoise: true
#   )
#
# @example Advanced preprocessing
#   preprocessing = ImagePreprocessing.new(
#     target_dpi: 600,
#     auto_rotate: true,
#     deskew: true,
#     denoise: true,
#     contrast_enhance: true,
#     binarization_method: "sauvola",
#     invert_colors: false
#   )
#
class ImagePreprocessing
  attr_reader :target_dpi, :auto_rotate, :deskew, :denoise,
              :contrast_enhance, :binarization_method, :invert_colors

  # Boolean options are reduced to true/false, target_dpi is converted with
  # to_i and binarization_method with to_s before it is validated.
  #
  # @raise [ArgumentError] if binarization_method is not otsu, sauvola or adaptive
  def initialize(target_dpi: 300, auto_rotate: true, deskew: true, denoise: false,
                 contrast_enhance: true, binarization_method: 'otsu', invert_colors: false)
    @target_dpi = target_dpi.to_i
    @auto_rotate = to_flag(auto_rotate)
    @deskew = to_flag(deskew)
    @denoise = to_flag(denoise)
    @contrast_enhance = to_flag(contrast_enhance)
    @binarization_method = binarization_method.to_s
    @invert_colors = to_flag(invert_colors)

    allowed = %w[otsu sauvola adaptive]
    unless allowed.include?(@binarization_method)
      raise ArgumentError, "binarization_method must be one of: #{allowed.join(', ')}"
    end
  end

  # @return [Hash{Symbol => Object}] all seven settings
  def to_h
    fields = %i[target_dpi auto_rotate deskew denoise contrast_enhance binarization_method invert_colors]
    fields.each_with_object({}) do |field, serialized|
      serialized[field] = instance_variable_get(:"@#{field}")
    end
  end

  private

  # Collapses any value to a strict boolean.
  def to_flag(value)
    value ? true : false
  end
end
329
+
330
# Token reduction configuration.
#
# @example Disable token reduction
#   token = TokenReduction.new(mode: "off")
#
# @example Light reduction
#   token = TokenReduction.new(mode: "light", preserve_important_words: true)
#
# @example Aggressive reduction
#   token = TokenReduction.new(mode: "aggressive", preserve_important_words: false)
#
class TokenReduction
  attr_reader :mode, :preserve_important_words

  # @param mode [#to_s] one of off, light, moderate, aggressive, maximum
  # @param preserve_important_words [Object] truthiness is stored as true/false
  # @raise [ArgumentError] if mode is not one of the accepted values
  def initialize(mode: 'off', preserve_important_words: true)
    @mode = mode.to_s
    @preserve_important_words = preserve_important_words ? true : false

    accepted = %w[off light moderate aggressive maximum]
    raise ArgumentError, "mode must be one of: #{accepted.join(', ')}" unless accepted.include?(@mode)
  end

  # @return [Hash{Symbol => Object}] both settings
  def to_h
    serialized = {}
    serialized[:mode] = @mode
    serialized[:preserve_important_words] = @preserve_important_words
    serialized
  end
end
361
+
362
# HTML preprocessing configuration for content extraction.
#
# Every option is optional: nil means "not set" and is left out of #to_h.
class HtmlPreprocessing
  attr_reader :enabled, :preset, :remove_navigation, :remove_forms

  # @param enabled [Object, nil] nil is kept, anything else becomes true/false
  # @param preset [#to_sym, nil] stored as a Symbol
  # @param remove_navigation [Object, nil] nil is kept, anything else becomes true/false
  # @param remove_forms [Object, nil] nil is kept, anything else becomes true/false
  def initialize(enabled: nil, preset: nil, remove_navigation: nil, remove_forms: nil)
    @enabled = tri_state(enabled)
    @preset = preset.nil? ? nil : preset.to_sym
    @remove_navigation = tri_state(remove_navigation)
    @remove_forms = tri_state(remove_forms)
  end

  # @return [Hash{Symbol => Object}] only the options that were set
  def to_h
    serialized = {
      enabled: @enabled,
      preset: @preset,
      remove_navigation: @remove_navigation,
      remove_forms: @remove_forms
    }
    serialized.reject { |_key, entry| entry.nil? }
  end

  private

  # Keeps nil as nil and maps any other value onto true/false.
  def tri_state(flag)
    flag.nil? ? nil : (flag ? true : false)
  end
end
390
+
391
# HTML rendering options for document conversion.
#
# A free-form option bag. Keys are symbolized, the style-like options listed
# in #initialize are stored as Symbols, and a Hash given under :preprocessing
# is wrapped in an HtmlPreprocessing instance.
class HtmlOptions
  attr_reader :options

  # @param options [Hash] arbitrary rendering options; keys may be Strings
  #   or Symbols, including the keys of a nested :preprocessing Hash
  def initialize(**options)
    normalized = options.transform_keys(&:to_sym)
    symbol_keys = %i[
      heading_style
      code_block_style
      highlight_style
      list_indent_type
      newline_style
      whitespace_mode
    ]
    symbol_keys.each do |key|
      normalized[key] = normalized[key]&.to_sym if normalized.key?(key)
    end

    nested = normalized[:preprocessing]
    if nested.is_a?(Hash)
      # Symbolize first, like every other nested-config normalizer in this
      # file: string keys cannot be splatted into HtmlPreprocessing's
      # keyword-only initializer.
      normalized[:preprocessing] = HtmlPreprocessing.new(**nested.transform_keys(&:to_sym))
    end
    @options = normalized
  end

  # @return [Hash{Symbol => Object}] the options, with nested configuration
  #   objects converted through their own to_h
  def to_h
    @options.transform_values { |value| serialize_value(value) }
  end

  private

  # nil and Array both respond to to_h, but converting them is wrong here:
  # nil would turn into {} and a plain list such as %w[script style] would
  # raise TypeError. They are passed through unchanged.
  def serialize_value(value)
    return value if value.nil? || value.is_a?(Array)

    value.respond_to?(:to_h) ? value.to_h : value
  end
end
418
+
419
# YAKE keyword extraction parameters.
class KeywordYakeParams
  attr_reader :window_size

  # @param window_size [#to_i] stored as an Integer
  def initialize(window_size: 2)
    @window_size = window_size.to_i
  end

  # @return [Hash{Symbol => Integer}] the single window_size setting
  def to_h
    serialized = {}
    serialized[:window_size] = @window_size
    serialized
  end
end
431
+
432
# RAKE keyword extraction parameters.
class KeywordRakeParams
  attr_reader :min_word_length, :max_words_per_phrase

  # @param min_word_length [#to_i] stored as an Integer
  # @param max_words_per_phrase [#to_i] stored as an Integer
  def initialize(min_word_length: 1, max_words_per_phrase: 3)
    @min_word_length = min_word_length.to_i
    @max_words_per_phrase = max_words_per_phrase.to_i
  end

  # @return [Hash{Symbol => Integer}] both settings
  def to_h
    serialized = {}
    serialized[:min_word_length] = @min_word_length
    serialized[:max_words_per_phrase] = @max_words_per_phrase
    serialized
  end
end
448
+
449
# Keyword extraction configuration for document analysis.
#
# Every option is optional: nil means "not set" and is left out of #to_h.
class Keywords
  attr_reader :algorithm, :max_keywords, :min_score, :ngram_range,
              :language, :yake_params, :rake_params

  # @param algorithm [#to_s, nil] stored as a String
  # @param max_keywords [#to_i, nil] stored as an Integer
  # @param min_score [#to_f, nil] stored as a Float
  # @param ngram_range [Enumerable<#to_i>, nil] each element is converted with to_i
  # @param language [#to_s, nil] stored as a String
  # @param yake_params [KeywordYakeParams, Hash, nil] nested YAKE settings
  # @param rake_params [KeywordRakeParams, Hash, nil] nested RAKE settings
  # @raise [ArgumentError] if a nested value has an unsupported type
  def initialize(algorithm: nil, max_keywords: nil, min_score: nil, ngram_range: nil,
                 language: nil, yake_params: nil, rake_params: nil)
    @algorithm = algorithm.nil? ? nil : algorithm.to_s
    @max_keywords = max_keywords.nil? ? nil : max_keywords.to_i
    @min_score = min_score.nil? ? nil : min_score.to_f
    @ngram_range = ngram_range.nil? ? nil : ngram_range.map(&:to_i)
    @language = language.nil? ? nil : language.to_s
    @yake_params = normalize_nested(yake_params, KeywordYakeParams)
    @rake_params = normalize_nested(rake_params, KeywordRakeParams)
  end

  # @return [Hash{Symbol => Object}] only the options that were set; nested
  #   parameter objects are converted through their own to_h
  def to_h
    serialized = {
      algorithm: @algorithm,
      max_keywords: @max_keywords,
      min_score: @min_score,
      ngram_range: @ngram_range,
      language: @language,
      yake_params: @yake_params&.to_h,
      rake_params: @rake_params&.to_h
    }
    serialized.reject { |_key, entry| entry.nil? }
  end

  private

  # Passes nil and instances of klass through; builds a klass from a Hash
  # after symbolizing its keys; rejects everything else.
  def normalize_nested(value, klass)
    case value
    when nil then nil
    when klass then value
    when Hash then klass.new(**value.transform_keys(&:to_sym))
    else raise ArgumentError, "Expected #{klass}, Hash, or nil, got #{value.class}"
    end
  end
end
494
+
495
# Page tracking configuration for multi-page documents.
#
# @example Enable page extraction
#   pages = PageConfig.new(extract_pages: true)
#
# @example Enable page markers in content
#   pages = PageConfig.new(insert_page_markers: true, marker_format: "--- PAGE {page_num} ---")
#
class PageConfig
  attr_reader :extract_pages, :insert_page_markers, :marker_format

  # @param extract_pages [Object] truthiness is stored as true/false
  # @param insert_page_markers [Object] truthiness is stored as true/false
  # @param marker_format [#to_s] marker template, stored as a String
  def initialize(extract_pages: false, insert_page_markers: false,
                 marker_format: "\n\n<!-- PAGE {page_num} -->\n\n")
    @extract_pages = to_flag(extract_pages)
    @insert_page_markers = to_flag(insert_page_markers)
    @marker_format = marker_format.to_s
  end

  # @return [Hash{Symbol => Object}] all three settings
  def to_h
    serialized = {}
    serialized[:extract_pages] = @extract_pages
    serialized[:insert_page_markers] = @insert_page_markers
    serialized[:marker_format] = @marker_format
    serialized
  end

  private

  # Collapses any value to a strict boolean.
  def to_flag(value)
    value ? true : false
  end
end
524
+
525
# Post-processor configuration.
#
# @example Enable all post-processors
#   postprocessor = PostProcessor.new(enabled: true)
#
# @example Enable specific processors
#   postprocessor = PostProcessor.new(
#     enabled: true,
#     enabled_processors: ["quality", "formatting"]
#   )
#
# @example Disable specific processors
#   postprocessor = PostProcessor.new(
#     enabled: true,
#     disabled_processors: ["token_reduction"]
#   )
#
class PostProcessor
  attr_reader :enabled, :enabled_processors, :disabled_processors

  # @param enabled [Object] truthiness is stored as true/false
  # @param enabled_processors [Enumerable<#to_s>, nil] names, stored as Strings
  # @param disabled_processors [Enumerable<#to_s>, nil] names, stored as Strings
  def initialize(enabled: true, enabled_processors: nil, disabled_processors: nil)
    @enabled = enabled ? true : false
    @enabled_processors = stringify_names(enabled_processors)
    @disabled_processors = stringify_names(disabled_processors)
  end

  # @return [Hash{Symbol => Object}] settings with nil entries dropped
  def to_h
    serialized = {
      enabled: @enabled,
      enabled_processors: @enabled_processors,
      disabled_processors: @disabled_processors
    }
    serialized.reject { |_key, entry| entry.nil? }
  end

  private

  # Converts each name with to_s; nil (no list given) stays nil.
  def stringify_names(names)
    return nil if names.nil?

    names.map(&:to_s)
  end
end
563
+
564
# Main extraction configuration.
#
# Bundles three top-level switches, one optional nested configuration object
# per feature area, and an optional concurrency limit. Each nested option
# accepts an instance of its configuration class, a Hash (keys are
# symbolized and passed to that class) or nil.
#
# @example Basic usage
#   config = Extraction.new(use_cache: true, force_ocr: true)
#
# @example With OCR
#   ocr = Config::OCR.new(backend: "tesseract", language: "eng")
#   config = Extraction.new(ocr: ocr)
#
# @example With image extraction
#   image = Config::ImageExtraction.new(extract_images: true, target_dpi: 600)
#   config = Extraction.new(image_extraction: image)
#
# @example With preprocessing
#   preprocessing = Config::ImagePreprocessing.new(
#     binarization_method: "sauvola",
#     denoise: true
#   )
#   config = Extraction.new(image_preprocessing: preprocessing)
#
# @example With post-processing
#   postprocessor = Config::PostProcessor.new(
#     enabled: true,
#     enabled_processors: ["quality"]
#   )
#   config = Extraction.new(postprocessor: postprocessor)
#
# @example With all options
#   config = Extraction.new(
#     use_cache: true,
#     enable_quality_processing: true,
#     force_ocr: false,
#     ocr: Config::OCR.new(language: "deu"),
#     chunking: Config::Chunking.new(max_chars: 500),
#     language_detection: Config::LanguageDetection.new(enabled: true),
#     pdf_options: Config::PDF.new(extract_images: true, passwords: ["secret"]),
#     image_extraction: Config::ImageExtraction.new(target_dpi: 600),
#     image_preprocessing: Config::ImagePreprocessing.new(denoise: true),
#     postprocessor: Config::PostProcessor.new(enabled: true)
#   )
#
class Extraction
  attr_reader :use_cache, :enable_quality_processing, :force_ocr,
              :ocr, :chunking, :language_detection, :pdf_options,
              :image_extraction, :image_preprocessing, :postprocessor,
              :token_reduction, :keywords, :html_options, :pages,
              :max_concurrent_extractions

  # Load configuration from a file.
  #
  # The file is read by Kreuzberg._config_from_file_native; the Hash it
  # returns is passed to .new with its keys symbolized.
  #
  # @param path [String] Path to the configuration file
  # @return [Kreuzberg::Config::Extraction] Loaded configuration object
  #
  # @example Load from TOML
  #   config = Kreuzberg::Config::Extraction.from_file("config.toml")
  #
  # @example Load from YAML
  #   config = Kreuzberg::Config::Extraction.from_file("config.yaml")
  #
  def self.from_file(path)
    loaded = Kreuzberg._config_from_file_native(path)
    new(**loaded.transform_keys(&:to_sym))
  end

  # Discover a configuration file.
  #
  # The lookup is done by Kreuzberg._config_discover_native; when it returns
  # nil, so does this method. Otherwise the returned Hash is passed to .new
  # with its keys symbolized.
  #
  # @return [Kreuzberg::Config::Extraction, nil] Loaded configuration object or nil if not found
  #
  # @example
  #   config = Kreuzberg::Config::Extraction.discover
  #   if config
  #     # Use discovered config
  #   end
  #
  def self.discover
    found = Kreuzberg._config_discover_native
    found.nil? ? nil : new(**found.transform_keys(&:to_sym))
  end

  # @raise [ArgumentError] if a nested option is neither an instance of its
  #   configuration class, a Hash nor nil
  def initialize(use_cache: true, enable_quality_processing: false, force_ocr: false,
                 ocr: nil, chunking: nil, language_detection: nil, pdf_options: nil,
                 image_extraction: nil, image_preprocessing: nil, postprocessor: nil,
                 token_reduction: nil, keywords: nil, html_options: nil, pages: nil,
                 max_concurrent_extractions: nil)
    @use_cache = to_flag(use_cache)
    @enable_quality_processing = to_flag(enable_quality_processing)
    @force_ocr = to_flag(force_ocr)
    @ocr = normalize_config(ocr, OCR)
    @chunking = normalize_config(chunking, Chunking)
    @language_detection = normalize_config(language_detection, LanguageDetection)
    @pdf_options = normalize_config(pdf_options, PDF)
    @image_extraction = normalize_config(image_extraction, ImageExtraction)
    @image_preprocessing = normalize_config(image_preprocessing, ImagePreprocessing)
    @postprocessor = normalize_config(postprocessor, PostProcessor)
    @token_reduction = normalize_config(token_reduction, TokenReduction)
    @keywords = normalize_config(keywords, Keywords)
    @html_options = normalize_config(html_options, HtmlOptions)
    @pages = normalize_config(pages, PageConfig)
    @max_concurrent_extractions = max_concurrent_extractions.nil? ? nil : max_concurrent_extractions.to_i
  end

  # @return [Hash{Symbol => Object}] the switches, every nested section that
  #   is set (converted through its own to_h) and the concurrency limit;
  #   nil entries are dropped
  def to_h
    switches = {
      use_cache: @use_cache,
      enable_quality_processing: @enable_quality_processing,
      force_ocr: @force_ocr
    }
    sections = {
      ocr: @ocr,
      chunking: @chunking,
      language_detection: @language_detection,
      pdf_options: @pdf_options,
      image_extraction: @image_extraction,
      image_preprocessing: @image_preprocessing,
      postprocessor: @postprocessor,
      token_reduction: @token_reduction,
      keywords: @keywords,
      html_options: @html_options,
      pages: @pages
    }
    serialized = switches.merge(sections.transform_values { |section| section&.to_h })
    serialized[:max_concurrent_extractions] = @max_concurrent_extractions
    serialized.compact
  end

  private

  # Passes nil and instances of klass through; builds a klass from a Hash
  # after symbolizing its keys; rejects everything else.
  def normalize_config(value, klass)
    case value
    when nil then nil
    when klass then value
    when Hash then klass.new(**value.transform_keys(&:to_sym))
    else raise ArgumentError, "Expected #{klass}, Hash, or nil, got #{value.class}"
    end
  end

  # Collapses any value to a strict boolean.
  def to_flag(value)
    value ? true : false
  end
end
720
+
721
+ # Backwards compatibility aliases
722
+ Ocr = OCR
723
+ end
724
+ end