kreuzberg 4.0.0.rc1 → 4.0.0.rc2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (342) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +14 -8
  3. data/.rspec +3 -3
  4. data/.rubocop.yaml +1 -534
  5. data/.rubocop.yml +538 -0
  6. data/Gemfile +8 -9
  7. data/Gemfile.lock +9 -109
  8. data/README.md +426 -421
  9. data/Rakefile +25 -25
  10. data/Steepfile +47 -47
  11. data/examples/async_patterns.rb +341 -340
  12. data/ext/kreuzberg_rb/extconf.rb +45 -35
  13. data/ext/kreuzberg_rb/native/Cargo.lock +6535 -0
  14. data/ext/kreuzberg_rb/native/Cargo.toml +44 -36
  15. data/ext/kreuzberg_rb/native/README.md +425 -425
  16. data/ext/kreuzberg_rb/native/build.rs +15 -17
  17. data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -11
  18. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -14
  19. data/ext/kreuzberg_rb/native/include/strings.h +20 -20
  20. data/ext/kreuzberg_rb/native/include/unistd.h +47 -47
  21. data/ext/kreuzberg_rb/native/src/lib.rs +2998 -2939
  22. data/extconf.rb +28 -28
  23. data/kreuzberg.gemspec +148 -105
  24. data/lib/kreuzberg/api_proxy.rb +142 -142
  25. data/lib/kreuzberg/cache_api.rb +46 -45
  26. data/lib/kreuzberg/cli.rb +55 -55
  27. data/lib/kreuzberg/cli_proxy.rb +127 -127
  28. data/lib/kreuzberg/config.rb +691 -684
  29. data/lib/kreuzberg/error_context.rb +32 -0
  30. data/lib/kreuzberg/errors.rb +118 -50
  31. data/lib/kreuzberg/extraction_api.rb +85 -84
  32. data/lib/kreuzberg/mcp_proxy.rb +186 -186
  33. data/lib/kreuzberg/ocr_backend_protocol.rb +113 -113
  34. data/lib/kreuzberg/post_processor_protocol.rb +86 -86
  35. data/lib/kreuzberg/result.rb +216 -216
  36. data/lib/kreuzberg/setup_lib_path.rb +80 -79
  37. data/lib/kreuzberg/validator_protocol.rb +89 -89
  38. data/lib/kreuzberg/version.rb +5 -5
  39. data/lib/kreuzberg.rb +103 -82
  40. data/sig/kreuzberg/internal.rbs +184 -184
  41. data/sig/kreuzberg.rbs +520 -468
  42. data/spec/binding/cache_spec.rb +227 -227
  43. data/spec/binding/cli_proxy_spec.rb +85 -87
  44. data/spec/binding/cli_spec.rb +55 -54
  45. data/spec/binding/config_spec.rb +345 -345
  46. data/spec/binding/config_validation_spec.rb +283 -283
  47. data/spec/binding/error_handling_spec.rb +213 -213
  48. data/spec/binding/errors_spec.rb +66 -66
  49. data/spec/binding/plugins/ocr_backend_spec.rb +307 -307
  50. data/spec/binding/plugins/postprocessor_spec.rb +269 -269
  51. data/spec/binding/plugins/validator_spec.rb +274 -274
  52. data/spec/fixtures/config.toml +39 -39
  53. data/spec/fixtures/config.yaml +41 -42
  54. data/spec/fixtures/invalid_config.toml +4 -4
  55. data/spec/smoke/package_spec.rb +178 -178
  56. data/spec/spec_helper.rb +42 -42
  57. data/vendor/kreuzberg/Cargo.toml +204 -134
  58. data/vendor/kreuzberg/README.md +175 -175
  59. data/vendor/kreuzberg/benches/otel_overhead.rs +48 -0
  60. data/vendor/kreuzberg/build.rs +474 -460
  61. data/vendor/kreuzberg/src/api/error.rs +81 -81
  62. data/vendor/kreuzberg/src/api/handlers.rs +199 -199
  63. data/vendor/kreuzberg/src/api/mod.rs +79 -79
  64. data/vendor/kreuzberg/src/api/server.rs +353 -353
  65. data/vendor/kreuzberg/src/api/types.rs +170 -170
  66. data/vendor/kreuzberg/src/cache/mod.rs +1167 -1143
  67. data/vendor/kreuzberg/src/chunking/mod.rs +677 -677
  68. data/vendor/kreuzberg/src/core/batch_mode.rs +95 -35
  69. data/vendor/kreuzberg/src/core/config.rs +1032 -1032
  70. data/vendor/kreuzberg/src/core/extractor.rs +1024 -903
  71. data/vendor/kreuzberg/src/core/io.rs +329 -327
  72. data/vendor/kreuzberg/src/core/mime.rs +605 -615
  73. data/vendor/kreuzberg/src/core/mod.rs +45 -42
  74. data/vendor/kreuzberg/src/core/pipeline.rs +984 -906
  75. data/vendor/kreuzberg/src/embeddings.rs +432 -323
  76. data/vendor/kreuzberg/src/error.rs +431 -431
  77. data/vendor/kreuzberg/src/extraction/archive.rs +954 -954
  78. data/vendor/kreuzberg/src/extraction/docx.rs +40 -40
  79. data/vendor/kreuzberg/src/extraction/email.rs +854 -854
  80. data/vendor/kreuzberg/src/extraction/excel.rs +688 -688
  81. data/vendor/kreuzberg/src/extraction/html.rs +553 -553
  82. data/vendor/kreuzberg/src/extraction/image.rs +368 -368
  83. data/vendor/kreuzberg/src/extraction/libreoffice.rs +563 -564
  84. data/vendor/kreuzberg/src/extraction/markdown.rs +213 -0
  85. data/vendor/kreuzberg/src/extraction/mod.rs +81 -77
  86. data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -398
  87. data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -247
  88. data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -240
  89. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -128
  90. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +287 -0
  91. data/vendor/kreuzberg/src/extraction/pptx.rs +3000 -3000
  92. data/vendor/kreuzberg/src/extraction/structured.rs +490 -490
  93. data/vendor/kreuzberg/src/extraction/table.rs +328 -328
  94. data/vendor/kreuzberg/src/extraction/text.rs +269 -269
  95. data/vendor/kreuzberg/src/extraction/xml.rs +333 -333
  96. data/vendor/kreuzberg/src/extractors/archive.rs +446 -425
  97. data/vendor/kreuzberg/src/extractors/bibtex.rs +469 -0
  98. data/vendor/kreuzberg/src/extractors/docbook.rs +502 -0
  99. data/vendor/kreuzberg/src/extractors/docx.rs +367 -479
  100. data/vendor/kreuzberg/src/extractors/email.rs +143 -129
  101. data/vendor/kreuzberg/src/extractors/epub.rs +707 -0
  102. data/vendor/kreuzberg/src/extractors/excel.rs +343 -344
  103. data/vendor/kreuzberg/src/extractors/fictionbook.rs +491 -0
  104. data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +738 -0
  105. data/vendor/kreuzberg/src/extractors/html.rs +393 -410
  106. data/vendor/kreuzberg/src/extractors/image.rs +198 -195
  107. data/vendor/kreuzberg/src/extractors/jats.rs +1051 -0
  108. data/vendor/kreuzberg/src/extractors/jupyter.rs +367 -0
  109. data/vendor/kreuzberg/src/extractors/latex.rs +652 -0
  110. data/vendor/kreuzberg/src/extractors/markdown.rs +700 -0
  111. data/vendor/kreuzberg/src/extractors/mod.rs +365 -268
  112. data/vendor/kreuzberg/src/extractors/odt.rs +628 -0
  113. data/vendor/kreuzberg/src/extractors/opml.rs +634 -0
  114. data/vendor/kreuzberg/src/extractors/orgmode.rs +528 -0
  115. data/vendor/kreuzberg/src/extractors/pdf.rs +493 -496
  116. data/vendor/kreuzberg/src/extractors/pptx.rs +248 -234
  117. data/vendor/kreuzberg/src/extractors/rst.rs +576 -0
  118. data/vendor/kreuzberg/src/extractors/rtf.rs +810 -0
  119. data/vendor/kreuzberg/src/extractors/security.rs +484 -0
  120. data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -0
  121. data/vendor/kreuzberg/src/extractors/structured.rs +140 -126
  122. data/vendor/kreuzberg/src/extractors/text.rs +260 -242
  123. data/vendor/kreuzberg/src/extractors/typst.rs +650 -0
  124. data/vendor/kreuzberg/src/extractors/xml.rs +135 -128
  125. data/vendor/kreuzberg/src/image/dpi.rs +164 -164
  126. data/vendor/kreuzberg/src/image/mod.rs +6 -6
  127. data/vendor/kreuzberg/src/image/preprocessing.rs +417 -417
  128. data/vendor/kreuzberg/src/image/resize.rs +89 -89
  129. data/vendor/kreuzberg/src/keywords/config.rs +154 -154
  130. data/vendor/kreuzberg/src/keywords/mod.rs +237 -237
  131. data/vendor/kreuzberg/src/keywords/processor.rs +267 -267
  132. data/vendor/kreuzberg/src/keywords/rake.rs +293 -294
  133. data/vendor/kreuzberg/src/keywords/types.rs +68 -68
  134. data/vendor/kreuzberg/src/keywords/yake.rs +163 -163
  135. data/vendor/kreuzberg/src/language_detection/mod.rs +942 -942
  136. data/vendor/kreuzberg/src/lib.rs +105 -102
  137. data/vendor/kreuzberg/src/mcp/mod.rs +32 -32
  138. data/vendor/kreuzberg/src/mcp/server.rs +1968 -1966
  139. data/vendor/kreuzberg/src/ocr/cache.rs +469 -469
  140. data/vendor/kreuzberg/src/ocr/error.rs +37 -37
  141. data/vendor/kreuzberg/src/ocr/hocr.rs +216 -216
  142. data/vendor/kreuzberg/src/ocr/mod.rs +58 -58
  143. data/vendor/kreuzberg/src/ocr/processor.rs +863 -847
  144. data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -4
  145. data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -144
  146. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +450 -450
  147. data/vendor/kreuzberg/src/ocr/types.rs +393 -393
  148. data/vendor/kreuzberg/src/ocr/utils.rs +47 -47
  149. data/vendor/kreuzberg/src/ocr/validation.rs +206 -206
  150. data/vendor/kreuzberg/src/panic_context.rs +154 -0
  151. data/vendor/kreuzberg/src/pdf/error.rs +122 -122
  152. data/vendor/kreuzberg/src/pdf/images.rs +139 -139
  153. data/vendor/kreuzberg/src/pdf/metadata.rs +346 -346
  154. data/vendor/kreuzberg/src/pdf/mod.rs +50 -50
  155. data/vendor/kreuzberg/src/pdf/rendering.rs +369 -369
  156. data/vendor/kreuzberg/src/pdf/table.rs +393 -420
  157. data/vendor/kreuzberg/src/pdf/text.rs +158 -161
  158. data/vendor/kreuzberg/src/plugins/extractor.rs +1013 -1010
  159. data/vendor/kreuzberg/src/plugins/mod.rs +209 -209
  160. data/vendor/kreuzberg/src/plugins/ocr.rs +620 -629
  161. data/vendor/kreuzberg/src/plugins/processor.rs +642 -641
  162. data/vendor/kreuzberg/src/plugins/registry.rs +1337 -1324
  163. data/vendor/kreuzberg/src/plugins/traits.rs +258 -258
  164. data/vendor/kreuzberg/src/plugins/validator.rs +956 -955
  165. data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -1470
  166. data/vendor/kreuzberg/src/text/mod.rs +19 -19
  167. data/vendor/kreuzberg/src/text/quality.rs +697 -697
  168. data/vendor/kreuzberg/src/text/string_utils.rs +217 -217
  169. data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -164
  170. data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -100
  171. data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -796
  172. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -902
  173. data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -160
  174. data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -619
  175. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -147
  176. data/vendor/kreuzberg/src/types.rs +903 -873
  177. data/vendor/kreuzberg/src/utils/mod.rs +17 -17
  178. data/vendor/kreuzberg/src/utils/quality.rs +959 -959
  179. data/vendor/kreuzberg/src/utils/string_utils.rs +381 -381
  180. data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -53
  181. data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -482
  182. data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -261
  183. data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -400
  184. data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -1205
  185. data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -280
  186. data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -425
  187. data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -172
  188. data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -622
  189. data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -849
  190. data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -1300
  191. data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -175
  192. data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -734
  193. data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -37
  194. data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -100
  195. data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -801
  196. data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -849
  197. data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -693
  198. data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -111
  199. data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -162
  200. data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -226
  201. data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -41
  202. data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -196
  203. data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -227
  204. data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -181
  205. data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -791
  206. data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -47
  207. data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -760
  208. data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -634
  209. data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -136
  210. data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -84
  211. data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -681
  212. data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -64
  213. data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -51
  214. data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -476
  215. data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -163
  216. data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -1
  217. data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -101
  218. data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -477
  219. data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -490
  220. data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -415
  221. data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -223
  222. data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -331
  223. data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -562
  224. data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -436
  225. data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -561
  226. data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -193
  227. data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -420
  228. data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -448
  229. data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -32
  230. data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -33
  231. data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -420
  232. data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -76
  233. data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -129
  234. data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -54
  235. data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -118
  236. data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -149
  237. data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -506
  238. data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -75
  239. data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -519
  240. data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -647
  241. data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -62
  242. data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -796
  243. data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -31
  244. data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -0
  245. data/vendor/kreuzberg/tests/api_tests.rs +966 -966
  246. data/vendor/kreuzberg/tests/archive_integration.rs +543 -543
  247. data/vendor/kreuzberg/tests/batch_orchestration.rs +556 -542
  248. data/vendor/kreuzberg/tests/batch_processing.rs +316 -304
  249. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -0
  250. data/vendor/kreuzberg/tests/concurrency_stress.rs +525 -509
  251. data/vendor/kreuzberg/tests/config_features.rs +598 -580
  252. data/vendor/kreuzberg/tests/config_loading_tests.rs +415 -439
  253. data/vendor/kreuzberg/tests/core_integration.rs +510 -493
  254. data/vendor/kreuzberg/tests/csv_integration.rs +414 -424
  255. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +498 -0
  256. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -124
  257. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -0
  258. data/vendor/kreuzberg/tests/email_integration.rs +325 -325
  259. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -0
  260. data/vendor/kreuzberg/tests/error_handling.rs +393 -393
  261. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -0
  262. data/vendor/kreuzberg/tests/format_integration.rs +159 -159
  263. data/vendor/kreuzberg/tests/helpers/mod.rs +142 -142
  264. data/vendor/kreuzberg/tests/html_table_test.rs +551 -0
  265. data/vendor/kreuzberg/tests/image_integration.rs +253 -253
  266. data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -0
  267. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -0
  268. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -0
  269. data/vendor/kreuzberg/tests/keywords_integration.rs +479 -479
  270. data/vendor/kreuzberg/tests/keywords_quality.rs +509 -509
  271. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -0
  272. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -0
  273. data/vendor/kreuzberg/tests/mime_detection.rs +428 -428
  274. data/vendor/kreuzberg/tests/ocr_configuration.rs +510 -510
  275. data/vendor/kreuzberg/tests/ocr_errors.rs +676 -676
  276. data/vendor/kreuzberg/tests/ocr_quality.rs +627 -627
  277. data/vendor/kreuzberg/tests/ocr_stress.rs +469 -469
  278. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +695 -0
  279. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -0
  280. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -0
  281. data/vendor/kreuzberg/tests/pdf_integration.rs +43 -43
  282. data/vendor/kreuzberg/tests/pipeline_integration.rs +1411 -1412
  283. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +771 -771
  284. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +560 -561
  285. data/vendor/kreuzberg/tests/plugin_system.rs +921 -921
  286. data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -783
  287. data/vendor/kreuzberg/tests/registry_integration_tests.rs +586 -607
  288. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +692 -0
  289. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +776 -0
  290. data/vendor/kreuzberg/tests/security_validation.rs +415 -404
  291. data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -888
  292. data/vendor/kreuzberg/tests/test_fastembed.rs +609 -609
  293. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1259 -0
  294. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +647 -0
  295. data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -87
  296. data/vendor/rb-sys/.cargo-ok +1 -0
  297. data/vendor/rb-sys/.cargo_vcs_info.json +6 -0
  298. data/vendor/rb-sys/Cargo.lock +393 -0
  299. data/vendor/rb-sys/Cargo.toml +70 -0
  300. data/vendor/rb-sys/Cargo.toml.orig +57 -0
  301. data/vendor/rb-sys/LICENSE-APACHE +190 -0
  302. data/vendor/rb-sys/LICENSE-MIT +21 -0
  303. data/vendor/rb-sys/bin/release.sh +21 -0
  304. data/vendor/rb-sys/build/features.rs +108 -0
  305. data/vendor/rb-sys/build/main.rs +246 -0
  306. data/vendor/rb-sys/build/stable_api_config.rs +153 -0
  307. data/vendor/rb-sys/build/version.rs +48 -0
  308. data/vendor/rb-sys/readme.md +36 -0
  309. data/vendor/rb-sys/src/bindings.rs +21 -0
  310. data/vendor/rb-sys/src/hidden.rs +11 -0
  311. data/vendor/rb-sys/src/lib.rs +34 -0
  312. data/vendor/rb-sys/src/macros.rs +371 -0
  313. data/vendor/rb-sys/src/memory.rs +53 -0
  314. data/vendor/rb-sys/src/ruby_abi_version.rs +38 -0
  315. data/vendor/rb-sys/src/special_consts.rs +31 -0
  316. data/vendor/rb-sys/src/stable_api/compiled.c +179 -0
  317. data/vendor/rb-sys/src/stable_api/compiled.rs +257 -0
  318. data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +316 -0
  319. data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +316 -0
  320. data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +324 -0
  321. data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +317 -0
  322. data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +315 -0
  323. data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +326 -0
  324. data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +327 -0
  325. data/vendor/rb-sys/src/stable_api.rs +261 -0
  326. data/vendor/rb-sys/src/symbol.rs +31 -0
  327. data/vendor/rb-sys/src/tracking_allocator.rs +332 -0
  328. data/vendor/rb-sys/src/utils.rs +89 -0
  329. data/vendor/rb-sys/src/value_type.rs +7 -0
  330. metadata +90 -95
  331. data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
  332. data/spec/examples.txt +0 -104
  333. data/vendor/kreuzberg/src/bin/profile_extract.rs +0 -455
  334. data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +0 -275
  335. data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +0 -178
  336. data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +0 -491
  337. data/vendor/kreuzberg/src/extraction/pandoc/server.rs +0 -496
  338. data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +0 -1188
  339. data/vendor/kreuzberg/src/extraction/pandoc/version.rs +0 -162
  340. data/vendor/kreuzberg/src/extractors/pandoc.rs +0 -201
  341. data/vendor/kreuzberg/tests/chunking_offset_demo.rs +0 -92
  342. data/vendor/kreuzberg/tests/pandoc_integration.rs +0 -503
@@ -1,684 +1,691 @@
1
- # frozen_string_literal: true
2
-
3
- module Kreuzberg
4
- module Config
5
- # OCR configuration
6
- #
7
- # @example
8
- # ocr = OCR.new(backend: "tesseract", language: "eng")
9
- #
10
- class OCR
11
- attr_reader :backend, :language, :tesseract_config
12
-
13
- def initialize(
14
- backend: 'tesseract',
15
- language: 'eng',
16
- tesseract_config: nil
17
- )
18
- @backend = backend.to_s
19
- @language = language.to_s
20
- @tesseract_config = normalize_tesseract_config(tesseract_config)
21
- end
22
-
23
- def to_h
24
- {
25
- backend: @backend,
26
- language: @language,
27
- tesseract_config: @tesseract_config&.to_h
28
- }.compact
29
- end
30
-
31
- private
32
-
33
- def normalize_tesseract_config(value)
34
- return nil if value.nil?
35
- return value if value.is_a?(Tesseract)
36
- return Tesseract.new(**value.transform_keys(&:to_sym)) if value.is_a?(Hash)
37
-
38
- raise ArgumentError, "Expected #{Tesseract}, Hash, or nil, got #{value.class}"
39
- end
40
- end
41
-
42
- class Tesseract
43
- attr_reader :options
44
-
45
- def initialize(**options)
46
- @options = options.transform_keys(&:to_sym)
47
- normalize_nested_preprocessing!
48
- end
49
-
50
- def to_h
51
- @options.dup
52
- end
53
-
54
- private
55
-
56
- def normalize_nested_preprocessing!
57
- preprocessing = @options[:preprocessing]
58
- return if preprocessing.nil?
59
- return if preprocessing.is_a?(ImagePreprocessing)
60
- return @options[:preprocessing] = ImagePreprocessing.new(**preprocessing.transform_keys(&:to_sym)) if
61
- preprocessing.is_a?(Hash)
62
-
63
- raise ArgumentError, "preprocessing must be #{ImagePreprocessing} or Hash"
64
- end
65
- end
66
-
67
- # Chunking configuration
68
- #
69
- # @example
70
- # chunking = Chunking.new(max_chars: 1000, max_overlap: 200)
71
- #
72
- class Chunking
73
- attr_reader :max_chars, :max_overlap, :preset, :embedding, :enabled
74
-
75
- def initialize(
76
- max_chars: nil,
77
- max_overlap: nil,
78
- preset: nil,
79
- embedding: nil,
80
- chunk_size: nil,
81
- chunk_overlap: nil,
82
- enabled: true
83
- )
84
- resolved_size = chunk_size || max_chars || 1000
85
- resolved_overlap = chunk_overlap || max_overlap || 200
86
-
87
- @max_chars = resolved_size.to_i
88
- @max_overlap = resolved_overlap.to_i
89
- @preset = preset&.to_s
90
- @embedding = normalize_embedding(embedding)
91
- @enabled = boolean_or_nil(enabled)
92
- end
93
-
94
- def to_h
95
- config = {
96
- max_chars: @max_chars,
97
- max_overlap: @max_overlap,
98
- preset: @preset,
99
- embedding: @embedding&.to_h
100
- }.compact
101
- # @type var config: Hash[Symbol, untyped]
102
- config[:enabled] = @enabled unless @enabled.nil?
103
- config
104
- end
105
-
106
- private
107
-
108
- def normalize_embedding(value)
109
- return nil if value.nil?
110
- return value if value.is_a?(Embedding)
111
- return Embedding.new(**value.transform_keys(&:to_sym)) if value.is_a?(Hash)
112
-
113
- raise ArgumentError, "Expected #{Embedding}, Hash, or nil, got #{value.class}"
114
- end
115
-
116
- def boolean_or_nil(value)
117
- return nil if value.nil?
118
-
119
- value ? true : false
120
- end
121
- end
122
-
123
- class Embedding
124
- attr_reader :model, :normalize, :batch_size, :show_download_progress, :cache_dir
125
-
126
- def initialize(
127
- model: { type: :preset, name: 'balanced' },
128
- normalize: true,
129
- batch_size: 32,
130
- show_download_progress: false,
131
- cache_dir: nil
132
- )
133
- @model = normalize_model(model)
134
- @normalize = boolean_or_nil(normalize)
135
- @batch_size = batch_size&.to_i
136
- @show_download_progress = boolean_or_nil(show_download_progress)
137
- @cache_dir = cache_dir&.to_s
138
- end
139
-
140
- def to_h
141
- {
142
- model: @model,
143
- normalize: @normalize,
144
- batch_size: @batch_size,
145
- show_download_progress: @show_download_progress,
146
- cache_dir: @cache_dir
147
- }.compact
148
- end
149
-
150
- private
151
-
152
- def normalize_model(model)
153
- normalized = if model.respond_to?(:to_h)
154
- model.to_h
155
- else
156
- model
157
- end
158
- raise ArgumentError, 'model must be a Hash describing the embedding model' unless normalized.is_a?(Hash)
159
-
160
- normalized.transform_keys(&:to_sym)
161
- end
162
-
163
- def boolean_or_nil(value)
164
- return nil if value.nil?
165
-
166
- value ? true : false
167
- end
168
- end
169
-
170
- # Language detection configuration
171
- #
172
- # @example
173
- # lang = LanguageDetection.new(enabled: true, min_confidence: 0.8)
174
- #
175
- class LanguageDetection
176
- attr_reader :enabled, :min_confidence, :detect_multiple
177
-
178
- def initialize(enabled: false, min_confidence: 0.5, detect_multiple: false)
179
- @enabled = enabled ? true : false
180
- @min_confidence = min_confidence.to_f
181
- @detect_multiple = detect_multiple ? true : false
182
- end
183
-
184
- def to_h
185
- {
186
- enabled: @enabled,
187
- min_confidence: @min_confidence,
188
- detect_multiple: @detect_multiple
189
- }
190
- end
191
- end
192
-
193
- # PDF-specific options
194
- #
195
- # @example
196
- # pdf = PDF.new(extract_images: true, passwords: ["secret", "backup"])
197
- #
198
- class PDF
199
- attr_reader :extract_images, :passwords, :extract_metadata
200
-
201
- def initialize(
202
- extract_images: false,
203
- passwords: nil,
204
- extract_metadata: true
205
- )
206
- @extract_images = extract_images ? true : false
207
- @passwords = if passwords.is_a?(Array)
208
- passwords.map(&:to_s)
209
- else
210
- (passwords ? [passwords.to_s] : nil)
211
- end
212
- @extract_metadata = extract_metadata ? true : false
213
- end
214
-
215
- def to_h
216
- {
217
- extract_images: @extract_images,
218
- passwords: @passwords,
219
- extract_metadata: @extract_metadata
220
- }.compact
221
- end
222
- end
223
-
224
- # Image extraction configuration
225
- #
226
- # @example
227
- # image = ImageExtraction.new(extract_images: true, target_dpi: 300)
228
- #
229
- # @example With auto-adjust DPI
230
- # image = ImageExtraction.new(
231
- # extract_images: true,
232
- # auto_adjust_dpi: true,
233
- # min_dpi: 150,
234
- # max_dpi: 600
235
- # )
236
- #
237
- class ImageExtraction
238
- attr_reader :extract_images, :target_dpi, :max_image_dimension,
239
- :auto_adjust_dpi, :min_dpi, :max_dpi
240
-
241
- def initialize(
242
- extract_images: true,
243
- target_dpi: 300,
244
- max_image_dimension: 2000,
245
- auto_adjust_dpi: true,
246
- min_dpi: 150,
247
- max_dpi: 600
248
- )
249
- @extract_images = extract_images ? true : false
250
- @target_dpi = target_dpi.to_i
251
- @max_image_dimension = max_image_dimension.to_i
252
- @auto_adjust_dpi = auto_adjust_dpi ? true : false
253
- @min_dpi = min_dpi.to_i
254
- @max_dpi = max_dpi.to_i
255
- end
256
-
257
- def to_h
258
- {
259
- extract_images: @extract_images,
260
- target_dpi: @target_dpi,
261
- max_image_dimension: @max_image_dimension,
262
- auto_adjust_dpi: @auto_adjust_dpi,
263
- min_dpi: @min_dpi,
264
- max_dpi: @max_dpi
265
- }
266
- end
267
- end
268
-
269
- # Image preprocessing configuration for OCR
270
- #
271
- # @example Basic preprocessing
272
- # preprocessing = ImagePreprocessing.new(
273
- # binarization_method: "otsu",
274
- # denoise: true
275
- # )
276
- #
277
- # @example Advanced preprocessing
278
- # preprocessing = ImagePreprocessing.new(
279
- # target_dpi: 600,
280
- # auto_rotate: true,
281
- # deskew: true,
282
- # denoise: true,
283
- # contrast_enhance: true,
284
- # binarization_method: "sauvola",
285
- # invert_colors: false
286
- # )
287
- #
288
- class ImagePreprocessing
289
- attr_reader :target_dpi, :auto_rotate, :deskew, :denoise,
290
- :contrast_enhance, :binarization_method, :invert_colors
291
-
292
- def initialize(
293
- target_dpi: 300,
294
- auto_rotate: true,
295
- deskew: true,
296
- denoise: false,
297
- contrast_enhance: true,
298
- binarization_method: 'otsu',
299
- invert_colors: false
300
- )
301
- @target_dpi = target_dpi.to_i
302
- @auto_rotate = auto_rotate ? true : false
303
- @deskew = deskew ? true : false
304
- @denoise = denoise ? true : false
305
- @contrast_enhance = contrast_enhance ? true : false
306
- @binarization_method = binarization_method.to_s
307
- @invert_colors = invert_colors ? true : false
308
-
309
- valid_methods = %w[otsu sauvola adaptive]
310
- return if valid_methods.include?(@binarization_method)
311
-
312
- raise ArgumentError, "binarization_method must be one of: #{valid_methods.join(', ')}"
313
- end
314
-
315
- def to_h
316
- {
317
- target_dpi: @target_dpi,
318
- auto_rotate: @auto_rotate,
319
- deskew: @deskew,
320
- denoise: @denoise,
321
- contrast_enhance: @contrast_enhance,
322
- binarization_method: @binarization_method,
323
- invert_colors: @invert_colors
324
- }
325
- end
326
- end
327
-
328
- # Token reduction configuration
329
- #
330
- # @example Disable token reduction
331
- # token = TokenReduction.new(mode: "off")
332
- #
333
- # @example Light reduction
334
- # token = TokenReduction.new(mode: "light", preserve_important_words: true)
335
- #
336
- # @example Aggressive reduction
337
- # token = TokenReduction.new(mode: "aggressive", preserve_important_words: false)
338
- #
339
- class TokenReduction
340
- attr_reader :mode, :preserve_important_words
341
-
342
- def initialize(mode: 'off', preserve_important_words: true)
343
- @mode = mode.to_s
344
- @preserve_important_words = preserve_important_words ? true : false
345
-
346
- valid_modes = %w[off light moderate aggressive maximum]
347
- return if valid_modes.include?(@mode)
348
-
349
- raise ArgumentError, "mode must be one of: #{valid_modes.join(', ')}"
350
- end
351
-
352
- def to_h
353
- {
354
- mode: @mode,
355
- preserve_important_words: @preserve_important_words
356
- }
357
- end
358
- end
359
-
360
- class HtmlPreprocessing
361
- attr_reader :enabled, :preset, :remove_navigation, :remove_forms
362
-
363
- def initialize(enabled: nil, preset: nil, remove_navigation: nil, remove_forms: nil)
364
- @enabled = boolean_or_nil(enabled)
365
- @preset = preset&.to_sym
366
- @remove_navigation = boolean_or_nil(remove_navigation)
367
- @remove_forms = boolean_or_nil(remove_forms)
368
- end
369
-
370
- def to_h
371
- {
372
- enabled: @enabled,
373
- preset: @preset,
374
- remove_navigation: @remove_navigation,
375
- remove_forms: @remove_forms
376
- }.compact
377
- end
378
-
379
- private
380
-
381
- def boolean_or_nil(value)
382
- return nil if value.nil?
383
-
384
- value ? true : false
385
- end
386
- end
387
-
388
- class HtmlOptions
389
- attr_reader :options
390
-
391
- def initialize(**options)
392
- normalized = options.transform_keys(&:to_sym)
393
- symbol_keys = %i[
394
- heading_style
395
- code_block_style
396
- highlight_style
397
- list_indent_type
398
- newline_style
399
- whitespace_mode
400
- ]
401
- symbol_keys.each do |key|
402
- normalized[key] = normalized[key]&.to_sym if normalized.key?(key)
403
- end
404
- if normalized[:preprocessing].is_a?(Hash)
405
- normalized[:preprocessing] = HtmlPreprocessing.new(**normalized[:preprocessing])
406
- end
407
- @options = normalized
408
- end
409
-
410
- def to_h
411
- @options.transform_values { |value| value.respond_to?(:to_h) ? value.to_h : value }
412
- end
413
- end
414
-
415
- class KeywordYakeParams
416
- attr_reader :window_size
417
-
418
- def initialize(window_size: 2)
419
- @window_size = window_size.to_i
420
- end
421
-
422
- def to_h
423
- { window_size: @window_size }
424
- end
425
- end
426
-
427
- class KeywordRakeParams
428
- attr_reader :min_word_length, :max_words_per_phrase
429
-
430
- def initialize(min_word_length: 1, max_words_per_phrase: 3)
431
- @min_word_length = min_word_length.to_i
432
- @max_words_per_phrase = max_words_per_phrase.to_i
433
- end
434
-
435
- def to_h
436
- {
437
- min_word_length: @min_word_length,
438
- max_words_per_phrase: @max_words_per_phrase
439
- }
440
- end
441
- end
442
-
443
- class Keywords
444
- attr_reader :algorithm, :max_keywords, :min_score, :ngram_range,
445
- :language, :yake_params, :rake_params
446
-
447
- def initialize(
448
- algorithm: nil,
449
- max_keywords: nil,
450
- min_score: nil,
451
- ngram_range: nil,
452
- language: nil,
453
- yake_params: nil,
454
- rake_params: nil
455
- )
456
- @algorithm = algorithm&.to_s
457
- @max_keywords = max_keywords&.to_i
458
- @min_score = min_score&.to_f
459
- @ngram_range = ngram_range&.map(&:to_i)
460
- @language = language&.to_s
461
- @yake_params = normalize_nested(yake_params, KeywordYakeParams)
462
- @rake_params = normalize_nested(rake_params, KeywordRakeParams)
463
- end
464
-
465
- def to_h
466
- {
467
- algorithm: @algorithm,
468
- max_keywords: @max_keywords,
469
- min_score: @min_score,
470
- ngram_range: @ngram_range,
471
- language: @language,
472
- yake_params: @yake_params&.to_h,
473
- rake_params: @rake_params&.to_h
474
- }.compact
475
- end
476
-
477
- private
478
-
479
- def normalize_nested(value, klass)
480
- return nil if value.nil?
481
- return value if value.is_a?(klass)
482
- return klass.new(**value.transform_keys(&:to_sym)) if value.is_a?(Hash)
483
-
484
- raise ArgumentError, "Expected #{klass}, Hash, or nil, got #{value.class}"
485
- end
486
- end
487
-
488
- # Post-processor configuration
489
- #
490
- # @example Enable all post-processors
491
- # postprocessor = PostProcessor.new(enabled: true)
492
- #
493
- # @example Enable specific processors
494
- # postprocessor = PostProcessor.new(
495
- # enabled: true,
496
- # enabled_processors: ["quality", "formatting"]
497
- # )
498
- #
499
- # @example Disable specific processors
500
- # postprocessor = PostProcessor.new(
501
- # enabled: true,
502
- # disabled_processors: ["token_reduction"]
503
- # )
504
- #
505
- class PostProcessor
506
- attr_reader :enabled, :enabled_processors, :disabled_processors
507
-
508
- def initialize(
509
- enabled: true,
510
- enabled_processors: nil,
511
- disabled_processors: nil
512
- )
513
- @enabled = enabled ? true : false
514
- @enabled_processors = enabled_processors&.map(&:to_s)
515
- @disabled_processors = disabled_processors&.map(&:to_s)
516
- end
517
-
518
- def to_h
519
- {
520
- enabled: @enabled,
521
- enabled_processors: @enabled_processors,
522
- disabled_processors: @disabled_processors
523
- }.compact
524
- end
525
- end
526
-
527
- # Main extraction configuration
528
- #
529
- # @example Basic usage
530
- # config = Extraction.new(use_cache: true, force_ocr: true)
531
- #
532
- # @example With OCR
533
- # ocr = Config::OCR.new(backend: "tesseract", language: "eng")
534
- # config = Extraction.new(ocr: ocr)
535
- #
536
- # @example With image extraction
537
- # image = Config::ImageExtraction.new(extract_images: true, target_dpi: 600)
538
- # config = Extraction.new(image_extraction: image)
539
- #
540
- # @example With preprocessing
541
- # preprocessing = Config::ImagePreprocessing.new(
542
- # binarization_method: "sauvola",
543
- # denoise: true
544
- # )
545
- # config = Extraction.new(image_preprocessing: preprocessing)
546
- #
547
- # @example With post-processing
548
- # postprocessor = Config::PostProcessor.new(
549
- # enabled: true,
550
- # enabled_processors: ["quality"]
551
- # )
552
- # config = Extraction.new(postprocessor: postprocessor)
553
- #
554
- # @example With all options
555
- # config = Extraction.new(
556
- # use_cache: true,
557
- # enable_quality_processing: true,
558
- # force_ocr: false,
559
- # ocr: Config::OCR.new(language: "deu"),
560
- # chunking: Config::Chunking.new(max_chars: 500),
561
- # language_detection: Config::LanguageDetection.new(enabled: true),
562
- # pdf_options: Config::PDF.new(extract_images: true, passwords: ["secret"]),
563
- # image_extraction: Config::ImageExtraction.new(target_dpi: 600),
564
- # image_preprocessing: Config::ImagePreprocessing.new(denoise: true),
565
- # postprocessor: Config::PostProcessor.new(enabled: true)
566
- # )
567
- #
568
- class Extraction
569
- attr_reader :use_cache, :enable_quality_processing, :force_ocr,
570
- :ocr, :chunking, :language_detection, :pdf_options,
571
- :image_extraction, :image_preprocessing, :postprocessor,
572
- :token_reduction, :keywords, :html_options,
573
- :max_concurrent_extractions
574
-
575
- # Load configuration from a file.
576
- #
577
- # Detects the file format from the extension (.toml, .yaml, .json)
578
- # and loads the configuration accordingly.
579
- #
580
- # @param path [String] Path to the configuration file
581
- # @return [Kreuzberg::Config::Extraction] Loaded configuration object
582
- #
583
- # @example Load from TOML
584
- # config = Kreuzberg::Config::Extraction.from_file("config.toml")
585
- #
586
- # @example Load from YAML
587
- # config = Kreuzberg::Config::Extraction.from_file("config.yaml")
588
- #
589
- def self.from_file(path)
590
- hash = Kreuzberg._config_from_file_native(path)
591
- # Convert string keys to symbols for keyword arguments
592
- new(**hash.transform_keys(&:to_sym))
593
- end
594
-
595
- # Discover configuration file in current or parent directories.
596
- #
597
- # Searches for kreuzberg.toml, kreuzberg.yaml, or kreuzberg.json in the current
598
- # directory and parent directories.
599
- #
600
- # @return [Kreuzberg::Config::Extraction, nil] Loaded configuration object or nil if not found
601
- #
602
- # @example
603
- # config = Kreuzberg::Config::Extraction.discover
604
- # if config
605
- # # Use discovered config
606
- # end
607
- #
608
- def self.discover
609
- hash = Kreuzberg._config_discover_native
610
- return nil if hash.nil?
611
-
612
- # Convert string keys to symbols for keyword arguments
613
- new(**hash.transform_keys(&:to_sym))
614
- end
615
-
616
- def initialize(
617
- use_cache: true,
618
- enable_quality_processing: false,
619
- force_ocr: false,
620
- ocr: nil,
621
- chunking: nil,
622
- language_detection: nil,
623
- pdf_options: nil,
624
- image_extraction: nil,
625
- image_preprocessing: nil,
626
- postprocessor: nil,
627
- token_reduction: nil,
628
- keywords: nil,
629
- html_options: nil,
630
- max_concurrent_extractions: nil
631
- )
632
- @use_cache = use_cache ? true : false
633
- @enable_quality_processing = enable_quality_processing ? true : false
634
- @force_ocr = force_ocr ? true : false
635
- @ocr = normalize_config(ocr, OCR)
636
- @chunking = normalize_config(chunking, Chunking)
637
- @language_detection = normalize_config(language_detection, LanguageDetection)
638
- @pdf_options = normalize_config(pdf_options, PDF)
639
- @image_extraction = normalize_config(image_extraction, ImageExtraction)
640
- @image_preprocessing = normalize_config(image_preprocessing, ImagePreprocessing)
641
- @postprocessor = normalize_config(postprocessor, PostProcessor)
642
- @token_reduction = normalize_config(token_reduction, TokenReduction)
643
- @keywords = normalize_config(keywords, Keywords)
644
- @html_options = normalize_config(html_options, HtmlOptions)
645
- @max_concurrent_extractions = max_concurrent_extractions&.to_i
646
- end
647
-
648
- # rubocop:disable Metrics/PerceivedComplexity
649
- def to_h
650
- {
651
- use_cache: @use_cache,
652
- enable_quality_processing: @enable_quality_processing,
653
- force_ocr: @force_ocr,
654
- ocr: @ocr&.to_h,
655
- chunking: @chunking&.to_h,
656
- language_detection: @language_detection&.to_h,
657
- pdf_options: @pdf_options&.to_h,
658
- image_extraction: @image_extraction&.to_h,
659
- image_preprocessing: @image_preprocessing&.to_h,
660
- postprocessor: @postprocessor&.to_h,
661
- token_reduction: @token_reduction&.to_h,
662
- keywords: @keywords&.to_h,
663
- html_options: @html_options&.to_h,
664
- max_concurrent_extractions: @max_concurrent_extractions
665
- }.compact
666
- end
667
- # rubocop:enable Metrics/PerceivedComplexity
668
-
669
- private
670
-
671
- def normalize_config(value, klass)
672
- return nil if value.nil?
673
- return value if value.is_a?(klass)
674
- # Convert string keys to symbols for keyword arguments
675
- return klass.new(**value.transform_keys(&:to_sym)) if value.is_a?(Hash)
676
-
677
- raise ArgumentError, "Expected #{klass}, Hash, or nil, got #{value.class}"
678
- end
679
- end
680
-
681
- # Backwards compatibility aliases
682
- Ocr = OCR
683
- end
684
- end
1
+ # frozen_string_literal: true
2
+
3
+ module Kreuzberg
4
+ module Config
5
# OCR configuration: backend name, language code and optional
# Tesseract-specific settings.
#
# @example
#   ocr = OCR.new(backend: "tesseract", language: "eng")
#
class OCR
  attr_reader :backend, :language, :tesseract_config

  # @param backend [#to_s] backend name, stored as a String
  # @param language [#to_s] language code, stored as a String
  # @param tesseract_config [Tesseract, Hash, nil] a Hash is wrapped in a
  #   Tesseract instance after its keys are converted to Symbols
  # @raise [ArgumentError] when tesseract_config is of any other type
  def initialize(backend: 'tesseract', language: 'eng', tesseract_config: nil)
    @backend = backend.to_s
    @language = language.to_s
    @tesseract_config = normalize_tesseract_config(tesseract_config)
  end

  # @return [Hash] Symbol-keyed settings; :tesseract_config is left out when unset
  def to_h
    serialized = { backend: @backend, language: @language }
    tesseract = @tesseract_config&.to_h
    serialized[:tesseract_config] = tesseract unless tesseract.nil?
    serialized
  end

  private

  # Coerces the tesseract_config argument into a Tesseract instance (or nil).
  def normalize_tesseract_config(value)
    case value
    when nil then nil
    when Tesseract then value
    when Hash then Tesseract.new(**value.transform_keys(&:to_sym))
    else raise ArgumentError, "Expected #{Tesseract}, Hash, or nil, got #{value.class}"
    end
  end
end
41
+
42
# Tesseract OCR engine configuration.
#
# Arbitrary engine options are accepted as keyword arguments and stored with
# Symbol keys. The :preprocessing option may be an ImagePreprocessing or a
# Hash; a Hash is converted to an ImagePreprocessing on construction.
class Tesseract
  attr_reader :options

  # @param options [Hash] engine options; String keys are converted to Symbols
  # @raise [ArgumentError] when :preprocessing is not nil, a Hash or an
  #   ImagePreprocessing
  def initialize(**options)
    @options = options.transform_keys(&:to_sym)
    normalize_nested_preprocessing!
  end

  # Returns a copy of the options with the nested preprocessing object
  # serialized through its own #to_h, so the result holds plain values
  # instead of a config instance (matching how HtmlOptions#to_h serializes
  # its nested config).
  #
  # @return [Hash]
  def to_h
    serialized = @options.dup
    preprocessing = serialized[:preprocessing]
    # After #initialize this is either nil or an ImagePreprocessing.
    serialized[:preprocessing] = preprocessing.to_h unless preprocessing.nil?
    serialized
  end

  private

  # Replaces a Hash under :preprocessing with an ImagePreprocessing instance.
  def normalize_nested_preprocessing!
    preprocessing = @options[:preprocessing]
    return if preprocessing.nil? || preprocessing.is_a?(ImagePreprocessing)

    unless preprocessing.is_a?(Hash)
      raise ArgumentError, "preprocessing must be #{ImagePreprocessing} or Hash"
    end

    @options[:preprocessing] = ImagePreprocessing.new(**preprocessing.transform_keys(&:to_sym))
  end
end
67
+
68
# Chunking configuration.
#
# chunk_size / chunk_overlap are accepted as alternative names for
# max_chars / max_overlap and win when both are given.
#
# @example
#   chunking = Chunking.new(max_chars: 1000, max_overlap: 200)
#
class Chunking
  attr_reader :max_chars, :max_overlap, :preset, :embedding, :enabled

  # @param max_chars [#to_i, nil] used when chunk_size is not given (fallback 1000)
  # @param max_overlap [#to_i, nil] used when chunk_overlap is not given (fallback 200)
  # @param preset [#to_s, nil]
  # @param embedding [Embedding, Hash, nil]
  # @param chunk_size [#to_i, nil]
  # @param chunk_overlap [#to_i, nil]
  # @param enabled [Object, nil] coerced to true/false; nil is preserved
  # @raise [ArgumentError] when embedding is of an unsupported type
  def initialize(max_chars: nil, max_overlap: nil, preset: nil, embedding: nil,
                 chunk_size: nil, chunk_overlap: nil, enabled: true)
    @max_chars = (chunk_size || max_chars || 1000).to_i
    @max_overlap = (chunk_overlap || max_overlap || 200).to_i
    @preset = preset&.to_s
    @embedding = normalize_embedding(embedding)
    @enabled = boolean_or_nil(enabled)
  end

  # @return [Hash] Symbol-keyed settings with nil entries removed
  def to_h
    serialized = {
      max_chars: @max_chars,
      max_overlap: @max_overlap,
      preset: @preset,
      embedding: @embedding&.to_h
    }.reject { |_key, value| value.nil? }
    @enabled.nil? ? serialized : serialized.merge(enabled: @enabled)
  end

  private

  # Coerces the embedding argument into an Embedding instance (or nil).
  def normalize_embedding(value)
    case value
    when nil then nil
    when Embedding then value
    when Hash then Embedding.new(**value.transform_keys(&:to_sym))
    else raise ArgumentError, "Expected #{Embedding}, Hash, or nil, got #{value.class}"
    end
  end

  # nil stays nil; anything else collapses to a strict boolean.
  def boolean_or_nil(value)
    return if value.nil?

    value ? true : false
  end
end
123
+
124
# Embedding model configuration for document chunking.
class Embedding
  attr_reader :model, :normalize, :batch_size, :show_download_progress, :cache_dir

  # @param model [Hash, #to_h] model description; keys are converted to Symbols
  # @param normalize [Object, nil] coerced to true/false; nil is preserved
  # @param batch_size [#to_i, nil]
  # @param show_download_progress [Object, nil] coerced to true/false; nil is preserved
  # @param cache_dir [#to_s, nil]
  # @raise [ArgumentError] when model cannot be turned into a Hash
  def initialize(model: { type: :preset, name: 'balanced' }, normalize: true, batch_size: 32,
                 show_download_progress: false, cache_dir: nil)
    @model = normalize_model(model)
    @normalize = boolean_or_nil(normalize)
    @batch_size = batch_size&.to_i
    @show_download_progress = boolean_or_nil(show_download_progress)
    @cache_dir = cache_dir&.to_s
  end

  # @return [Hash] Symbol-keyed settings with nil entries removed
  def to_h
    serialized = {}
    serialized[:model] = @model
    serialized[:normalize] = @normalize
    serialized[:batch_size] = @batch_size
    serialized[:show_download_progress] = @show_download_progress
    serialized[:cache_dir] = @cache_dir
    serialized.compact
  end

  private

  # Converts the model argument to a Symbol-keyed Hash.
  def normalize_model(model)
    candidate = model.respond_to?(:to_h) ? model.to_h : model
    unless candidate.is_a?(Hash)
      raise ArgumentError, 'model must be a Hash describing the embedding model'
    end

    candidate.transform_keys(&:to_sym)
  end

  # nil stays nil; anything else collapses to a strict boolean.
  def boolean_or_nil(value)
    return if value.nil?

    value ? true : false
  end
end
171
+
172
# Language detection configuration.
#
# @example
#   lang = LanguageDetection.new(enabled: true, min_confidence: 0.8)
#
class LanguageDetection
  attr_reader :enabled, :min_confidence, :detect_multiple

  # @param enabled [Object] coerced to true/false
  # @param min_confidence [#to_f] stored as a Float
  # @param detect_multiple [Object] coerced to true/false
  def initialize(enabled: false, min_confidence: 0.5, detect_multiple: false)
    @enabled = to_flag(enabled)
    @min_confidence = min_confidence.to_f
    @detect_multiple = to_flag(detect_multiple)
  end

  # @return [Hash] all three settings, keyed by Symbol
  def to_h
    %i[enabled min_confidence detect_multiple].to_h do |name|
      [name, instance_variable_get("@#{name}")]
    end
  end

  private

  # Collapses any value to a strict boolean.
  def to_flag(value)
    value ? true : false
  end
end
194
+
195
# PDF-specific options.
#
# @example
#   pdf = PDF.new(extract_images: true, passwords: ["secret", "backup"])
#
class PDF
  attr_reader :extract_images, :passwords, :extract_metadata

  # @param extract_images [Object] coerced to true/false
  # @param passwords [Array, Object, nil] an Array has each element stringified;
  #   any other truthy value becomes a one-element Array; nil/false become nil
  # @param extract_metadata [Object] coerced to true/false
  def initialize(extract_images: false, passwords: nil, extract_metadata: true)
    @extract_images = extract_images ? true : false
    @passwords =
      case passwords
      when Array then passwords.map(&:to_s)
      when nil, false then nil
      else [passwords.to_s]
      end
    @extract_metadata = extract_metadata ? true : false
  end

  # @return [Hash] Symbol-keyed settings; :passwords is omitted when nil
  def to_h
    serialized = { extract_images: @extract_images }
    serialized[:passwords] = @passwords unless @passwords.nil?
    serialized[:extract_metadata] = @extract_metadata
    serialized
  end
end
225
+
226
# Image extraction configuration.
#
# @example
#   image = ImageExtraction.new(extract_images: true, target_dpi: 300)
#
# @example With auto-adjust DPI
#   image = ImageExtraction.new(
#     extract_images: true,
#     auto_adjust_dpi: true,
#     min_dpi: 150,
#     max_dpi: 600
#   )
#
class ImageExtraction
  attr_reader :extract_images, :target_dpi, :max_image_dimension,
              :auto_adjust_dpi, :min_dpi, :max_dpi

  # Boolean options are coerced to true/false; numeric options go through #to_i.
  #
  # @param extract_images [Object]
  # @param target_dpi [#to_i]
  # @param max_image_dimension [#to_i]
  # @param auto_adjust_dpi [Object]
  # @param min_dpi [#to_i]
  # @param max_dpi [#to_i]
  def initialize(extract_images: true, target_dpi: 300, max_image_dimension: 2000,
                 auto_adjust_dpi: true, min_dpi: 150, max_dpi: 600)
    @extract_images = to_flag(extract_images)
    @target_dpi = target_dpi.to_i
    @max_image_dimension = max_image_dimension.to_i
    @auto_adjust_dpi = to_flag(auto_adjust_dpi)
    @min_dpi = min_dpi.to_i
    @max_dpi = max_dpi.to_i
  end

  # @return [Hash] all six settings, keyed by Symbol
  def to_h
    serialized = {}
    serialized[:extract_images] = @extract_images
    serialized[:target_dpi] = @target_dpi
    serialized[:max_image_dimension] = @max_image_dimension
    serialized[:auto_adjust_dpi] = @auto_adjust_dpi
    serialized[:min_dpi] = @min_dpi
    serialized[:max_dpi] = @max_dpi
    serialized
  end

  private

  # Collapses any value to a strict boolean.
  def to_flag(value)
    value ? true : false
  end
end
270
+
271
# Image preprocessing configuration for OCR.
#
# @example Basic preprocessing
#   preprocessing = ImagePreprocessing.new(
#     binarization_method: "otsu",
#     denoise: true
#   )
#
# @example Advanced preprocessing
#   preprocessing = ImagePreprocessing.new(
#     target_dpi: 600,
#     auto_rotate: true,
#     deskew: true,
#     denoise: true,
#     contrast_enhance: true,
#     binarization_method: "sauvola",
#     invert_colors: false
#   )
#
class ImagePreprocessing
  attr_reader :target_dpi, :auto_rotate, :deskew, :denoise,
              :contrast_enhance, :binarization_method, :invert_colors

  # Boolean options are coerced to true/false, target_dpi goes through #to_i
  # and binarization_method through #to_s.
  #
  # @raise [ArgumentError] when binarization_method is not otsu, sauvola or adaptive
  def initialize(target_dpi: 300, auto_rotate: true, deskew: true, denoise: false,
                 contrast_enhance: true, binarization_method: 'otsu', invert_colors: false)
    @target_dpi = target_dpi.to_i
    @auto_rotate = auto_rotate ? true : false
    @deskew = deskew ? true : false
    @denoise = denoise ? true : false
    @contrast_enhance = contrast_enhance ? true : false
    @binarization_method = binarization_method.to_s
    @invert_colors = invert_colors ? true : false

    validate_binarization_method!
  end

  # @return [Hash] all seven settings, keyed by Symbol
  def to_h
    serialized = { target_dpi: @target_dpi, auto_rotate: @auto_rotate, deskew: @deskew }
    serialized[:denoise] = @denoise
    serialized[:contrast_enhance] = @contrast_enhance
    serialized[:binarization_method] = @binarization_method
    serialized[:invert_colors] = @invert_colors
    serialized
  end

  private

  # Rejects binarization methods outside the supported list.
  def validate_binarization_method!
    valid_methods = %w[otsu sauvola adaptive]
    unless valid_methods.include?(@binarization_method)
      raise ArgumentError, "binarization_method must be one of: #{valid_methods.join(', ')}"
    end
  end
end
329
+
330
# Token reduction configuration.
#
# @example Disable token reduction
#   token = TokenReduction.new(mode: "off")
#
# @example Light reduction
#   token = TokenReduction.new(mode: "light", preserve_important_words: true)
#
# @example Aggressive reduction
#   token = TokenReduction.new(mode: "aggressive", preserve_important_words: false)
#
class TokenReduction
  attr_reader :mode, :preserve_important_words

  # @param mode [#to_s] one of off, light, moderate, aggressive, maximum
  # @param preserve_important_words [Object] coerced to true/false
  # @raise [ArgumentError] when mode is not one of the listed values
  def initialize(mode: 'off', preserve_important_words: true)
    @mode = mode.to_s
    @preserve_important_words = preserve_important_words ? true : false

    valid_modes = %w[off light moderate aggressive maximum]
    unless valid_modes.include?(@mode)
      raise ArgumentError, "mode must be one of: #{valid_modes.join(', ')}"
    end
  end

  # @return [Hash] both settings, keyed by Symbol
  def to_h
    serialized = {}
    serialized[:mode] = @mode
    serialized[:preserve_important_words] = @preserve_important_words
    serialized
  end
end
361
+
362
# HTML preprocessing configuration for content extraction.
#
# Every option is optional; options left as nil are omitted from #to_h.
class HtmlPreprocessing
  attr_reader :enabled, :preset, :remove_navigation, :remove_forms

  # @param enabled [Object, nil] coerced to true/false; nil is preserved
  # @param preset [#to_sym, nil] stored as a Symbol
  # @param remove_navigation [Object, nil] coerced to true/false; nil is preserved
  # @param remove_forms [Object, nil] coerced to true/false; nil is preserved
  def initialize(enabled: nil, preset: nil, remove_navigation: nil, remove_forms: nil)
    @enabled = boolean_or_nil(enabled)
    @preset = preset&.to_sym
    @remove_navigation = boolean_or_nil(remove_navigation)
    @remove_forms = boolean_or_nil(remove_forms)
  end

  # @return [Hash] Symbol-keyed settings with nil entries removed
  def to_h
    pairs = [
      [:enabled, @enabled],
      [:preset, @preset],
      [:remove_navigation, @remove_navigation],
      [:remove_forms, @remove_forms]
    ]
    pairs.reject { |_name, value| value.nil? }.to_h
  end

  private

  # nil stays nil; anything else collapses to a strict boolean.
  def boolean_or_nil(value)
    return if value.nil?

    value ? true : false
  end
end
390
+
391
# HTML rendering options for document conversion.
#
# Options are free-form keyword arguments stored with Symbol keys. The values
# of a fixed set of style options are converted to Symbols, and a Hash given
# under :preprocessing is wrapped in an HtmlPreprocessing instance.
class HtmlOptions
  attr_reader :options

  # @param options [Hash] rendering options; String keys are converted to Symbols
  def initialize(**options)
    normalized = options.transform_keys(&:to_sym)
    symbol_keys = %i[
      heading_style
      code_block_style
      highlight_style
      list_indent_type
      newline_style
      whitespace_mode
    ]
    symbol_keys.each do |key|
      normalized[key] = normalized[key]&.to_sym if normalized.key?(key)
    end
    preprocessing = normalized[:preprocessing]
    if preprocessing.is_a?(Hash)
      # Symbolize the nested keys too: a String-keyed Hash cannot be splatted
      # into HtmlPreprocessing's keyword arguments.
      normalized[:preprocessing] = HtmlPreprocessing.new(**preprocessing.transform_keys(&:to_sym))
    end
    @options = normalized
  end

  # Returns the options with nested config objects serialized through their
  # own #to_h. nil, Array and Hash values are passed through untouched.
  #
  # @return [Hash]
  def to_h
    @options.transform_values { |value| serialize_value(value) }
  end

  private

  # Serializes a single option value. nil and Array both respond to #to_h,
  # but nil.to_h yields {} and Array#to_h raises unless the array holds
  # pairs, so those types must not be converted.
  def serialize_value(value)
    case value
    when nil, Array, Hash then value
    else value.respond_to?(:to_h) ? value.to_h : value
    end
  end
end
418
+
419
# YAKE keyword extraction parameters.
class KeywordYakeParams
  attr_reader :window_size

  # @param window_size [#to_i] stored as an Integer
  def initialize(window_size: 2)
    @window_size = window_size.to_i
  end

  # @return [Hash] the single :window_size setting
  def to_h
    serialized = {}
    serialized[:window_size] = @window_size
    serialized
  end
end
431
+
432
# RAKE keyword extraction parameters.
class KeywordRakeParams
  attr_reader :min_word_length, :max_words_per_phrase

  # @param min_word_length [#to_i] stored as an Integer
  # @param max_words_per_phrase [#to_i] stored as an Integer
  def initialize(min_word_length: 1, max_words_per_phrase: 3)
    @min_word_length = min_word_length.to_i
    @max_words_per_phrase = max_words_per_phrase.to_i
  end

  # @return [Hash] both settings, keyed by Symbol
  def to_h
    serialized = {}
    serialized[:min_word_length] = @min_word_length
    serialized[:max_words_per_phrase] = @max_words_per_phrase
    serialized
  end
end
448
+
449
# Keyword extraction configuration for document analysis.
#
# Every option is optional; options left as nil are omitted from #to_h.
class Keywords
  attr_reader :algorithm, :max_keywords, :min_score, :ngram_range,
              :language, :yake_params, :rake_params

  # @param algorithm [#to_s, nil]
  # @param max_keywords [#to_i, nil]
  # @param min_score [#to_f, nil]
  # @param ngram_range [#map, nil] each element is converted with #to_i
  # @param language [#to_s, nil]
  # @param yake_params [KeywordYakeParams, Hash, nil]
  # @param rake_params [KeywordRakeParams, Hash, nil]
  # @raise [ArgumentError] when yake_params or rake_params has an unsupported type
  def initialize(algorithm: nil, max_keywords: nil, min_score: nil, ngram_range: nil,
                 language: nil, yake_params: nil, rake_params: nil)
    @algorithm = algorithm&.to_s
    @max_keywords = max_keywords&.to_i
    @min_score = min_score&.to_f
    @ngram_range = ngram_range&.map(&:to_i)
    @language = language&.to_s
    @yake_params = normalize_nested(yake_params, KeywordYakeParams)
    @rake_params = normalize_nested(rake_params, KeywordRakeParams)
  end

  # @return [Hash] Symbol-keyed settings with nil entries removed
  def to_h
    serialized = {
      algorithm: @algorithm,
      max_keywords: @max_keywords,
      min_score: @min_score,
      ngram_range: @ngram_range,
      language: @language
    }
    serialized[:yake_params] = @yake_params&.to_h
    serialized[:rake_params] = @rake_params&.to_h
    serialized.compact
  end

  private

  # Coerces a nested parameter argument into an instance of klass (or nil).
  def normalize_nested(value, klass)
    case value
    when nil then nil
    when klass then value
    when Hash then klass.new(**value.transform_keys(&:to_sym))
    else raise ArgumentError, "Expected #{klass}, Hash, or nil, got #{value.class}"
    end
  end
end
494
+
495
# Post-processor configuration.
#
# @example Enable all post-processors
#   postprocessor = PostProcessor.new(enabled: true)
#
# @example Enable specific processors
#   postprocessor = PostProcessor.new(
#     enabled: true,
#     enabled_processors: ["quality", "formatting"]
#   )
#
# @example Disable specific processors
#   postprocessor = PostProcessor.new(
#     enabled: true,
#     disabled_processors: ["token_reduction"]
#   )
#
class PostProcessor
  attr_reader :enabled, :enabled_processors, :disabled_processors

  # @param enabled [Object] coerced to true/false
  # @param enabled_processors [#map, nil] each element is converted with #to_s
  # @param disabled_processors [#map, nil] each element is converted with #to_s
  def initialize(enabled: true, enabled_processors: nil, disabled_processors: nil)
    @enabled = enabled ? true : false
    @enabled_processors = enabled_processors&.map(&:to_s)
    @disabled_processors = disabled_processors&.map(&:to_s)
  end

  # @return [Hash] Symbol-keyed settings; processor lists are omitted when nil
  def to_h
    serialized = { enabled: @enabled }
    serialized[:enabled_processors] = @enabled_processors unless @enabled_processors.nil?
    serialized[:disabled_processors] = @disabled_processors unless @disabled_processors.nil?
    serialized
  end
end
533
+
534
+ # Main extraction configuration
535
+ #
536
+ # @example Basic usage
537
+ # config = Extraction.new(use_cache: true, force_ocr: true)
538
+ #
539
+ # @example With OCR
540
+ # ocr = Config::OCR.new(backend: "tesseract", language: "eng")
541
+ # config = Extraction.new(ocr: ocr)
542
+ #
543
+ # @example With image extraction
544
+ # image = Config::ImageExtraction.new(extract_images: true, target_dpi: 600)
545
+ # config = Extraction.new(image_extraction: image)
546
+ #
547
+ # @example With preprocessing
548
+ # preprocessing = Config::ImagePreprocessing.new(
549
+ # binarization_method: "sauvola",
550
+ # denoise: true
551
+ # )
552
+ # config = Extraction.new(image_preprocessing: preprocessing)
553
+ #
554
+ # @example With post-processing
555
+ # postprocessor = Config::PostProcessor.new(
556
+ # enabled: true,
557
+ # enabled_processors: ["quality"]
558
+ # )
559
+ # config = Extraction.new(postprocessor: postprocessor)
560
+ #
561
+ # @example With all options
562
+ # config = Extraction.new(
563
+ # use_cache: true,
564
+ # enable_quality_processing: true,
565
+ # force_ocr: false,
566
+ # ocr: Config::OCR.new(language: "deu"),
567
+ # chunking: Config::Chunking.new(max_chars: 500),
568
+ # language_detection: Config::LanguageDetection.new(enabled: true),
569
+ # pdf_options: Config::PDF.new(extract_images: true, passwords: ["secret"]),
570
+ # image_extraction: Config::ImageExtraction.new(target_dpi: 600),
571
+ # image_preprocessing: Config::ImagePreprocessing.new(denoise: true),
572
+ # postprocessor: Config::PostProcessor.new(enabled: true)
573
+ # )
574
+ #
575
class Extraction
  attr_reader :use_cache, :enable_quality_processing, :force_ocr,
              :ocr, :chunking, :language_detection, :pdf_options,
              :image_extraction, :image_preprocessing, :postprocessor,
              :token_reduction, :keywords, :html_options,
              :max_concurrent_extractions

  # Builds a configuration from the Hash that
  # Kreuzberg._config_from_file_native returns for the given path.
  #
  # @param path [String] path to the configuration file
  # @return [Kreuzberg::Config::Extraction] loaded configuration object
  #
  # @example Load from TOML
  #   config = Kreuzberg::Config::Extraction.from_file("config.toml")
  #
  def self.from_file(path)
    settings = Kreuzberg._config_from_file_native(path)
    # Keyword arguments need Symbol keys.
    new(**settings.transform_keys(&:to_sym))
  end

  # Builds a configuration from the Hash that
  # Kreuzberg._config_discover_native returns, or returns nil when that call
  # yields nil.
  #
  # @return [Kreuzberg::Config::Extraction, nil]
  #
  # @example
  #   config = Kreuzberg::Config::Extraction.discover
  #   if config
  #     # Use discovered config
  #   end
  #
  def self.discover
    settings = Kreuzberg._config_discover_native
    # Keyword arguments need Symbol keys.
    new(**settings.transform_keys(&:to_sym)) unless settings.nil?
  end

  # The three boolean flags are coerced to true/false. Each nested section
  # accepts an instance of its config class, a Hash (wrapped after its keys
  # are converted to Symbols) or nil.
  #
  # @raise [ArgumentError] when a nested section has an unsupported type
  def initialize(use_cache: true, enable_quality_processing: false, force_ocr: false,
                 ocr: nil, chunking: nil, language_detection: nil, pdf_options: nil,
                 image_extraction: nil, image_preprocessing: nil, postprocessor: nil,
                 token_reduction: nil, keywords: nil, html_options: nil,
                 max_concurrent_extractions: nil)
    @use_cache = use_cache ? true : false
    @enable_quality_processing = enable_quality_processing ? true : false
    @force_ocr = force_ocr ? true : false

    @ocr = normalize_config(ocr, OCR)
    @chunking = normalize_config(chunking, Chunking)
    @language_detection = normalize_config(language_detection, LanguageDetection)
    @pdf_options = normalize_config(pdf_options, PDF)
    @image_extraction = normalize_config(image_extraction, ImageExtraction)
    @image_preprocessing = normalize_config(image_preprocessing, ImagePreprocessing)
    @postprocessor = normalize_config(postprocessor, PostProcessor)
    @token_reduction = normalize_config(token_reduction, TokenReduction)
    @keywords = normalize_config(keywords, Keywords)
    @html_options = normalize_config(html_options, HtmlOptions)

    @max_concurrent_extractions = max_concurrent_extractions&.to_i
  end

  # @return [Hash] Symbol-keyed settings; unset sections are omitted
  def to_h
    serialized = {
      use_cache: @use_cache,
      enable_quality_processing: @enable_quality_processing,
      force_ocr: @force_ocr
    }
    sections = {
      ocr: @ocr,
      chunking: @chunking,
      language_detection: @language_detection,
      pdf_options: @pdf_options,
      image_extraction: @image_extraction,
      image_preprocessing: @image_preprocessing,
      postprocessor: @postprocessor,
      token_reduction: @token_reduction,
      keywords: @keywords,
      html_options: @html_options
    }
    sections.each { |name, section| serialized[name] = section&.to_h }
    serialized[:max_concurrent_extractions] = @max_concurrent_extractions
    serialized.compact
  end

  private

  # Coerces a nested section argument into an instance of klass (or nil).
  def normalize_config(value, klass)
    case value
    when nil then nil
    when klass then value
    # Keyword arguments need Symbol keys.
    when Hash then klass.new(**value.transform_keys(&:to_sym))
    else raise ArgumentError, "Expected #{klass}, Hash, or nil, got #{value.class}"
    end
  end
end
687
+
688
+ # Backwards compatibility aliases
689
+ Ocr = OCR
690
+ end
691
+ end