kreuzberg 4.0.0.pre.rc.13 → 4.0.0.pre.rc.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +14 -14
- data/.rspec +3 -3
- data/.rubocop.yaml +1 -1
- data/.rubocop.yml +538 -538
- data/Gemfile +8 -8
- data/Gemfile.lock +105 -2
- data/README.md +454 -454
- data/Rakefile +33 -25
- data/Steepfile +47 -47
- data/examples/async_patterns.rb +341 -341
- data/ext/kreuzberg_rb/extconf.rb +45 -45
- data/ext/kreuzberg_rb/native/.cargo/config.toml +2 -2
- data/ext/kreuzberg_rb/native/Cargo.lock +6940 -6941
- data/ext/kreuzberg_rb/native/Cargo.toml +54 -54
- data/ext/kreuzberg_rb/native/README.md +425 -425
- data/ext/kreuzberg_rb/native/build.rs +15 -15
- data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -11
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -14
- data/ext/kreuzberg_rb/native/include/strings.h +20 -20
- data/ext/kreuzberg_rb/native/include/unistd.h +47 -47
- data/ext/kreuzberg_rb/native/src/lib.rs +3158 -3158
- data/extconf.rb +28 -28
- data/kreuzberg.gemspec +214 -214
- data/lib/kreuzberg/api_proxy.rb +142 -142
- data/lib/kreuzberg/cache_api.rb +81 -81
- data/lib/kreuzberg/cli.rb +55 -55
- data/lib/kreuzberg/cli_proxy.rb +127 -127
- data/lib/kreuzberg/config.rb +724 -724
- data/lib/kreuzberg/error_context.rb +80 -80
- data/lib/kreuzberg/errors.rb +118 -118
- data/lib/kreuzberg/extraction_api.rb +340 -340
- data/lib/kreuzberg/mcp_proxy.rb +186 -186
- data/lib/kreuzberg/ocr_backend_protocol.rb +113 -113
- data/lib/kreuzberg/post_processor_protocol.rb +86 -86
- data/lib/kreuzberg/result.rb +279 -279
- data/lib/kreuzberg/setup_lib_path.rb +80 -80
- data/lib/kreuzberg/validator_protocol.rb +89 -89
- data/lib/kreuzberg/version.rb +5 -5
- data/lib/kreuzberg.rb +109 -109
- data/lib/{pdfium.dll → libpdfium.dylib} +0 -0
- data/sig/kreuzberg/internal.rbs +184 -184
- data/sig/kreuzberg.rbs +546 -546
- data/spec/binding/cache_spec.rb +227 -227
- data/spec/binding/cli_proxy_spec.rb +85 -85
- data/spec/binding/cli_spec.rb +55 -55
- data/spec/binding/config_spec.rb +345 -345
- data/spec/binding/config_validation_spec.rb +283 -283
- data/spec/binding/error_handling_spec.rb +213 -213
- data/spec/binding/errors_spec.rb +66 -66
- data/spec/binding/plugins/ocr_backend_spec.rb +307 -307
- data/spec/binding/plugins/postprocessor_spec.rb +269 -269
- data/spec/binding/plugins/validator_spec.rb +274 -274
- data/spec/fixtures/config.toml +39 -39
- data/spec/fixtures/config.yaml +41 -41
- data/spec/fixtures/invalid_config.toml +4 -4
- data/spec/smoke/package_spec.rb +178 -178
- data/spec/spec_helper.rb +42 -42
- data/vendor/Cargo.toml +1 -1
- data/vendor/kreuzberg/Cargo.toml +5 -5
- data/vendor/kreuzberg/README.md +230 -230
- data/vendor/kreuzberg/benches/otel_overhead.rs +48 -48
- data/vendor/kreuzberg/build.rs +843 -843
- data/vendor/kreuzberg/src/api/error.rs +81 -81
- data/vendor/kreuzberg/src/api/handlers.rs +199 -199
- data/vendor/kreuzberg/src/api/mod.rs +79 -79
- data/vendor/kreuzberg/src/api/server.rs +353 -353
- data/vendor/kreuzberg/src/api/types.rs +170 -170
- data/vendor/kreuzberg/src/cache/mod.rs +1167 -1167
- data/vendor/kreuzberg/src/chunking/mod.rs +1877 -1877
- data/vendor/kreuzberg/src/chunking/processor.rs +220 -220
- data/vendor/kreuzberg/src/core/batch_mode.rs +95 -95
- data/vendor/kreuzberg/src/core/config.rs +1080 -1080
- data/vendor/kreuzberg/src/core/extractor.rs +1156 -1156
- data/vendor/kreuzberg/src/core/io.rs +329 -329
- data/vendor/kreuzberg/src/core/mime.rs +605 -605
- data/vendor/kreuzberg/src/core/mod.rs +47 -47
- data/vendor/kreuzberg/src/core/pipeline.rs +1184 -1184
- data/vendor/kreuzberg/src/embeddings.rs +500 -500
- data/vendor/kreuzberg/src/error.rs +431 -431
- data/vendor/kreuzberg/src/extraction/archive.rs +954 -954
- data/vendor/kreuzberg/src/extraction/docx.rs +398 -398
- data/vendor/kreuzberg/src/extraction/email.rs +854 -854
- data/vendor/kreuzberg/src/extraction/excel.rs +688 -688
- data/vendor/kreuzberg/src/extraction/html.rs +601 -601
- data/vendor/kreuzberg/src/extraction/image.rs +491 -491
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +574 -574
- data/vendor/kreuzberg/src/extraction/markdown.rs +213 -213
- data/vendor/kreuzberg/src/extraction/mod.rs +81 -81
- data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -398
- data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -247
- data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -240
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -130
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +284 -284
- data/vendor/kreuzberg/src/extraction/pptx.rs +3100 -3100
- data/vendor/kreuzberg/src/extraction/structured.rs +490 -490
- data/vendor/kreuzberg/src/extraction/table.rs +328 -328
- data/vendor/kreuzberg/src/extraction/text.rs +269 -269
- data/vendor/kreuzberg/src/extraction/xml.rs +333 -333
- data/vendor/kreuzberg/src/extractors/archive.rs +447 -447
- data/vendor/kreuzberg/src/extractors/bibtex.rs +470 -470
- data/vendor/kreuzberg/src/extractors/docbook.rs +504 -504
- data/vendor/kreuzberg/src/extractors/docx.rs +400 -400
- data/vendor/kreuzberg/src/extractors/email.rs +157 -157
- data/vendor/kreuzberg/src/extractors/epub.rs +708 -708
- data/vendor/kreuzberg/src/extractors/excel.rs +345 -345
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +492 -492
- data/vendor/kreuzberg/src/extractors/html.rs +407 -407
- data/vendor/kreuzberg/src/extractors/image.rs +219 -219
- data/vendor/kreuzberg/src/extractors/jats.rs +1054 -1054
- data/vendor/kreuzberg/src/extractors/jupyter.rs +368 -368
- data/vendor/kreuzberg/src/extractors/latex.rs +653 -653
- data/vendor/kreuzberg/src/extractors/markdown.rs +701 -701
- data/vendor/kreuzberg/src/extractors/mod.rs +429 -429
- data/vendor/kreuzberg/src/extractors/odt.rs +628 -628
- data/vendor/kreuzberg/src/extractors/opml.rs +635 -635
- data/vendor/kreuzberg/src/extractors/orgmode.rs +529 -529
- data/vendor/kreuzberg/src/extractors/pdf.rs +749 -749
- data/vendor/kreuzberg/src/extractors/pptx.rs +267 -267
- data/vendor/kreuzberg/src/extractors/rst.rs +577 -577
- data/vendor/kreuzberg/src/extractors/rtf.rs +809 -809
- data/vendor/kreuzberg/src/extractors/security.rs +484 -484
- data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -367
- data/vendor/kreuzberg/src/extractors/structured.rs +142 -142
- data/vendor/kreuzberg/src/extractors/text.rs +265 -265
- data/vendor/kreuzberg/src/extractors/typst.rs +651 -651
- data/vendor/kreuzberg/src/extractors/xml.rs +147 -147
- data/vendor/kreuzberg/src/image/dpi.rs +164 -164
- data/vendor/kreuzberg/src/image/mod.rs +6 -6
- data/vendor/kreuzberg/src/image/preprocessing.rs +417 -417
- data/vendor/kreuzberg/src/image/resize.rs +89 -89
- data/vendor/kreuzberg/src/keywords/config.rs +154 -154
- data/vendor/kreuzberg/src/keywords/mod.rs +237 -237
- data/vendor/kreuzberg/src/keywords/processor.rs +275 -275
- data/vendor/kreuzberg/src/keywords/rake.rs +293 -293
- data/vendor/kreuzberg/src/keywords/types.rs +68 -68
- data/vendor/kreuzberg/src/keywords/yake.rs +163 -163
- data/vendor/kreuzberg/src/language_detection/mod.rs +985 -985
- data/vendor/kreuzberg/src/language_detection/processor.rs +219 -219
- data/vendor/kreuzberg/src/lib.rs +113 -113
- data/vendor/kreuzberg/src/mcp/mod.rs +35 -35
- data/vendor/kreuzberg/src/mcp/server.rs +2076 -2076
- data/vendor/kreuzberg/src/ocr/cache.rs +469 -469
- data/vendor/kreuzberg/src/ocr/error.rs +37 -37
- data/vendor/kreuzberg/src/ocr/hocr.rs +216 -216
- data/vendor/kreuzberg/src/ocr/mod.rs +58 -58
- data/vendor/kreuzberg/src/ocr/processor.rs +863 -863
- data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -4
- data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -144
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +452 -452
- data/vendor/kreuzberg/src/ocr/types.rs +393 -393
- data/vendor/kreuzberg/src/ocr/utils.rs +47 -47
- data/vendor/kreuzberg/src/ocr/validation.rs +206 -206
- data/vendor/kreuzberg/src/panic_context.rs +154 -154
- data/vendor/kreuzberg/src/pdf/bindings.rs +44 -44
- data/vendor/kreuzberg/src/pdf/bundled.rs +346 -346
- data/vendor/kreuzberg/src/pdf/error.rs +130 -130
- data/vendor/kreuzberg/src/pdf/images.rs +139 -139
- data/vendor/kreuzberg/src/pdf/metadata.rs +489 -489
- data/vendor/kreuzberg/src/pdf/mod.rs +68 -68
- data/vendor/kreuzberg/src/pdf/rendering.rs +368 -368
- data/vendor/kreuzberg/src/pdf/table.rs +420 -420
- data/vendor/kreuzberg/src/pdf/text.rs +240 -240
- data/vendor/kreuzberg/src/plugins/extractor.rs +1044 -1044
- data/vendor/kreuzberg/src/plugins/mod.rs +212 -212
- data/vendor/kreuzberg/src/plugins/ocr.rs +639 -639
- data/vendor/kreuzberg/src/plugins/processor.rs +650 -650
- data/vendor/kreuzberg/src/plugins/registry.rs +1339 -1339
- data/vendor/kreuzberg/src/plugins/traits.rs +258 -258
- data/vendor/kreuzberg/src/plugins/validator.rs +967 -967
- data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -1470
- data/vendor/kreuzberg/src/text/mod.rs +25 -25
- data/vendor/kreuzberg/src/text/quality.rs +697 -697
- data/vendor/kreuzberg/src/text/quality_processor.rs +219 -219
- data/vendor/kreuzberg/src/text/string_utils.rs +217 -217
- data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -164
- data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -100
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -796
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -902
- data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -160
- data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -619
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -147
- data/vendor/kreuzberg/src/types.rs +1055 -1055
- data/vendor/kreuzberg/src/utils/mod.rs +17 -17
- data/vendor/kreuzberg/src/utils/quality.rs +959 -959
- data/vendor/kreuzberg/src/utils/string_utils.rs +381 -381
- data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -53
- data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -482
- data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -261
- data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -400
- data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -1205
- data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -280
- data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -425
- data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -172
- data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -622
- data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -1300
- data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -175
- data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -734
- data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -37
- data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -100
- data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -801
- data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -693
- data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -111
- data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -162
- data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -226
- data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -41
- data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -196
- data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -227
- data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -181
- data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -791
- data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -47
- data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -760
- data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -634
- data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -136
- data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -84
- data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -681
- data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -64
- data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -51
- data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -476
- data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -163
- data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -1
- data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -101
- data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -477
- data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -490
- data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -415
- data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -223
- data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -331
- data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -562
- data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -436
- data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -561
- data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -193
- data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -448
- data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -32
- data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -33
- data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -76
- data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -129
- data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -54
- data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -118
- data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -149
- data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -506
- data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -75
- data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -519
- data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -647
- data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -62
- data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -796
- data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -31
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -52
- data/vendor/kreuzberg/tests/api_tests.rs +966 -966
- data/vendor/kreuzberg/tests/archive_integration.rs +545 -545
- data/vendor/kreuzberg/tests/batch_orchestration.rs +556 -556
- data/vendor/kreuzberg/tests/batch_processing.rs +318 -318
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -421
- data/vendor/kreuzberg/tests/concurrency_stress.rs +533 -533
- data/vendor/kreuzberg/tests/config_features.rs +612 -612
- data/vendor/kreuzberg/tests/config_loading_tests.rs +416 -416
- data/vendor/kreuzberg/tests/core_integration.rs +510 -510
- data/vendor/kreuzberg/tests/csv_integration.rs +414 -414
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +500 -500
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -122
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -370
- data/vendor/kreuzberg/tests/email_integration.rs +327 -327
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -275
- data/vendor/kreuzberg/tests/error_handling.rs +402 -402
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -228
- data/vendor/kreuzberg/tests/format_integration.rs +164 -164
- data/vendor/kreuzberg/tests/helpers/mod.rs +142 -142
- data/vendor/kreuzberg/tests/html_table_test.rs +551 -551
- data/vendor/kreuzberg/tests/image_integration.rs +255 -255
- data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -139
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -639
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -704
- data/vendor/kreuzberg/tests/keywords_integration.rs +479 -479
- data/vendor/kreuzberg/tests/keywords_quality.rs +509 -509
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -496
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -490
- data/vendor/kreuzberg/tests/mime_detection.rs +429 -429
- data/vendor/kreuzberg/tests/ocr_configuration.rs +514 -514
- data/vendor/kreuzberg/tests/ocr_errors.rs +698 -698
- data/vendor/kreuzberg/tests/ocr_quality.rs +629 -629
- data/vendor/kreuzberg/tests/ocr_stress.rs +469 -469
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +674 -674
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -616
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -822
- data/vendor/kreuzberg/tests/pdf_integration.rs +45 -45
- data/vendor/kreuzberg/tests/pdfium_linking.rs +374 -374
- data/vendor/kreuzberg/tests/pipeline_integration.rs +1436 -1436
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +776 -776
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +560 -560
- data/vendor/kreuzberg/tests/plugin_system.rs +927 -927
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -783
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +587 -587
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +694 -694
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +775 -775
- data/vendor/kreuzberg/tests/security_validation.rs +416 -416
- data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -888
- data/vendor/kreuzberg/tests/test_fastembed.rs +631 -631
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1260 -1260
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +648 -648
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -87
- data/vendor/kreuzberg-ffi/Cargo.toml +1 -1
- data/vendor/kreuzberg-ffi/README.md +851 -851
- data/vendor/kreuzberg-ffi/build.rs +176 -176
- data/vendor/kreuzberg-ffi/cbindgen.toml +27 -27
- data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +12 -12
- data/vendor/kreuzberg-ffi/kreuzberg.h +1087 -1087
- data/vendor/kreuzberg-ffi/src/lib.rs +3616 -3616
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +247 -247
- data/vendor/kreuzberg-ffi/tests.disabled/README.md +48 -48
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +299 -299
- data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +346 -346
- data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +232 -232
- data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +470 -470
- data/vendor/kreuzberg-tesseract/.commitlintrc.json +13 -13
- data/vendor/kreuzberg-tesseract/.crate-ignore +2 -2
- data/vendor/kreuzberg-tesseract/Cargo.lock +2933 -2933
- data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
- data/vendor/kreuzberg-tesseract/LICENSE +22 -22
- data/vendor/kreuzberg-tesseract/README.md +399 -399
- data/vendor/kreuzberg-tesseract/build.rs +1354 -1354
- data/vendor/kreuzberg-tesseract/patches/README.md +71 -71
- data/vendor/kreuzberg-tesseract/patches/tesseract.diff +199 -199
- data/vendor/kreuzberg-tesseract/src/api.rs +1371 -1371
- data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +77 -77
- data/vendor/kreuzberg-tesseract/src/enums.rs +297 -297
- data/vendor/kreuzberg-tesseract/src/error.rs +81 -81
- data/vendor/kreuzberg-tesseract/src/lib.rs +145 -145
- data/vendor/kreuzberg-tesseract/src/monitor.rs +57 -57
- data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +197 -197
- data/vendor/kreuzberg-tesseract/src/page_iterator.rs +253 -253
- data/vendor/kreuzberg-tesseract/src/result_iterator.rs +286 -286
- data/vendor/kreuzberg-tesseract/src/result_renderer.rs +183 -183
- data/vendor/kreuzberg-tesseract/tests/integration_test.rs +211 -211
- data/vendor/rb-sys/.cargo_vcs_info.json +5 -5
- data/vendor/rb-sys/Cargo.lock +393 -393
- data/vendor/rb-sys/Cargo.toml +70 -70
- data/vendor/rb-sys/Cargo.toml.orig +57 -57
- data/vendor/rb-sys/LICENSE-APACHE +190 -190
- data/vendor/rb-sys/LICENSE-MIT +21 -21
- data/vendor/rb-sys/build/features.rs +111 -111
- data/vendor/rb-sys/build/main.rs +286 -286
- data/vendor/rb-sys/build/stable_api_config.rs +155 -155
- data/vendor/rb-sys/build/version.rs +50 -50
- data/vendor/rb-sys/readme.md +36 -36
- data/vendor/rb-sys/src/bindings.rs +21 -21
- data/vendor/rb-sys/src/hidden.rs +11 -11
- data/vendor/rb-sys/src/lib.rs +35 -35
- data/vendor/rb-sys/src/macros.rs +371 -371
- data/vendor/rb-sys/src/memory.rs +53 -53
- data/vendor/rb-sys/src/ruby_abi_version.rs +38 -38
- data/vendor/rb-sys/src/special_consts.rs +31 -31
- data/vendor/rb-sys/src/stable_api/compiled.c +179 -179
- data/vendor/rb-sys/src/stable_api/compiled.rs +257 -257
- data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +324 -324
- data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +332 -332
- data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +325 -325
- data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +323 -323
- data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +339 -339
- data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +339 -339
- data/vendor/rb-sys/src/stable_api.rs +260 -260
- data/vendor/rb-sys/src/symbol.rs +31 -31
- data/vendor/rb-sys/src/tracking_allocator.rs +330 -330
- data/vendor/rb-sys/src/utils.rs +89 -89
- data/vendor/rb-sys/src/value_type.rs +7 -7
- metadata +73 -4
- data/vendor/kreuzberg-ffi/kreuzberg-ffi-install.pc +0 -12
data/lib/kreuzberg/config.rb
CHANGED
|
@@ -1,724 +1,724 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
module Kreuzberg
|
|
4
|
-
module Config
|
|
5
|
-
# OCR configuration
|
|
6
|
-
#
|
|
7
|
-
# @example
|
|
8
|
-
# ocr = OCR.new(backend: "tesseract", language: "eng")
|
|
9
|
-
#
|
|
10
|
-
class OCR
|
|
11
|
-
attr_reader :backend, :language, :tesseract_config
|
|
12
|
-
|
|
13
|
-
def initialize(
|
|
14
|
-
backend: 'tesseract',
|
|
15
|
-
language: 'eng',
|
|
16
|
-
tesseract_config: nil
|
|
17
|
-
)
|
|
18
|
-
@backend = backend.to_s
|
|
19
|
-
@language = language.to_s
|
|
20
|
-
@tesseract_config = normalize_tesseract_config(tesseract_config)
|
|
21
|
-
end
|
|
22
|
-
|
|
23
|
-
def to_h
|
|
24
|
-
{
|
|
25
|
-
backend: @backend,
|
|
26
|
-
language: @language,
|
|
27
|
-
tesseract_config: @tesseract_config&.to_h
|
|
28
|
-
}.compact
|
|
29
|
-
end
|
|
30
|
-
|
|
31
|
-
private
|
|
32
|
-
|
|
33
|
-
def normalize_tesseract_config(value)
|
|
34
|
-
return nil if value.nil?
|
|
35
|
-
return value if value.is_a?(Tesseract)
|
|
36
|
-
return Tesseract.new(**value.transform_keys(&:to_sym)) if value.is_a?(Hash)
|
|
37
|
-
|
|
38
|
-
raise ArgumentError, "Expected #{Tesseract}, Hash, or nil, got #{value.class}"
|
|
39
|
-
end
|
|
40
|
-
end
|
|
41
|
-
|
|
42
|
-
# Tesseract OCR engine configuration
|
|
43
|
-
class Tesseract
|
|
44
|
-
attr_reader :options
|
|
45
|
-
|
|
46
|
-
def initialize(**options)
|
|
47
|
-
@options = options.transform_keys(&:to_sym)
|
|
48
|
-
normalize_nested_preprocessing!
|
|
49
|
-
end
|
|
50
|
-
|
|
51
|
-
def to_h
|
|
52
|
-
@options.dup
|
|
53
|
-
end
|
|
54
|
-
|
|
55
|
-
private
|
|
56
|
-
|
|
57
|
-
def normalize_nested_preprocessing!
|
|
58
|
-
preprocessing = @options[:preprocessing]
|
|
59
|
-
return if preprocessing.nil?
|
|
60
|
-
return if preprocessing.is_a?(ImagePreprocessing)
|
|
61
|
-
return @options[:preprocessing] = ImagePreprocessing.new(**preprocessing.transform_keys(&:to_sym)) if
|
|
62
|
-
preprocessing.is_a?(Hash)
|
|
63
|
-
|
|
64
|
-
raise ArgumentError, "preprocessing must be #{ImagePreprocessing} or Hash"
|
|
65
|
-
end
|
|
66
|
-
end
|
|
67
|
-
|
|
68
|
-
# Chunking configuration
|
|
69
|
-
#
|
|
70
|
-
# @example
|
|
71
|
-
# chunking = Chunking.new(max_chars: 1000, max_overlap: 200)
|
|
72
|
-
#
|
|
73
|
-
class Chunking
|
|
74
|
-
attr_reader :max_chars, :max_overlap, :preset, :embedding, :enabled
|
|
75
|
-
|
|
76
|
-
def initialize(
|
|
77
|
-
max_chars: nil,
|
|
78
|
-
max_overlap: nil,
|
|
79
|
-
preset: nil,
|
|
80
|
-
embedding: nil,
|
|
81
|
-
chunk_size: nil,
|
|
82
|
-
chunk_overlap: nil,
|
|
83
|
-
enabled: true
|
|
84
|
-
)
|
|
85
|
-
resolved_size = chunk_size || max_chars || 1000
|
|
86
|
-
resolved_overlap = chunk_overlap || max_overlap || 200
|
|
87
|
-
|
|
88
|
-
@max_chars = resolved_size.to_i
|
|
89
|
-
@max_overlap = resolved_overlap.to_i
|
|
90
|
-
@preset = preset&.to_s
|
|
91
|
-
@embedding = normalize_embedding(embedding)
|
|
92
|
-
@enabled = boolean_or_nil(enabled)
|
|
93
|
-
end
|
|
94
|
-
|
|
95
|
-
def to_h
|
|
96
|
-
config = {
|
|
97
|
-
max_chars: @max_chars,
|
|
98
|
-
max_overlap: @max_overlap,
|
|
99
|
-
preset: @preset,
|
|
100
|
-
embedding: @embedding&.to_h
|
|
101
|
-
}.compact
|
|
102
|
-
# @type var config: Hash[Symbol, untyped]
|
|
103
|
-
config[:enabled] = @enabled unless @enabled.nil?
|
|
104
|
-
config
|
|
105
|
-
end
|
|
106
|
-
|
|
107
|
-
private
|
|
108
|
-
|
|
109
|
-
def normalize_embedding(value)
|
|
110
|
-
return nil if value.nil?
|
|
111
|
-
return value if value.is_a?(Embedding)
|
|
112
|
-
return Embedding.new(**value.transform_keys(&:to_sym)) if value.is_a?(Hash)
|
|
113
|
-
|
|
114
|
-
raise ArgumentError, "Expected #{Embedding}, Hash, or nil, got #{value.class}"
|
|
115
|
-
end
|
|
116
|
-
|
|
117
|
-
def boolean_or_nil(value)
|
|
118
|
-
return nil if value.nil?
|
|
119
|
-
|
|
120
|
-
value ? true : false
|
|
121
|
-
end
|
|
122
|
-
end
|
|
123
|
-
|
|
124
|
-
# Embedding model configuration for document chunking
|
|
125
|
-
class Embedding
|
|
126
|
-
attr_reader :model, :normalize, :batch_size, :show_download_progress, :cache_dir
|
|
127
|
-
|
|
128
|
-
def initialize(
|
|
129
|
-
model: { type: :preset, name: 'balanced' },
|
|
130
|
-
normalize: true,
|
|
131
|
-
batch_size: 32,
|
|
132
|
-
show_download_progress: false,
|
|
133
|
-
cache_dir: nil
|
|
134
|
-
)
|
|
135
|
-
@model = normalize_model(model)
|
|
136
|
-
@normalize = boolean_or_nil(normalize)
|
|
137
|
-
@batch_size = batch_size&.to_i
|
|
138
|
-
@show_download_progress = boolean_or_nil(show_download_progress)
|
|
139
|
-
@cache_dir = cache_dir&.to_s
|
|
140
|
-
end
|
|
141
|
-
|
|
142
|
-
def to_h
|
|
143
|
-
{
|
|
144
|
-
model: @model,
|
|
145
|
-
normalize: @normalize,
|
|
146
|
-
batch_size: @batch_size,
|
|
147
|
-
show_download_progress: @show_download_progress,
|
|
148
|
-
cache_dir: @cache_dir
|
|
149
|
-
}.compact
|
|
150
|
-
end
|
|
151
|
-
|
|
152
|
-
private
|
|
153
|
-
|
|
154
|
-
def normalize_model(model)
|
|
155
|
-
normalized = if model.respond_to?(:to_h)
|
|
156
|
-
model.to_h
|
|
157
|
-
else
|
|
158
|
-
model
|
|
159
|
-
end
|
|
160
|
-
raise ArgumentError, 'model must be a Hash describing the embedding model' unless normalized.is_a?(Hash)
|
|
161
|
-
|
|
162
|
-
normalized.transform_keys(&:to_sym)
|
|
163
|
-
end
|
|
164
|
-
|
|
165
|
-
def boolean_or_nil(value)
|
|
166
|
-
return nil if value.nil?
|
|
167
|
-
|
|
168
|
-
value ? true : false
|
|
169
|
-
end
|
|
170
|
-
end
|
|
171
|
-
|
|
172
|
-
# Language detection configuration
|
|
173
|
-
#
|
|
174
|
-
# @example
|
|
175
|
-
# lang = LanguageDetection.new(enabled: true, min_confidence: 0.8)
|
|
176
|
-
#
|
|
177
|
-
class LanguageDetection
|
|
178
|
-
attr_reader :enabled, :min_confidence, :detect_multiple
|
|
179
|
-
|
|
180
|
-
def initialize(enabled: false, min_confidence: 0.5, detect_multiple: false)
|
|
181
|
-
@enabled = enabled ? true : false
|
|
182
|
-
@min_confidence = min_confidence.to_f
|
|
183
|
-
@detect_multiple = detect_multiple ? true : false
|
|
184
|
-
end
|
|
185
|
-
|
|
186
|
-
def to_h
|
|
187
|
-
{
|
|
188
|
-
enabled: @enabled,
|
|
189
|
-
min_confidence: @min_confidence,
|
|
190
|
-
detect_multiple: @detect_multiple
|
|
191
|
-
}
|
|
192
|
-
end
|
|
193
|
-
end
|
|
194
|
-
|
|
195
|
-
# PDF-specific options
|
|
196
|
-
#
|
|
197
|
-
# @example
|
|
198
|
-
# pdf = PDF.new(extract_images: true, passwords: ["secret", "backup"])
|
|
199
|
-
#
|
|
200
|
-
class PDF
|
|
201
|
-
attr_reader :extract_images, :passwords, :extract_metadata
|
|
202
|
-
|
|
203
|
-
def initialize(
|
|
204
|
-
extract_images: false,
|
|
205
|
-
passwords: nil,
|
|
206
|
-
extract_metadata: true
|
|
207
|
-
)
|
|
208
|
-
@extract_images = extract_images ? true : false
|
|
209
|
-
@passwords = if passwords.is_a?(Array)
|
|
210
|
-
passwords.map(&:to_s)
|
|
211
|
-
else
|
|
212
|
-
(passwords ? [passwords.to_s] : nil)
|
|
213
|
-
end
|
|
214
|
-
@extract_metadata = extract_metadata ? true : false
|
|
215
|
-
end
|
|
216
|
-
|
|
217
|
-
def to_h
|
|
218
|
-
{
|
|
219
|
-
extract_images: @extract_images,
|
|
220
|
-
passwords: @passwords,
|
|
221
|
-
extract_metadata: @extract_metadata
|
|
222
|
-
}.compact
|
|
223
|
-
end
|
|
224
|
-
end
|
|
225
|
-
|
|
226
|
-
# Image extraction configuration
|
|
227
|
-
#
|
|
228
|
-
# @example
|
|
229
|
-
# image = ImageExtraction.new(extract_images: true, target_dpi: 300)
|
|
230
|
-
#
|
|
231
|
-
# @example With auto-adjust DPI
|
|
232
|
-
# image = ImageExtraction.new(
|
|
233
|
-
# extract_images: true,
|
|
234
|
-
# auto_adjust_dpi: true,
|
|
235
|
-
# min_dpi: 150,
|
|
236
|
-
# max_dpi: 600
|
|
237
|
-
# )
|
|
238
|
-
#
|
|
239
|
-
class ImageExtraction
|
|
240
|
-
attr_reader :extract_images, :target_dpi, :max_image_dimension,
|
|
241
|
-
:auto_adjust_dpi, :min_dpi, :max_dpi
|
|
242
|
-
|
|
243
|
-
def initialize(
|
|
244
|
-
extract_images: true,
|
|
245
|
-
target_dpi: 300,
|
|
246
|
-
max_image_dimension: 2000,
|
|
247
|
-
auto_adjust_dpi: true,
|
|
248
|
-
min_dpi: 150,
|
|
249
|
-
max_dpi: 600
|
|
250
|
-
)
|
|
251
|
-
@extract_images = extract_images ? true : false
|
|
252
|
-
@target_dpi = target_dpi.to_i
|
|
253
|
-
@max_image_dimension = max_image_dimension.to_i
|
|
254
|
-
@auto_adjust_dpi = auto_adjust_dpi ? true : false
|
|
255
|
-
@min_dpi = min_dpi.to_i
|
|
256
|
-
@max_dpi = max_dpi.to_i
|
|
257
|
-
end
|
|
258
|
-
|
|
259
|
-
def to_h
|
|
260
|
-
{
|
|
261
|
-
extract_images: @extract_images,
|
|
262
|
-
target_dpi: @target_dpi,
|
|
263
|
-
max_image_dimension: @max_image_dimension,
|
|
264
|
-
auto_adjust_dpi: @auto_adjust_dpi,
|
|
265
|
-
min_dpi: @min_dpi,
|
|
266
|
-
max_dpi: @max_dpi
|
|
267
|
-
}
|
|
268
|
-
end
|
|
269
|
-
end
|
|
270
|
-
|
|
271
|
-
# Image preprocessing configuration for OCR
|
|
272
|
-
#
|
|
273
|
-
# @example Basic preprocessing
|
|
274
|
-
# preprocessing = ImagePreprocessing.new(
|
|
275
|
-
# binarization_method: "otsu",
|
|
276
|
-
# denoise: true
|
|
277
|
-
# )
|
|
278
|
-
#
|
|
279
|
-
# @example Advanced preprocessing
|
|
280
|
-
# preprocessing = ImagePreprocessing.new(
|
|
281
|
-
# target_dpi: 600,
|
|
282
|
-
# auto_rotate: true,
|
|
283
|
-
# deskew: true,
|
|
284
|
-
# denoise: true,
|
|
285
|
-
# contrast_enhance: true,
|
|
286
|
-
# binarization_method: "sauvola",
|
|
287
|
-
# invert_colors: false
|
|
288
|
-
# )
|
|
289
|
-
#
|
|
290
|
-
class ImagePreprocessing
|
|
291
|
-
attr_reader :target_dpi, :auto_rotate, :deskew, :denoise,
|
|
292
|
-
:contrast_enhance, :binarization_method, :invert_colors
|
|
293
|
-
|
|
294
|
-
def initialize(
|
|
295
|
-
target_dpi: 300,
|
|
296
|
-
auto_rotate: true,
|
|
297
|
-
deskew: true,
|
|
298
|
-
denoise: false,
|
|
299
|
-
contrast_enhance: true,
|
|
300
|
-
binarization_method: 'otsu',
|
|
301
|
-
invert_colors: false
|
|
302
|
-
)
|
|
303
|
-
@target_dpi = target_dpi.to_i
|
|
304
|
-
@auto_rotate = auto_rotate ? true : false
|
|
305
|
-
@deskew = deskew ? true : false
|
|
306
|
-
@denoise = denoise ? true : false
|
|
307
|
-
@contrast_enhance = contrast_enhance ? true : false
|
|
308
|
-
@binarization_method = binarization_method.to_s
|
|
309
|
-
@invert_colors = invert_colors ? true : false
|
|
310
|
-
|
|
311
|
-
valid_methods = %w[otsu sauvola adaptive]
|
|
312
|
-
return if valid_methods.include?(@binarization_method)
|
|
313
|
-
|
|
314
|
-
raise ArgumentError, "binarization_method must be one of: #{valid_methods.join(', ')}"
|
|
315
|
-
end
|
|
316
|
-
|
|
317
|
-
def to_h
|
|
318
|
-
{
|
|
319
|
-
target_dpi: @target_dpi,
|
|
320
|
-
auto_rotate: @auto_rotate,
|
|
321
|
-
deskew: @deskew,
|
|
322
|
-
denoise: @denoise,
|
|
323
|
-
contrast_enhance: @contrast_enhance,
|
|
324
|
-
binarization_method: @binarization_method,
|
|
325
|
-
invert_colors: @invert_colors
|
|
326
|
-
}
|
|
327
|
-
end
|
|
328
|
-
end
|
|
329
|
-
|
|
330
|
-
# Token reduction configuration
|
|
331
|
-
#
|
|
332
|
-
# @example Disable token reduction
|
|
333
|
-
# token = TokenReduction.new(mode: "off")
|
|
334
|
-
#
|
|
335
|
-
# @example Light reduction
|
|
336
|
-
# token = TokenReduction.new(mode: "light", preserve_important_words: true)
|
|
337
|
-
#
|
|
338
|
-
# @example Aggressive reduction
|
|
339
|
-
# token = TokenReduction.new(mode: "aggressive", preserve_important_words: false)
|
|
340
|
-
#
|
|
341
|
-
class TokenReduction
  # Reduction levels accepted by #initialize. Defined once at class level so
  # the list is not rebuilt on every instantiation.
  VALID_MODES = %w[off light moderate aggressive maximum].freeze

  attr_reader :mode, :preserve_important_words

  # Builds a token reduction configuration.
  #
  # @param mode [#to_s] reduction level; must be one of VALID_MODES after +to_s+
  # @param preserve_important_words [Object] truthy/falsy flag, stored as true/false
  # @raise [ArgumentError] if +mode+ is not a supported level
  def initialize(mode: 'off', preserve_important_words: true)
    @mode = mode.to_s
    @preserve_important_words = preserve_important_words ? true : false
    return if VALID_MODES.include?(@mode)

    raise ArgumentError, "mode must be one of: #{VALID_MODES.join(', ')}"
  end

  # @return [Hash{Symbol => Object}] both settings keyed by option name
  def to_h
    {
      mode: @mode,
      preserve_important_words: @preserve_important_words
    }
  end
end
|
|
361
|
-
|
|
362
|
-
# HTML preprocessing configuration for content extraction
|
|
363
|
-
class HtmlPreprocessing
  attr_reader :enabled, :preset, :remove_navigation, :remove_forms

  # Stores each option; an option left as nil stays nil.
  #
  # @param enabled [Object, nil] collapsed to true/false unless nil
  # @param preset [#to_sym, nil] converted with +to_sym+ unless nil
  # @param remove_navigation [Object, nil] collapsed to true/false unless nil
  # @param remove_forms [Object, nil] collapsed to true/false unless nil
  def initialize(enabled: nil, preset: nil, remove_navigation: nil, remove_forms: nil)
    @enabled = tri_state(enabled)
    @preset = preset.nil? ? nil : preset.to_sym
    @remove_navigation = tri_state(remove_navigation)
    @remove_forms = tri_state(remove_forms)
  end

  # @return [Hash{Symbol => Object}] only the options whose value is not nil
  def to_h
    [
      [:enabled, @enabled],
      [:preset, @preset],
      [:remove_navigation, @remove_navigation],
      [:remove_forms, @remove_forms]
    ].reject { |_name, setting| setting.nil? }.to_h
  end

  private

  # Keeps nil as nil; maps false to false and anything else to true.
  def tri_state(value)
    case value
    when nil then nil
    when false then false
    else true
    end
  end
end
|
|
390
|
-
|
|
391
|
-
# HTML rendering options for document conversion
|
|
392
|
-
class HtmlOptions
  # Option names whose values are normalized to Symbols.
  SYMBOL_OPTION_KEYS = %i[
    heading_style
    code_block_style
    highlight_style
    list_indent_type
    newline_style
    whitespace_mode
  ].freeze

  attr_reader :options

  # Accepts arbitrary rendering options.
  #
  # Option names are symbolized, the values of SYMBOL_OPTION_KEYS are
  # converted with +to_sym+ (nil stays nil), and a Hash given under
  # +:preprocessing+ is wrapped in an HtmlPreprocessing.
  #
  # @param options [Hash] rendering options keyed by String or Symbol
  def initialize(**options)
    normalized = options.transform_keys(&:to_sym)
    SYMBOL_OPTION_KEYS.each do |key|
      normalized[key] = normalized[key]&.to_sym if normalized.key?(key)
    end

    preprocessing = normalized[:preprocessing]
    if preprocessing.is_a?(Hash)
      # Symbolize the nested keys first: a Hash with String keys cannot be
      # splatted into HtmlPreprocessing's keyword parameters.
      normalized[:preprocessing] = HtmlPreprocessing.new(**preprocessing.transform_keys(&:to_sym))
    end
    @options = normalized
  end

  # @return [Hash{Symbol => Object}] a new Hash in which nested config
  #   objects are replaced by their own +to_h+ result
  def to_h
    @options.transform_values { |value| serialize_value(value) }
  end

  private

  # nil and Array both respond to +to_h+, but converting them would turn nil
  # into {} and raise TypeError for an Array of non-pairs, so they are
  # passed through unchanged.
  def serialize_value(value)
    return value if value.nil? || value.is_a?(Array)

    value.respond_to?(:to_h) ? value.to_h : value
  end
end
|
|
418
|
-
|
|
419
|
-
# YAKE keyword extraction parameters
|
|
420
|
-
class KeywordYakeParams
  attr_reader :window_size

  # @param window_size [#to_i] stored after +to_i+ coercion
  def initialize(window_size: 2)
    @window_size = window_size.to_i
  end

  # @return [Hash{Symbol => Integer}] single-entry Hash holding the window size
  def to_h
    Hash[:window_size, @window_size]
  end
end
|
|
431
|
-
|
|
432
|
-
# RAKE keyword extraction parameters
|
|
433
|
-
class KeywordRakeParams
  attr_reader :min_word_length, :max_words_per_phrase

  # Both limits are stored after +to_i+ coercion.
  #
  # @param min_word_length [#to_i]
  # @param max_words_per_phrase [#to_i]
  def initialize(min_word_length: 1, max_words_per_phrase: 3)
    @min_word_length, @max_words_per_phrase =
      [min_word_length, max_words_per_phrase].map(&:to_i)
  end

  # @return [Hash{Symbol => Integer}] both limits keyed by option name
  def to_h
    %i[min_word_length max_words_per_phrase]
      .zip([@min_word_length, @max_words_per_phrase])
      .to_h
  end
end
|
|
448
|
-
|
|
449
|
-
# Keyword extraction configuration for document analysis
|
|
450
|
-
class Keywords
  attr_reader :algorithm, :max_keywords, :min_score, :ngram_range,
              :language, :yake_params, :rake_params

  # Every option is optional; nil options stay nil, the rest are coerced
  # (+to_s+, +to_i+, +to_f+, element-wise +to_i+) or wrapped in the matching
  # params class.
  #
  # @param algorithm [#to_s, nil]
  # @param max_keywords [#to_i, nil]
  # @param min_score [#to_f, nil]
  # @param ngram_range [#map, nil] each element is converted with +to_i+
  # @param language [#to_s, nil]
  # @param yake_params [KeywordYakeParams, Hash, nil]
  # @param rake_params [KeywordRakeParams, Hash, nil]
  # @raise [ArgumentError] if a params option is neither nil, a Hash, nor
  #   an instance of its params class
  def initialize(
    algorithm: nil,
    max_keywords: nil,
    min_score: nil,
    ngram_range: nil,
    language: nil,
    yake_params: nil,
    rake_params: nil
  )
    @algorithm = unless_nil(algorithm, &:to_s)
    @max_keywords = unless_nil(max_keywords, &:to_i)
    @min_score = unless_nil(min_score, &:to_f)
    @ngram_range = unless_nil(ngram_range) { |bounds| bounds.map(&:to_i) }
    @language = unless_nil(language, &:to_s)
    @yake_params = normalize_nested(yake_params, KeywordYakeParams)
    @rake_params = normalize_nested(rake_params, KeywordRakeParams)
  end

  # @return [Hash{Symbol => Object}] only the options that are set, with
  #   nested params serialized through their own +to_h+
  def to_h
    [
      [:algorithm, @algorithm],
      [:max_keywords, @max_keywords],
      [:min_score, @min_score],
      [:ngram_range, @ngram_range],
      [:language, @language],
      [:yake_params, @yake_params&.to_h],
      [:rake_params, @rake_params&.to_h]
    ].reject { |_name, setting| setting.nil? }.to_h
  end

  private

  # Yields +value+ to the block unless it is nil; nil passes through.
  def unless_nil(value)
    value.nil? ? nil : yield(value)
  end

  # Passes nil and ready-made +klass+ instances through, builds a +klass+
  # from a Hash (keys symbolized), and rejects everything else.
  def normalize_nested(value, klass)
    case value
    when nil, klass
      value
    when Hash
      klass.new(**value.transform_keys(&:to_sym))
    else
      raise ArgumentError, "Expected #{klass}, Hash, or nil, got #{value.class}"
    end
  end
end
|
|
494
|
-
|
|
495
|
-
# Page tracking configuration for multi-page documents
|
|
496
|
-
#
|
|
497
|
-
# @example Enable page extraction
|
|
498
|
-
# pages = PageConfig.new(extract_pages: true)
|
|
499
|
-
#
|
|
500
|
-
# @example Enable page markers in content
|
|
501
|
-
# pages = PageConfig.new(insert_page_markers: true, marker_format: "--- PAGE {page_num} ---")
|
|
502
|
-
#
|
|
503
|
-
class PageConfig
  attr_reader :extract_pages, :insert_page_markers, :marker_format

  # Both flags are collapsed to true/false; the marker format is stored
  # after +to_s+.
  #
  # @param extract_pages [Object] truthy/falsy flag
  # @param insert_page_markers [Object] truthy/falsy flag
  # @param marker_format [#to_s] marker template
  def initialize(
    extract_pages: false,
    insert_page_markers: false,
    marker_format: "\n\n<!-- PAGE {page_num} -->\n\n"
  )
    @extract_pages = as_flag(extract_pages)
    @insert_page_markers = as_flag(insert_page_markers)
    @marker_format = marker_format.to_s
  end

  # @return [Hash{Symbol => Object}] all three settings keyed by option name
  def to_h
    %i[extract_pages insert_page_markers marker_format]
      .zip([@extract_pages, @insert_page_markers, @marker_format])
      .to_h
  end

  private

  # Collapses any value to a strict true/false.
  def as_flag(value)
    value ? true : false
  end
end
|
|
524
|
-
|
|
525
|
-
# Post-processor configuration
|
|
526
|
-
#
|
|
527
|
-
# @example Enable all post-processors
|
|
528
|
-
# postprocessor = PostProcessor.new(enabled: true)
|
|
529
|
-
#
|
|
530
|
-
# @example Enable specific processors
|
|
531
|
-
# postprocessor = PostProcessor.new(
|
|
532
|
-
# enabled: true,
|
|
533
|
-
# enabled_processors: ["quality", "formatting"]
|
|
534
|
-
# )
|
|
535
|
-
#
|
|
536
|
-
# @example Disable specific processors
|
|
537
|
-
# postprocessor = PostProcessor.new(
|
|
538
|
-
# enabled: true,
|
|
539
|
-
# disabled_processors: ["token_reduction"]
|
|
540
|
-
# )
|
|
541
|
-
#
|
|
542
|
-
class PostProcessor
  attr_reader :enabled, :enabled_processors, :disabled_processors

  # Builds a post-processor configuration.
  #
  # @param enabled [Object] truthy/falsy flag, stored as true/false
  # @param enabled_processors [#map, String, Symbol, nil] processor names;
  #   a single name is treated as a one-element list
  # @param disabled_processors [#map, String, Symbol, nil] processor names;
  #   a single name is treated as a one-element list
  def initialize(
    enabled: true,
    enabled_processors: nil,
    disabled_processors: nil
  )
    @enabled = enabled ? true : false
    @enabled_processors = normalize_names(enabled_processors)
    @disabled_processors = normalize_names(disabled_processors)
  end

  # @return [Hash{Symbol => Object}] the flag plus whichever name lists are set
  def to_h
    {
      enabled: @enabled,
      enabled_processors: @enabled_processors,
      disabled_processors: @disabled_processors
    }.compact
  end

  private

  # Converts a list of names to Strings; nil stays nil.
  def normalize_names(names)
    return nil if names.nil?

    # A bare String or Symbol has no #map, so wrap it to let a single name
    # behave like a one-element list instead of raising NoMethodError.
    names = [names] if names.is_a?(String) || names.is_a?(Symbol)
    names.map(&:to_s)
  end
end
|
|
563
|
-
|
|
564
|
-
# Main extraction configuration
|
|
565
|
-
#
|
|
566
|
-
# @example Basic usage
|
|
567
|
-
# config = Extraction.new(use_cache: true, force_ocr: true)
|
|
568
|
-
#
|
|
569
|
-
# @example With OCR
|
|
570
|
-
# ocr = Config::OCR.new(backend: "tesseract", language: "eng")
|
|
571
|
-
# config = Extraction.new(ocr: ocr)
|
|
572
|
-
#
|
|
573
|
-
# @example With image extraction
|
|
574
|
-
# image = Config::ImageExtraction.new(extract_images: true, target_dpi: 600)
|
|
575
|
-
# config = Extraction.new(image_extraction: image)
|
|
576
|
-
#
|
|
577
|
-
# @example With preprocessing
|
|
578
|
-
# preprocessing = Config::ImagePreprocessing.new(
|
|
579
|
-
# binarization_method: "sauvola",
|
|
580
|
-
# denoise: true
|
|
581
|
-
# )
|
|
582
|
-
# config = Extraction.new(image_preprocessing: preprocessing)
|
|
583
|
-
#
|
|
584
|
-
# @example With post-processing
|
|
585
|
-
# postprocessor = Config::PostProcessor.new(
|
|
586
|
-
# enabled: true,
|
|
587
|
-
# enabled_processors: ["quality"]
|
|
588
|
-
# )
|
|
589
|
-
# config = Extraction.new(postprocessor: postprocessor)
|
|
590
|
-
#
|
|
591
|
-
# @example With all options
|
|
592
|
-
# config = Extraction.new(
|
|
593
|
-
# use_cache: true,
|
|
594
|
-
# enable_quality_processing: true,
|
|
595
|
-
# force_ocr: false,
|
|
596
|
-
# ocr: Config::OCR.new(language: "deu"),
|
|
597
|
-
# chunking: Config::Chunking.new(max_chars: 500),
|
|
598
|
-
# language_detection: Config::LanguageDetection.new(enabled: true),
|
|
599
|
-
# pdf_options: Config::PDF.new(extract_images: true, passwords: ["secret"]),
|
|
600
|
-
# image_extraction: Config::ImageExtraction.new(target_dpi: 600),
|
|
601
|
-
# image_preprocessing: Config::ImagePreprocessing.new(denoise: true),
|
|
602
|
-
# postprocessor: Config::PostProcessor.new(enabled: true)
|
|
603
|
-
# )
|
|
604
|
-
#
|
|
605
|
-
class Extraction
  attr_reader :use_cache, :enable_quality_processing, :force_ocr,
              :ocr, :chunking, :language_detection, :pdf_options,
              :image_extraction, :image_preprocessing, :postprocessor,
              :token_reduction, :keywords, :html_options, :pages,
              :max_concurrent_extractions

  # Builds a configuration from a file.
  #
  # Parsing is delegated to the native +Kreuzberg._config_from_file_native+;
  # the top-level keys of the Hash it returns are symbolized and passed to
  # {#initialize} as keyword arguments.
  #
  # @param path [String] path to the configuration file
  # @return [Kreuzberg::Config::Extraction] loaded configuration object
  #
  # @example
  #   config = Kreuzberg::Config::Extraction.from_file("config.toml")
  #
  def self.from_file(path)
    raw = Kreuzberg._config_from_file_native(path)
    new(**raw.transform_keys(&:to_sym))
  end

  # Builds a configuration from whatever the native
  # +Kreuzberg._config_discover_native+ finds.
  #
  # @return [Kreuzberg::Config::Extraction, nil] loaded configuration, or nil
  #   when the native lookup returns nil
  #
  # @example
  #   config = Kreuzberg::Config::Extraction.discover
  #   if config
  #     # Use discovered config
  #   end
  #
  def self.discover
    raw = Kreuzberg._config_discover_native
    raw.nil? ? nil : new(**raw.transform_keys(&:to_sym))
  end

  # The three top-level flags are collapsed to true/false. Each section
  # accepts nil, a Hash (keys symbolized, then passed to the section class),
  # or a ready-made instance of the section class.
  #
  # @raise [ArgumentError] if a section is given any other kind of value
  def initialize(
    use_cache: true,
    enable_quality_processing: false,
    force_ocr: false,
    ocr: nil,
    chunking: nil,
    language_detection: nil,
    pdf_options: nil,
    image_extraction: nil,
    image_preprocessing: nil,
    postprocessor: nil,
    token_reduction: nil,
    keywords: nil,
    html_options: nil,
    pages: nil,
    max_concurrent_extractions: nil
  )
    @use_cache = as_flag(use_cache)
    @enable_quality_processing = as_flag(enable_quality_processing)
    @force_ocr = as_flag(force_ocr)
    @ocr = normalize_config(ocr, OCR)
    @chunking = normalize_config(chunking, Chunking)
    @language_detection = normalize_config(language_detection, LanguageDetection)
    @pdf_options = normalize_config(pdf_options, PDF)
    @image_extraction = normalize_config(image_extraction, ImageExtraction)
    @image_preprocessing = normalize_config(image_preprocessing, ImagePreprocessing)
    @postprocessor = normalize_config(postprocessor, PostProcessor)
    @token_reduction = normalize_config(token_reduction, TokenReduction)
    @keywords = normalize_config(keywords, Keywords)
    @html_options = normalize_config(html_options, HtmlOptions)
    @pages = normalize_config(pages, PageConfig)
    @max_concurrent_extractions = max_concurrent_extractions.nil? ? nil : max_concurrent_extractions.to_i
  end

  # @return [Hash{Symbol => Object}] the three flags, every configured
  #   section serialized through its own +to_h+, and the concurrency limit
  #   when one is set
  def to_h
    flags = {
      use_cache: @use_cache,
      enable_quality_processing: @enable_quality_processing,
      force_ocr: @force_ocr
    }
    sections = {
      ocr: @ocr,
      chunking: @chunking,
      language_detection: @language_detection,
      pdf_options: @pdf_options,
      image_extraction: @image_extraction,
      image_preprocessing: @image_preprocessing,
      postprocessor: @postprocessor,
      token_reduction: @token_reduction,
      keywords: @keywords,
      html_options: @html_options,
      pages: @pages
    }.compact.transform_values(&:to_h)

    result = flags.merge(sections)
    result[:max_concurrent_extractions] = @max_concurrent_extractions unless @max_concurrent_extractions.nil?
    result
  end

  private

  # Collapses any value to a strict true/false.
  def as_flag(value)
    value ? true : false
  end

  # Passes nil and ready-made +klass+ instances through, builds a +klass+
  # from a Hash (keys symbolized for keyword arguments), and rejects
  # everything else.
  def normalize_config(value, klass)
    case value
    when nil, klass
      value
    when Hash
      klass.new(**value.transform_keys(&:to_sym))
    else
      raise ArgumentError, "Expected #{klass}, Hash, or nil, got #{value.class}"
    end
  end
end
|
|
720
|
-
|
|
721
|
-
# Backwards compatibility aliases
|
|
722
|
-
Ocr = OCR
|
|
723
|
-
end
|
|
724
|
-
end
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Kreuzberg
|
|
4
|
+
module Config
|
|
5
|
+
# OCR configuration
|
|
6
|
+
#
|
|
7
|
+
# @example
|
|
8
|
+
# ocr = OCR.new(backend: "tesseract", language: "eng")
|
|
9
|
+
#
|
|
10
|
+
class OCR
  attr_reader :backend, :language, :tesseract_config

  # +backend+ and +language+ are stored after +to_s+; +tesseract_config+ may
  # be nil, a Hash (keys symbolized, then passed to Tesseract), or a
  # ready-made Tesseract.
  #
  # @param backend [#to_s]
  # @param language [#to_s]
  # @param tesseract_config [Tesseract, Hash, nil]
  # @raise [ArgumentError] if +tesseract_config+ is any other kind of value
  def initialize(
    backend: 'tesseract',
    language: 'eng',
    tesseract_config: nil
  )
    @backend = backend.to_s
    @language = language.to_s
    @tesseract_config = normalize_tesseract_config(tesseract_config)
  end

  # @return [Hash{Symbol => Object}] backend and language, plus the
  #   serialized Tesseract config when one is set
  def to_h
    result = { backend: @backend, language: @language }
    nested = @tesseract_config&.to_h
    result[:tesseract_config] = nested unless nested.nil?
    result
  end

  private

  # Passes nil and Tesseract instances through, builds a Tesseract from a
  # Hash, and rejects everything else.
  def normalize_tesseract_config(value)
    case value
    when nil, Tesseract
      value
    when Hash
      Tesseract.new(**value.transform_keys(&:to_sym))
    else
      raise ArgumentError, "Expected #{Tesseract}, Hash, or nil, got #{value.class}"
    end
  end
end
|
|
41
|
+
|
|
42
|
+
# Tesseract OCR engine configuration
|
|
43
|
+
class Tesseract
  attr_reader :options

  # Accepts arbitrary engine options. Option names are symbolized, and a
  # Hash given under +:preprocessing+ is wrapped in an ImagePreprocessing.
  #
  # @param options [Hash] engine options keyed by String or Symbol
  # @raise [ArgumentError] if +:preprocessing+ is neither nil, a Hash, nor
  #   an ImagePreprocessing
  def initialize(**options)
    @options = options.transform_keys(&:to_sym)
    normalize_nested_preprocessing!
  end

  # Returns a new Hash and leaves +options+ untouched. A nested
  # ImagePreprocessing is serialized through its own +to_h+ so the result
  # contains plain values only, matching how HtmlOptions#to_h serializes its
  # nested config.
  #
  # @return [Hash{Symbol => Object}]
  def to_h
    @options.transform_values do |value|
      value.is_a?(ImagePreprocessing) ? value.to_h : value
    end
  end

  private

  # Replaces a Hash stored under +:preprocessing+ with an ImagePreprocessing
  # built from it (keys symbolized for keyword arguments).
  def normalize_nested_preprocessing!
    preprocessing = @options[:preprocessing]
    return if preprocessing.nil? || preprocessing.is_a?(ImagePreprocessing)

    unless preprocessing.is_a?(Hash)
      raise ArgumentError, "preprocessing must be #{ImagePreprocessing} or Hash"
    end

    @options[:preprocessing] = ImagePreprocessing.new(**preprocessing.transform_keys(&:to_sym))
  end
end
|
|
67
|
+
|
|
68
|
+
# Chunking configuration
|
|
69
|
+
#
|
|
70
|
+
# @example
|
|
71
|
+
# chunking = Chunking.new(max_chars: 1000, max_overlap: 200)
|
|
72
|
+
#
|
|
73
|
+
class Chunking
  attr_reader :max_chars, :max_overlap, :preset, :embedding, :enabled

  # +chunk_size+ / +chunk_overlap+ take precedence over +max_chars+ /
  # +max_overlap+; when neither of a pair is given, 1000 and 200 are used.
  #
  # @param max_chars [#to_i, nil]
  # @param max_overlap [#to_i, nil]
  # @param preset [#to_s, nil]
  # @param embedding [Embedding, Hash, nil]
  # @param chunk_size [#to_i, nil] alternative name for +max_chars+
  # @param chunk_overlap [#to_i, nil] alternative name for +max_overlap+
  # @param enabled [Object, nil] collapsed to true/false unless nil
  # @raise [ArgumentError] if +embedding+ is any other kind of value
  def initialize(
    max_chars: nil,
    max_overlap: nil,
    preset: nil,
    embedding: nil,
    chunk_size: nil,
    chunk_overlap: nil,
    enabled: true
  )
    @max_chars = [chunk_size, max_chars, 1000].find { |candidate| candidate }.to_i
    @max_overlap = [chunk_overlap, max_overlap, 200].find { |candidate| candidate }.to_i
    @preset = preset.nil? ? nil : preset.to_s
    @embedding = normalize_embedding(embedding)
    @enabled = tri_state(enabled)
  end

  # @return [Hash{Symbol => Object}] the size limits plus whichever of
  #   preset, embedding and enabled are set
  def to_h
    # @type var config: Hash[Symbol, untyped]
    config = { max_chars: @max_chars, max_overlap: @max_overlap }
    config[:preset] = @preset unless @preset.nil?
    serialized = @embedding&.to_h
    config[:embedding] = serialized unless serialized.nil?
    config[:enabled] = @enabled unless @enabled.nil?
    config
  end

  private

  # Passes nil and Embedding instances through, builds an Embedding from a
  # Hash, and rejects everything else.
  def normalize_embedding(value)
    case value
    when nil, Embedding
      value
    when Hash
      Embedding.new(**value.transform_keys(&:to_sym))
    else
      raise ArgumentError, "Expected #{Embedding}, Hash, or nil, got #{value.class}"
    end
  end

  # Keeps nil as nil; maps false to false and anything else to true.
  def tri_state(value)
    case value
    when nil then nil
    when false then false
    else true
    end
  end
end
|
|
123
|
+
|
|
124
|
+
# Embedding model configuration for document chunking
|
|
125
|
+
class Embedding
  attr_reader :model, :normalize, :batch_size, :show_download_progress, :cache_dir

  # @param model [#to_h] model description; converted with +to_h+ when
  #   possible, keys symbolized
  # @param normalize [Object, nil] collapsed to true/false unless nil
  # @param batch_size [#to_i, nil]
  # @param show_download_progress [Object, nil] collapsed to true/false unless nil
  # @param cache_dir [#to_s, nil]
  # @raise [ArgumentError] if +model+ does not end up as a Hash
  def initialize(
    model: { type: :preset, name: 'balanced' },
    normalize: true,
    batch_size: 32,
    show_download_progress: false,
    cache_dir: nil
  )
    @model = normalize_model(model)
    @normalize = tri_state(normalize)
    @batch_size = batch_size.nil? ? nil : batch_size.to_i
    @show_download_progress = tri_state(show_download_progress)
    @cache_dir = cache_dir.nil? ? nil : cache_dir.to_s
  end

  # @return [Hash{Symbol => Object}] only the options whose value is not nil
  def to_h
    [
      [:model, @model],
      [:normalize, @normalize],
      [:batch_size, @batch_size],
      [:show_download_progress, @show_download_progress],
      [:cache_dir, @cache_dir]
    ].reject { |_name, setting| setting.nil? }.to_h
  end

  private

  # Converts the model description to a Hash with Symbol keys.
  def normalize_model(model)
    candidate = model.respond_to?(:to_h) ? model.to_h : model
    return candidate.transform_keys(&:to_sym) if candidate.is_a?(Hash)

    raise ArgumentError, 'model must be a Hash describing the embedding model'
  end

  # Keeps nil as nil; maps false to false and anything else to true.
  def tri_state(value)
    case value
    when nil then nil
    when false then false
    else true
    end
  end
end
|
|
171
|
+
|
|
172
|
+
# Language detection configuration
|
|
173
|
+
#
|
|
174
|
+
# @example
|
|
175
|
+
# lang = LanguageDetection.new(enabled: true, min_confidence: 0.8)
|
|
176
|
+
#
|
|
177
|
+
class LanguageDetection
  attr_reader :enabled, :min_confidence, :detect_multiple

  # Both flags are collapsed to true/false; the threshold is stored after
  # +to_f+.
  #
  # @param enabled [Object] truthy/falsy flag
  # @param min_confidence [#to_f]
  # @param detect_multiple [Object] truthy/falsy flag
  def initialize(enabled: false, min_confidence: 0.5, detect_multiple: false)
    @enabled = as_flag(enabled)
    @min_confidence = min_confidence.to_f
    @detect_multiple = as_flag(detect_multiple)
  end

  # @return [Hash{Symbol => Object}] all three settings keyed by option name
  def to_h
    %i[enabled min_confidence detect_multiple]
      .zip([@enabled, @min_confidence, @detect_multiple])
      .to_h
  end

  private

  # Collapses any value to a strict true/false.
  def as_flag(value)
    value ? true : false
  end
end
|
|
194
|
+
|
|
195
|
+
# PDF-specific options
|
|
196
|
+
#
|
|
197
|
+
# @example
|
|
198
|
+
# pdf = PDF.new(extract_images: true, passwords: ["secret", "backup"])
|
|
199
|
+
#
|
|
200
|
+
class PDF
  attr_reader :extract_images, :passwords, :extract_metadata

  # +passwords+ may be an Array (each element converted with +to_s+), a
  # single value (wrapped in a one-element Array after +to_s+), or nil/false
  # for no passwords.
  #
  # @param extract_images [Object] truthy/falsy flag
  # @param passwords [Array<#to_s>, #to_s, nil]
  # @param extract_metadata [Object] truthy/falsy flag
  def initialize(
    extract_images: false,
    passwords: nil,
    extract_metadata: true
  )
    @extract_images = as_flag(extract_images)
    @passwords =
      case passwords
      when Array then passwords.map(&:to_s)
      when nil, false then nil
      else [passwords.to_s]
      end
    @extract_metadata = as_flag(extract_metadata)
  end

  # @return [Hash{Symbol => Object}] both flags, plus the passwords when set
  def to_h
    result = { extract_images: @extract_images }
    result[:passwords] = @passwords unless @passwords.nil?
    result[:extract_metadata] = @extract_metadata
    result
  end

  private

  # Collapses any value to a strict true/false.
  def as_flag(value)
    value ? true : false
  end
end
|
|
225
|
+
|
|
226
|
+
# Image extraction configuration
|
|
227
|
+
#
|
|
228
|
+
# @example
|
|
229
|
+
# image = ImageExtraction.new(extract_images: true, target_dpi: 300)
|
|
230
|
+
#
|
|
231
|
+
# @example With auto-adjust DPI
|
|
232
|
+
# image = ImageExtraction.new(
|
|
233
|
+
# extract_images: true,
|
|
234
|
+
# auto_adjust_dpi: true,
|
|
235
|
+
# min_dpi: 150,
|
|
236
|
+
# max_dpi: 600
|
|
237
|
+
# )
|
|
238
|
+
#
|
|
239
|
+
class ImageExtraction
  attr_reader :extract_images, :target_dpi, :max_image_dimension,
              :auto_adjust_dpi, :min_dpi, :max_dpi

  # Flags are collapsed to true/false; numeric options are stored after
  # +to_i+.
  #
  # @param extract_images [Object] truthy/falsy flag
  # @param target_dpi [#to_i]
  # @param max_image_dimension [#to_i]
  # @param auto_adjust_dpi [Object] truthy/falsy flag
  # @param min_dpi [#to_i]
  # @param max_dpi [#to_i]
  def initialize(
    extract_images: true,
    target_dpi: 300,
    max_image_dimension: 2000,
    auto_adjust_dpi: true,
    min_dpi: 150,
    max_dpi: 600
  )
    @extract_images = as_flag(extract_images)
    @target_dpi = target_dpi.to_i
    @max_image_dimension = max_image_dimension.to_i
    @auto_adjust_dpi = as_flag(auto_adjust_dpi)
    @min_dpi = min_dpi.to_i
    @max_dpi = max_dpi.to_i
  end

  # @return [Hash{Symbol => Object}] all six settings keyed by option name
  def to_h
    names = %i[extract_images target_dpi max_image_dimension auto_adjust_dpi min_dpi max_dpi]
    values = [@extract_images, @target_dpi, @max_image_dimension, @auto_adjust_dpi, @min_dpi, @max_dpi]
    names.zip(values).to_h
  end

  private

  # Collapses any value to a strict true/false.
  def as_flag(value)
    value ? true : false
  end
end
|
|
270
|
+
|
|
271
|
+
# Image preprocessing configuration for OCR
|
|
272
|
+
#
|
|
273
|
+
# @example Basic preprocessing
|
|
274
|
+
# preprocessing = ImagePreprocessing.new(
|
|
275
|
+
# binarization_method: "otsu",
|
|
276
|
+
# denoise: true
|
|
277
|
+
# )
|
|
278
|
+
#
|
|
279
|
+
# @example Advanced preprocessing
|
|
280
|
+
# preprocessing = ImagePreprocessing.new(
|
|
281
|
+
# target_dpi: 600,
|
|
282
|
+
# auto_rotate: true,
|
|
283
|
+
# deskew: true,
|
|
284
|
+
# denoise: true,
|
|
285
|
+
# contrast_enhance: true,
|
|
286
|
+
# binarization_method: "sauvola",
|
|
287
|
+
# invert_colors: false
|
|
288
|
+
# )
|
|
289
|
+
#
|
|
290
|
+
class ImagePreprocessing
  # Binarization algorithm names accepted by #initialize. Defined once at
  # class level so the list is not rebuilt on every instantiation.
  VALID_BINARIZATION_METHODS = %w[otsu sauvola adaptive].freeze

  attr_reader :target_dpi, :auto_rotate, :deskew, :denoise,
              :contrast_enhance, :binarization_method, :invert_colors

  # Builds an image preprocessing configuration.
  #
  # +target_dpi+ is coerced with +to_i+, +binarization_method+ with +to_s+,
  # and every other option is collapsed to a strict +true+/+false+.
  #
  # @param target_dpi [#to_i]
  # @param auto_rotate [Object] truthy/falsy flag
  # @param deskew [Object] truthy/falsy flag
  # @param denoise [Object] truthy/falsy flag
  # @param contrast_enhance [Object] truthy/falsy flag
  # @param binarization_method [#to_s] one of VALID_BINARIZATION_METHODS
  # @param invert_colors [Object] truthy/falsy flag
  # @raise [ArgumentError] if +binarization_method+ is not a supported name
  def initialize(
    target_dpi: 300,
    auto_rotate: true,
    deskew: true,
    denoise: false,
    contrast_enhance: true,
    binarization_method: 'otsu',
    invert_colors: false
  )
    @target_dpi = target_dpi.to_i
    @auto_rotate = auto_rotate ? true : false
    @deskew = deskew ? true : false
    @denoise = denoise ? true : false
    @contrast_enhance = contrast_enhance ? true : false
    @binarization_method = binarization_method.to_s
    @invert_colors = invert_colors ? true : false

    validate_binarization_method!
  end

  # @return [Hash{Symbol => Object}] all seven settings keyed by option name
  def to_h
    {
      target_dpi: @target_dpi,
      auto_rotate: @auto_rotate,
      deskew: @deskew,
      denoise: @denoise,
      contrast_enhance: @contrast_enhance,
      binarization_method: @binarization_method,
      invert_colors: @invert_colors
    }
  end

  private

  # Rejects a binarization method that is not in VALID_BINARIZATION_METHODS.
  def validate_binarization_method!
    return if VALID_BINARIZATION_METHODS.include?(@binarization_method)

    raise ArgumentError, "binarization_method must be one of: #{VALID_BINARIZATION_METHODS.join(', ')}"
  end
end
|
|
329
|
+
|
|
330
|
+
# Token reduction configuration
|
|
331
|
+
#
|
|
332
|
+
# @example Disable token reduction
|
|
333
|
+
# token = TokenReduction.new(mode: "off")
|
|
334
|
+
#
|
|
335
|
+
# @example Light reduction
|
|
336
|
+
# token = TokenReduction.new(mode: "light", preserve_important_words: true)
|
|
337
|
+
#
|
|
338
|
+
# @example Aggressive reduction
|
|
339
|
+
# token = TokenReduction.new(mode: "aggressive", preserve_important_words: false)
|
|
340
|
+
#
|
|
341
|
+
class TokenReduction
  # Reduction levels accepted by #initialize. Defined once at class level so
  # the list is not rebuilt on every instantiation.
  VALID_MODES = %w[off light moderate aggressive maximum].freeze

  attr_reader :mode, :preserve_important_words

  # Builds a token reduction configuration.
  #
  # @param mode [#to_s] reduction level; must be one of VALID_MODES after +to_s+
  # @param preserve_important_words [Object] truthy/falsy flag, stored as true/false
  # @raise [ArgumentError] if +mode+ is not a supported level
  def initialize(mode: 'off', preserve_important_words: true)
    @mode = mode.to_s
    @preserve_important_words = preserve_important_words ? true : false
    return if VALID_MODES.include?(@mode)

    raise ArgumentError, "mode must be one of: #{VALID_MODES.join(', ')}"
  end

  # @return [Hash{Symbol => Object}] both settings keyed by option name
  def to_h
    {
      mode: @mode,
      preserve_important_words: @preserve_important_words
    }
  end
end
|
|
361
|
+
|
|
362
|
+
# HTML preprocessing configuration for content extraction
|
|
363
|
+
class HtmlPreprocessing
  attr_reader :enabled, :preset, :remove_navigation, :remove_forms

  # Stores each option; an option left as nil stays nil.
  #
  # @param enabled [Object, nil] collapsed to true/false unless nil
  # @param preset [#to_sym, nil] converted with +to_sym+ unless nil
  # @param remove_navigation [Object, nil] collapsed to true/false unless nil
  # @param remove_forms [Object, nil] collapsed to true/false unless nil
  def initialize(enabled: nil, preset: nil, remove_navigation: nil, remove_forms: nil)
    @enabled = tri_state(enabled)
    @preset = preset.nil? ? nil : preset.to_sym
    @remove_navigation = tri_state(remove_navigation)
    @remove_forms = tri_state(remove_forms)
  end

  # @return [Hash{Symbol => Object}] only the options whose value is not nil
  def to_h
    [
      [:enabled, @enabled],
      [:preset, @preset],
      [:remove_navigation, @remove_navigation],
      [:remove_forms, @remove_forms]
    ].reject { |_name, setting| setting.nil? }.to_h
  end

  private

  # Keeps nil as nil; maps false to false and anything else to true.
  def tri_state(value)
    case value
    when nil then nil
    when false then false
    else true
    end
  end
end
|
|
390
|
+
|
|
391
|
+
# HTML rendering options for document conversion
|
|
392
|
+
class HtmlOptions
  # Option names whose values are normalized to Symbols.
  SYMBOL_OPTION_KEYS = %i[
    heading_style
    code_block_style
    highlight_style
    list_indent_type
    newline_style
    whitespace_mode
  ].freeze

  attr_reader :options

  # Accepts arbitrary rendering options.
  #
  # Option names are symbolized, the values of SYMBOL_OPTION_KEYS are
  # converted with +to_sym+ (nil stays nil), and a Hash given under
  # +:preprocessing+ is wrapped in an HtmlPreprocessing.
  #
  # @param options [Hash] rendering options keyed by String or Symbol
  def initialize(**options)
    normalized = options.transform_keys(&:to_sym)
    SYMBOL_OPTION_KEYS.each do |key|
      normalized[key] = normalized[key]&.to_sym if normalized.key?(key)
    end

    preprocessing = normalized[:preprocessing]
    if preprocessing.is_a?(Hash)
      # Symbolize the nested keys first: a Hash with String keys cannot be
      # splatted into HtmlPreprocessing's keyword parameters.
      normalized[:preprocessing] = HtmlPreprocessing.new(**preprocessing.transform_keys(&:to_sym))
    end
    @options = normalized
  end

  # @return [Hash{Symbol => Object}] a new Hash in which nested config
  #   objects are replaced by their own +to_h+ result
  def to_h
    @options.transform_values { |value| serialize_value(value) }
  end

  private

  # nil and Array both respond to +to_h+, but converting them would turn nil
  # into {} and raise TypeError for an Array of non-pairs, so they are
  # passed through unchanged.
  def serialize_value(value)
    return value if value.nil? || value.is_a?(Array)

    value.respond_to?(:to_h) ? value.to_h : value
  end
end
|
|
418
|
+
|
|
419
|
+
# YAKE keyword extraction parameters
|
|
420
|
+
class KeywordYakeParams
  attr_reader :window_size

  # @param window_size [#to_i] stored after +to_i+ coercion
  def initialize(window_size: 2)
    @window_size = window_size.to_i
  end

  # @return [Hash{Symbol => Integer}] single-entry Hash holding the window size
  def to_h
    Hash[:window_size, @window_size]
  end
end
|
|
431
|
+
|
|
432
|
+
# RAKE keyword extraction parameters
|
|
433
|
+
class KeywordRakeParams
  attr_reader :min_word_length, :max_words_per_phrase

  # Both limits are stored after +to_i+ coercion.
  #
  # @param min_word_length [#to_i]
  # @param max_words_per_phrase [#to_i]
  def initialize(min_word_length: 1, max_words_per_phrase: 3)
    @min_word_length, @max_words_per_phrase =
      [min_word_length, max_words_per_phrase].map(&:to_i)
  end

  # @return [Hash{Symbol => Integer}] both limits keyed by option name
  def to_h
    %i[min_word_length max_words_per_phrase]
      .zip([@min_word_length, @max_words_per_phrase])
      .to_h
  end
end
|
|
448
|
+
|
|
449
|
+
# Keyword extraction configuration for document analysis
|
|
450
|
+
class Keywords
  attr_reader :algorithm, :max_keywords, :min_score, :ngram_range,
              :language, :yake_params, :rake_params

  # Every option is optional; nil options stay nil, the rest are coerced
  # (+to_s+, +to_i+, +to_f+, element-wise +to_i+) or wrapped in the matching
  # params class.
  #
  # @param algorithm [#to_s, nil]
  # @param max_keywords [#to_i, nil]
  # @param min_score [#to_f, nil]
  # @param ngram_range [#map, nil] each element is converted with +to_i+
  # @param language [#to_s, nil]
  # @param yake_params [KeywordYakeParams, Hash, nil]
  # @param rake_params [KeywordRakeParams, Hash, nil]
  # @raise [ArgumentError] if a params option is neither nil, a Hash, nor
  #   an instance of its params class
  def initialize(
    algorithm: nil,
    max_keywords: nil,
    min_score: nil,
    ngram_range: nil,
    language: nil,
    yake_params: nil,
    rake_params: nil
  )
    @algorithm = unless_nil(algorithm, &:to_s)
    @max_keywords = unless_nil(max_keywords, &:to_i)
    @min_score = unless_nil(min_score, &:to_f)
    @ngram_range = unless_nil(ngram_range) { |bounds| bounds.map(&:to_i) }
    @language = unless_nil(language, &:to_s)
    @yake_params = normalize_nested(yake_params, KeywordYakeParams)
    @rake_params = normalize_nested(rake_params, KeywordRakeParams)
  end

  # @return [Hash{Symbol => Object}] only the options that are set, with
  #   nested params serialized through their own +to_h+
  def to_h
    [
      [:algorithm, @algorithm],
      [:max_keywords, @max_keywords],
      [:min_score, @min_score],
      [:ngram_range, @ngram_range],
      [:language, @language],
      [:yake_params, @yake_params&.to_h],
      [:rake_params, @rake_params&.to_h]
    ].reject { |_name, setting| setting.nil? }.to_h
  end

  private

  # Yields +value+ to the block unless it is nil; nil passes through.
  def unless_nil(value)
    value.nil? ? nil : yield(value)
  end

  # Passes nil and ready-made +klass+ instances through, builds a +klass+
  # from a Hash (keys symbolized), and rejects everything else.
  def normalize_nested(value, klass)
    case value
    when nil, klass
      value
    when Hash
      klass.new(**value.transform_keys(&:to_sym))
    else
      raise ArgumentError, "Expected #{klass}, Hash, or nil, got #{value.class}"
    end
  end
end
|
|
494
|
+
|
|
495
|
+
# Page tracking configuration for multi-page documents
|
|
496
|
+
#
|
|
497
|
+
# @example Enable page extraction
|
|
498
|
+
# pages = PageConfig.new(extract_pages: true)
|
|
499
|
+
#
|
|
500
|
+
# @example Enable page markers in content
|
|
501
|
+
# pages = PageConfig.new(insert_page_markers: true, marker_format: "--- PAGE {page_num} ---")
|
|
502
|
+
#
|
|
503
|
+
class PageConfig
  attr_reader :extract_pages, :insert_page_markers, :marker_format

  # Both flags are collapsed to true/false; the marker format is stored
  # after +to_s+.
  #
  # @param extract_pages [Object] truthy/falsy flag
  # @param insert_page_markers [Object] truthy/falsy flag
  # @param marker_format [#to_s] marker template
  def initialize(
    extract_pages: false,
    insert_page_markers: false,
    marker_format: "\n\n<!-- PAGE {page_num} -->\n\n"
  )
    @extract_pages = as_flag(extract_pages)
    @insert_page_markers = as_flag(insert_page_markers)
    @marker_format = marker_format.to_s
  end

  # @return [Hash{Symbol => Object}] all three settings keyed by option name
  def to_h
    %i[extract_pages insert_page_markers marker_format]
      .zip([@extract_pages, @insert_page_markers, @marker_format])
      .to_h
  end

  private

  # Collapses any value to a strict true/false.
  def as_flag(value)
    value ? true : false
  end
end
|
|
524
|
+
|
|
525
|
+
# Post-processor configuration
#
# @example Enable all post-processors
#   postprocessor = PostProcessor.new(enabled: true)
#
# @example Enable specific processors
#   postprocessor = PostProcessor.new(
#     enabled: true,
#     enabled_processors: ["quality", "formatting"]
#   )
#
# @example Disable specific processors
#   postprocessor = PostProcessor.new(
#     enabled: true,
#     disabled_processors: ["token_reduction"]
#   )
#
class PostProcessor
  attr_reader :enabled, :enabled_processors, :disabled_processors

  # @param enabled [Boolean] coerced to strict true/false
  # @param enabled_processors [Array<#to_s>, #to_s, nil] processor names; a
  #   single name is accepted and wrapped in an Array
  # @param disabled_processors [Array<#to_s>, #to_s, nil] processor names; a
  #   single name is accepted and wrapped in an Array
  def initialize(
    enabled: true,
    enabled_processors: nil,
    disabled_processors: nil
  )
    @enabled = enabled ? true : false
    @enabled_processors = normalize_names(enabled_processors)
    @disabled_processors = normalize_names(disabled_processors)
  end

  # @return [Hash{Symbol => Object}] settings with nil name lists omitted
  def to_h
    {
      enabled: @enabled,
      enabled_processors: @enabled_processors,
      disabled_processors: @disabled_processors
    }.compact
  end

  private

  # Convert a list of names (or one bare name) to an Array of Strings,
  # preserving nil so that unset lists stay out of +to_h+.
  def normalize_names(names)
    return nil if names.nil?

    Array(names).map(&:to_s)
  end
end
|
|
563
|
+
|
|
564
|
+
# Main extraction configuration
#
# @example Basic usage
#   config = Extraction.new(use_cache: true, force_ocr: true)
#
# @example With OCR
#   ocr = Config::OCR.new(backend: "tesseract", language: "eng")
#   config = Extraction.new(ocr: ocr)
#
# @example With image extraction
#   image = Config::ImageExtraction.new(extract_images: true, target_dpi: 600)
#   config = Extraction.new(image_extraction: image)
#
# @example With preprocessing
#   preprocessing = Config::ImagePreprocessing.new(
#     binarization_method: "sauvola",
#     denoise: true
#   )
#   config = Extraction.new(image_preprocessing: preprocessing)
#
# @example With post-processing
#   postprocessor = Config::PostProcessor.new(
#     enabled: true,
#     enabled_processors: ["quality"]
#   )
#   config = Extraction.new(postprocessor: postprocessor)
#
# @example With all options
#   config = Extraction.new(
#     use_cache: true,
#     enable_quality_processing: true,
#     force_ocr: false,
#     ocr: Config::OCR.new(language: "deu"),
#     chunking: Config::Chunking.new(max_chars: 500),
#     language_detection: Config::LanguageDetection.new(enabled: true),
#     pdf_options: Config::PDF.new(extract_images: true, passwords: ["secret"]),
#     image_extraction: Config::ImageExtraction.new(target_dpi: 600),
#     image_preprocessing: Config::ImagePreprocessing.new(denoise: true),
#     postprocessor: Config::PostProcessor.new(enabled: true)
#   )
#
class Extraction
  attr_reader :use_cache, :enable_quality_processing, :force_ocr,
              :ocr, :chunking, :language_detection, :pdf_options,
              :image_extraction, :image_preprocessing, :postprocessor,
              :token_reduction, :keywords, :html_options, :pages,
              :max_concurrent_extractions

  # Load configuration from a file.
  #
  # Detects the file format from the extension (.toml, .yaml, .json)
  # and loads the configuration accordingly.
  #
  # @param path [String] Path to the configuration file
  # @return [Kreuzberg::Config::Extraction] Loaded configuration object
  #
  # @example Load from TOML
  #   config = Kreuzberg::Config::Extraction.from_file("config.toml")
  #
  # @example Load from YAML
  #   config = Kreuzberg::Config::Extraction.from_file("config.yaml")
  #
  def self.from_file(path)
    hash = Kreuzberg._config_from_file_native(path)
    # Convert string keys to symbols for keyword arguments
    new(**hash.transform_keys(&:to_sym))
  end

  # Discover configuration file in current or parent directories.
  #
  # Searches for kreuzberg.toml, kreuzberg.yaml, or kreuzberg.json in the current
  # directory and parent directories.
  #
  # @return [Kreuzberg::Config::Extraction, nil] Loaded configuration object or nil if not found
  #
  # @example
  #   config = Kreuzberg::Config::Extraction.discover
  #   if config
  #     # Use discovered config
  #   end
  #
  def self.discover
    hash = Kreuzberg._config_discover_native
    return nil if hash.nil?

    # Convert string keys to symbols for keyword arguments
    new(**hash.transform_keys(&:to_sym))
  end

  # Build an extraction configuration.
  #
  # The three boolean flags are coerced to strict true/false. Every nested
  # section accepts an instance of its config class, a Hash of that class's
  # keyword arguments (String or Symbol keys), or nil to leave it unset.
  #
  # @param use_cache [Boolean]
  # @param enable_quality_processing [Boolean]
  # @param force_ocr [Boolean]
  # @param ocr [OCR, Hash, nil]
  # @param chunking [Chunking, Hash, nil]
  # @param language_detection [LanguageDetection, Hash, nil]
  # @param pdf_options [PDF, Hash, nil]
  # @param image_extraction [ImageExtraction, Hash, nil]
  # @param image_preprocessing [ImagePreprocessing, Hash, nil]
  # @param postprocessor [PostProcessor, Hash, nil]
  # @param token_reduction [TokenReduction, Hash, nil]
  # @param keywords [Keywords, Hash, nil]
  # @param html_options [HtmlOptions, Hash, nil]
  # @param pages [PageConfig, Hash, nil]
  # @param max_concurrent_extractions [#to_i, nil] stored via +to_i+ when given
  # @raise [ArgumentError] if a nested section is neither its config class,
  #   a Hash, nor nil
  def initialize(
    use_cache: true,
    enable_quality_processing: false,
    force_ocr: false,
    ocr: nil,
    chunking: nil,
    language_detection: nil,
    pdf_options: nil,
    image_extraction: nil,
    image_preprocessing: nil,
    postprocessor: nil,
    token_reduction: nil,
    keywords: nil,
    html_options: nil,
    pages: nil,
    max_concurrent_extractions: nil
  )
    @use_cache = use_cache ? true : false
    @enable_quality_processing = enable_quality_processing ? true : false
    @force_ocr = force_ocr ? true : false
    @ocr = normalize_config(ocr, OCR)
    @chunking = normalize_config(chunking, Chunking)
    @language_detection = normalize_config(language_detection, LanguageDetection)
    @pdf_options = normalize_config(pdf_options, PDF)
    @image_extraction = normalize_config(image_extraction, ImageExtraction)
    @image_preprocessing = normalize_config(image_preprocessing, ImagePreprocessing)
    @postprocessor = normalize_config(postprocessor, PostProcessor)
    @token_reduction = normalize_config(token_reduction, TokenReduction)
    @keywords = normalize_config(keywords, Keywords)
    @html_options = normalize_config(html_options, HtmlOptions)
    @pages = normalize_config(pages, PageConfig)
    @max_concurrent_extractions = max_concurrent_extractions&.to_i
  end

  # Serialize the configuration to a plain Hash.
  #
  # Nested sections are converted through their own +to_h+; unset (nil)
  # entries are omitted from the result.
  #
  # @return [Hash{Symbol => Object}]
  # rubocop:disable Metrics/CyclomaticComplexity
  def to_h
    {
      use_cache: @use_cache,
      enable_quality_processing: @enable_quality_processing,
      force_ocr: @force_ocr,
      ocr: @ocr&.to_h,
      chunking: @chunking&.to_h,
      language_detection: @language_detection&.to_h,
      pdf_options: @pdf_options&.to_h,
      image_extraction: @image_extraction&.to_h,
      image_preprocessing: @image_preprocessing&.to_h,
      postprocessor: @postprocessor&.to_h,
      token_reduction: @token_reduction&.to_h,
      keywords: @keywords&.to_h,
      html_options: @html_options&.to_h,
      pages: @pages&.to_h,
      max_concurrent_extractions: @max_concurrent_extractions
    }.compact
  end
  # rubocop:enable Metrics/CyclomaticComplexity

  private

  # Coerce a nested section into an instance of +klass+.
  #
  # @param value [Object, Hash, nil] a +klass+ instance, a Hash of keyword
  #   arguments, or nil
  # @param klass [Class] config class to instantiate from a Hash
  # @return [Object, nil] nil, the given instance, or a new +klass+
  # @raise [ArgumentError] if +value+ is none of the accepted kinds
  def normalize_config(value, klass)
    return nil if value.nil?
    return value if value.is_a?(klass)
    # Convert string keys to symbols for keyword arguments
    return klass.new(**value.transform_keys(&:to_sym)) if value.is_a?(Hash)

    raise ArgumentError, "Expected #{klass}, Hash, or nil, got #{value.class}"
  end
end
|
|
720
|
+
|
|
721
|
+
# Backwards compatibility alias: keeps the +Ocr+ constant name pointing at +OCR+.
Ocr = OCR
|
|
723
|
+
end
|
|
724
|
+
end
|