kreuzberg 4.0.0.pre.rc.13 → 4.0.0.pre.rc.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +14 -14
- data/.rspec +3 -3
- data/.rubocop.yaml +1 -1
- data/.rubocop.yml +538 -538
- data/Gemfile +8 -8
- data/Gemfile.lock +105 -2
- data/README.md +454 -454
- data/Rakefile +33 -25
- data/Steepfile +47 -47
- data/examples/async_patterns.rb +341 -341
- data/ext/kreuzberg_rb/extconf.rb +45 -45
- data/ext/kreuzberg_rb/native/.cargo/config.toml +2 -2
- data/ext/kreuzberg_rb/native/Cargo.lock +6940 -6941
- data/ext/kreuzberg_rb/native/Cargo.toml +54 -54
- data/ext/kreuzberg_rb/native/README.md +425 -425
- data/ext/kreuzberg_rb/native/build.rs +15 -15
- data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -11
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -14
- data/ext/kreuzberg_rb/native/include/strings.h +20 -20
- data/ext/kreuzberg_rb/native/include/unistd.h +47 -47
- data/ext/kreuzberg_rb/native/src/lib.rs +3158 -3158
- data/extconf.rb +28 -28
- data/kreuzberg.gemspec +214 -214
- data/lib/kreuzberg/api_proxy.rb +142 -142
- data/lib/kreuzberg/cache_api.rb +81 -81
- data/lib/kreuzberg/cli.rb +55 -55
- data/lib/kreuzberg/cli_proxy.rb +127 -127
- data/lib/kreuzberg/config.rb +724 -724
- data/lib/kreuzberg/error_context.rb +80 -80
- data/lib/kreuzberg/errors.rb +118 -118
- data/lib/kreuzberg/extraction_api.rb +340 -340
- data/lib/kreuzberg/mcp_proxy.rb +186 -186
- data/lib/kreuzberg/ocr_backend_protocol.rb +113 -113
- data/lib/kreuzberg/post_processor_protocol.rb +86 -86
- data/lib/kreuzberg/result.rb +279 -279
- data/lib/kreuzberg/setup_lib_path.rb +80 -80
- data/lib/kreuzberg/validator_protocol.rb +89 -89
- data/lib/kreuzberg/version.rb +5 -5
- data/lib/kreuzberg.rb +109 -109
- data/lib/{pdfium.dll → libpdfium.dylib} +0 -0
- data/sig/kreuzberg/internal.rbs +184 -184
- data/sig/kreuzberg.rbs +546 -546
- data/spec/binding/cache_spec.rb +227 -227
- data/spec/binding/cli_proxy_spec.rb +85 -85
- data/spec/binding/cli_spec.rb +55 -55
- data/spec/binding/config_spec.rb +345 -345
- data/spec/binding/config_validation_spec.rb +283 -283
- data/spec/binding/error_handling_spec.rb +213 -213
- data/spec/binding/errors_spec.rb +66 -66
- data/spec/binding/plugins/ocr_backend_spec.rb +307 -307
- data/spec/binding/plugins/postprocessor_spec.rb +269 -269
- data/spec/binding/plugins/validator_spec.rb +274 -274
- data/spec/fixtures/config.toml +39 -39
- data/spec/fixtures/config.yaml +41 -41
- data/spec/fixtures/invalid_config.toml +4 -4
- data/spec/smoke/package_spec.rb +178 -178
- data/spec/spec_helper.rb +42 -42
- data/vendor/Cargo.toml +1 -1
- data/vendor/kreuzberg/Cargo.toml +5 -5
- data/vendor/kreuzberg/README.md +230 -230
- data/vendor/kreuzberg/benches/otel_overhead.rs +48 -48
- data/vendor/kreuzberg/build.rs +843 -843
- data/vendor/kreuzberg/src/api/error.rs +81 -81
- data/vendor/kreuzberg/src/api/handlers.rs +199 -199
- data/vendor/kreuzberg/src/api/mod.rs +79 -79
- data/vendor/kreuzberg/src/api/server.rs +353 -353
- data/vendor/kreuzberg/src/api/types.rs +170 -170
- data/vendor/kreuzberg/src/cache/mod.rs +1167 -1167
- data/vendor/kreuzberg/src/chunking/mod.rs +1877 -1877
- data/vendor/kreuzberg/src/chunking/processor.rs +220 -220
- data/vendor/kreuzberg/src/core/batch_mode.rs +95 -95
- data/vendor/kreuzberg/src/core/config.rs +1080 -1080
- data/vendor/kreuzberg/src/core/extractor.rs +1156 -1156
- data/vendor/kreuzberg/src/core/io.rs +329 -329
- data/vendor/kreuzberg/src/core/mime.rs +605 -605
- data/vendor/kreuzberg/src/core/mod.rs +47 -47
- data/vendor/kreuzberg/src/core/pipeline.rs +1184 -1184
- data/vendor/kreuzberg/src/embeddings.rs +500 -500
- data/vendor/kreuzberg/src/error.rs +431 -431
- data/vendor/kreuzberg/src/extraction/archive.rs +954 -954
- data/vendor/kreuzberg/src/extraction/docx.rs +398 -398
- data/vendor/kreuzberg/src/extraction/email.rs +854 -854
- data/vendor/kreuzberg/src/extraction/excel.rs +688 -688
- data/vendor/kreuzberg/src/extraction/html.rs +601 -601
- data/vendor/kreuzberg/src/extraction/image.rs +491 -491
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +574 -574
- data/vendor/kreuzberg/src/extraction/markdown.rs +213 -213
- data/vendor/kreuzberg/src/extraction/mod.rs +81 -81
- data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -398
- data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -247
- data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -240
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -130
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +284 -284
- data/vendor/kreuzberg/src/extraction/pptx.rs +3100 -3100
- data/vendor/kreuzberg/src/extraction/structured.rs +490 -490
- data/vendor/kreuzberg/src/extraction/table.rs +328 -328
- data/vendor/kreuzberg/src/extraction/text.rs +269 -269
- data/vendor/kreuzberg/src/extraction/xml.rs +333 -333
- data/vendor/kreuzberg/src/extractors/archive.rs +447 -447
- data/vendor/kreuzberg/src/extractors/bibtex.rs +470 -470
- data/vendor/kreuzberg/src/extractors/docbook.rs +504 -504
- data/vendor/kreuzberg/src/extractors/docx.rs +400 -400
- data/vendor/kreuzberg/src/extractors/email.rs +157 -157
- data/vendor/kreuzberg/src/extractors/epub.rs +708 -708
- data/vendor/kreuzberg/src/extractors/excel.rs +345 -345
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +492 -492
- data/vendor/kreuzberg/src/extractors/html.rs +407 -407
- data/vendor/kreuzberg/src/extractors/image.rs +219 -219
- data/vendor/kreuzberg/src/extractors/jats.rs +1054 -1054
- data/vendor/kreuzberg/src/extractors/jupyter.rs +368 -368
- data/vendor/kreuzberg/src/extractors/latex.rs +653 -653
- data/vendor/kreuzberg/src/extractors/markdown.rs +701 -701
- data/vendor/kreuzberg/src/extractors/mod.rs +429 -429
- data/vendor/kreuzberg/src/extractors/odt.rs +628 -628
- data/vendor/kreuzberg/src/extractors/opml.rs +635 -635
- data/vendor/kreuzberg/src/extractors/orgmode.rs +529 -529
- data/vendor/kreuzberg/src/extractors/pdf.rs +749 -749
- data/vendor/kreuzberg/src/extractors/pptx.rs +267 -267
- data/vendor/kreuzberg/src/extractors/rst.rs +577 -577
- data/vendor/kreuzberg/src/extractors/rtf.rs +809 -809
- data/vendor/kreuzberg/src/extractors/security.rs +484 -484
- data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -367
- data/vendor/kreuzberg/src/extractors/structured.rs +142 -142
- data/vendor/kreuzberg/src/extractors/text.rs +265 -265
- data/vendor/kreuzberg/src/extractors/typst.rs +651 -651
- data/vendor/kreuzberg/src/extractors/xml.rs +147 -147
- data/vendor/kreuzberg/src/image/dpi.rs +164 -164
- data/vendor/kreuzberg/src/image/mod.rs +6 -6
- data/vendor/kreuzberg/src/image/preprocessing.rs +417 -417
- data/vendor/kreuzberg/src/image/resize.rs +89 -89
- data/vendor/kreuzberg/src/keywords/config.rs +154 -154
- data/vendor/kreuzberg/src/keywords/mod.rs +237 -237
- data/vendor/kreuzberg/src/keywords/processor.rs +275 -275
- data/vendor/kreuzberg/src/keywords/rake.rs +293 -293
- data/vendor/kreuzberg/src/keywords/types.rs +68 -68
- data/vendor/kreuzberg/src/keywords/yake.rs +163 -163
- data/vendor/kreuzberg/src/language_detection/mod.rs +985 -985
- data/vendor/kreuzberg/src/language_detection/processor.rs +219 -219
- data/vendor/kreuzberg/src/lib.rs +113 -113
- data/vendor/kreuzberg/src/mcp/mod.rs +35 -35
- data/vendor/kreuzberg/src/mcp/server.rs +2076 -2076
- data/vendor/kreuzberg/src/ocr/cache.rs +469 -469
- data/vendor/kreuzberg/src/ocr/error.rs +37 -37
- data/vendor/kreuzberg/src/ocr/hocr.rs +216 -216
- data/vendor/kreuzberg/src/ocr/mod.rs +58 -58
- data/vendor/kreuzberg/src/ocr/processor.rs +863 -863
- data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -4
- data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -144
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +452 -452
- data/vendor/kreuzberg/src/ocr/types.rs +393 -393
- data/vendor/kreuzberg/src/ocr/utils.rs +47 -47
- data/vendor/kreuzberg/src/ocr/validation.rs +206 -206
- data/vendor/kreuzberg/src/panic_context.rs +154 -154
- data/vendor/kreuzberg/src/pdf/bindings.rs +44 -44
- data/vendor/kreuzberg/src/pdf/bundled.rs +346 -346
- data/vendor/kreuzberg/src/pdf/error.rs +130 -130
- data/vendor/kreuzberg/src/pdf/images.rs +139 -139
- data/vendor/kreuzberg/src/pdf/metadata.rs +489 -489
- data/vendor/kreuzberg/src/pdf/mod.rs +68 -68
- data/vendor/kreuzberg/src/pdf/rendering.rs +368 -368
- data/vendor/kreuzberg/src/pdf/table.rs +420 -420
- data/vendor/kreuzberg/src/pdf/text.rs +240 -240
- data/vendor/kreuzberg/src/plugins/extractor.rs +1044 -1044
- data/vendor/kreuzberg/src/plugins/mod.rs +212 -212
- data/vendor/kreuzberg/src/plugins/ocr.rs +639 -639
- data/vendor/kreuzberg/src/plugins/processor.rs +650 -650
- data/vendor/kreuzberg/src/plugins/registry.rs +1339 -1339
- data/vendor/kreuzberg/src/plugins/traits.rs +258 -258
- data/vendor/kreuzberg/src/plugins/validator.rs +967 -967
- data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -1470
- data/vendor/kreuzberg/src/text/mod.rs +25 -25
- data/vendor/kreuzberg/src/text/quality.rs +697 -697
- data/vendor/kreuzberg/src/text/quality_processor.rs +219 -219
- data/vendor/kreuzberg/src/text/string_utils.rs +217 -217
- data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -164
- data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -100
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -796
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -902
- data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -160
- data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -619
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -147
- data/vendor/kreuzberg/src/types.rs +1055 -1055
- data/vendor/kreuzberg/src/utils/mod.rs +17 -17
- data/vendor/kreuzberg/src/utils/quality.rs +959 -959
- data/vendor/kreuzberg/src/utils/string_utils.rs +381 -381
- data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -53
- data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -482
- data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -261
- data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -400
- data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -1205
- data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -280
- data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -425
- data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -172
- data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -622
- data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -1300
- data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -175
- data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -734
- data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -37
- data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -100
- data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -801
- data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -693
- data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -111
- data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -162
- data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -226
- data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -41
- data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -196
- data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -227
- data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -181
- data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -791
- data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -47
- data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -760
- data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -634
- data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -136
- data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -84
- data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -681
- data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -64
- data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -51
- data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -476
- data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -163
- data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -1
- data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -101
- data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -477
- data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -490
- data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -415
- data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -223
- data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -331
- data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -562
- data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -436
- data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -561
- data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -193
- data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -448
- data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -32
- data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -33
- data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -76
- data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -129
- data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -54
- data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -118
- data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -149
- data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -506
- data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -75
- data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -519
- data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -647
- data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -62
- data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -796
- data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -31
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -52
- data/vendor/kreuzberg/tests/api_tests.rs +966 -966
- data/vendor/kreuzberg/tests/archive_integration.rs +545 -545
- data/vendor/kreuzberg/tests/batch_orchestration.rs +556 -556
- data/vendor/kreuzberg/tests/batch_processing.rs +318 -318
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -421
- data/vendor/kreuzberg/tests/concurrency_stress.rs +533 -533
- data/vendor/kreuzberg/tests/config_features.rs +612 -612
- data/vendor/kreuzberg/tests/config_loading_tests.rs +416 -416
- data/vendor/kreuzberg/tests/core_integration.rs +510 -510
- data/vendor/kreuzberg/tests/csv_integration.rs +414 -414
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +500 -500
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -122
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -370
- data/vendor/kreuzberg/tests/email_integration.rs +327 -327
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -275
- data/vendor/kreuzberg/tests/error_handling.rs +402 -402
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -228
- data/vendor/kreuzberg/tests/format_integration.rs +164 -164
- data/vendor/kreuzberg/tests/helpers/mod.rs +142 -142
- data/vendor/kreuzberg/tests/html_table_test.rs +551 -551
- data/vendor/kreuzberg/tests/image_integration.rs +255 -255
- data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -139
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -639
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -704
- data/vendor/kreuzberg/tests/keywords_integration.rs +479 -479
- data/vendor/kreuzberg/tests/keywords_quality.rs +509 -509
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -496
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -490
- data/vendor/kreuzberg/tests/mime_detection.rs +429 -429
- data/vendor/kreuzberg/tests/ocr_configuration.rs +514 -514
- data/vendor/kreuzberg/tests/ocr_errors.rs +698 -698
- data/vendor/kreuzberg/tests/ocr_quality.rs +629 -629
- data/vendor/kreuzberg/tests/ocr_stress.rs +469 -469
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +674 -674
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -616
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -822
- data/vendor/kreuzberg/tests/pdf_integration.rs +45 -45
- data/vendor/kreuzberg/tests/pdfium_linking.rs +374 -374
- data/vendor/kreuzberg/tests/pipeline_integration.rs +1436 -1436
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +776 -776
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +560 -560
- data/vendor/kreuzberg/tests/plugin_system.rs +927 -927
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -783
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +587 -587
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +694 -694
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +775 -775
- data/vendor/kreuzberg/tests/security_validation.rs +416 -416
- data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -888
- data/vendor/kreuzberg/tests/test_fastembed.rs +631 -631
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1260 -1260
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +648 -648
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -87
- data/vendor/kreuzberg-ffi/Cargo.toml +1 -1
- data/vendor/kreuzberg-ffi/README.md +851 -851
- data/vendor/kreuzberg-ffi/build.rs +176 -176
- data/vendor/kreuzberg-ffi/cbindgen.toml +27 -27
- data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +12 -12
- data/vendor/kreuzberg-ffi/kreuzberg.h +1087 -1087
- data/vendor/kreuzberg-ffi/src/lib.rs +3616 -3616
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +247 -247
- data/vendor/kreuzberg-ffi/tests.disabled/README.md +48 -48
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +299 -299
- data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +346 -346
- data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +232 -232
- data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +470 -470
- data/vendor/kreuzberg-tesseract/.commitlintrc.json +13 -13
- data/vendor/kreuzberg-tesseract/.crate-ignore +2 -2
- data/vendor/kreuzberg-tesseract/Cargo.lock +2933 -2933
- data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
- data/vendor/kreuzberg-tesseract/LICENSE +22 -22
- data/vendor/kreuzberg-tesseract/README.md +399 -399
- data/vendor/kreuzberg-tesseract/build.rs +1354 -1354
- data/vendor/kreuzberg-tesseract/patches/README.md +71 -71
- data/vendor/kreuzberg-tesseract/patches/tesseract.diff +199 -199
- data/vendor/kreuzberg-tesseract/src/api.rs +1371 -1371
- data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +77 -77
- data/vendor/kreuzberg-tesseract/src/enums.rs +297 -297
- data/vendor/kreuzberg-tesseract/src/error.rs +81 -81
- data/vendor/kreuzberg-tesseract/src/lib.rs +145 -145
- data/vendor/kreuzberg-tesseract/src/monitor.rs +57 -57
- data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +197 -197
- data/vendor/kreuzberg-tesseract/src/page_iterator.rs +253 -253
- data/vendor/kreuzberg-tesseract/src/result_iterator.rs +286 -286
- data/vendor/kreuzberg-tesseract/src/result_renderer.rs +183 -183
- data/vendor/kreuzberg-tesseract/tests/integration_test.rs +211 -211
- data/vendor/rb-sys/.cargo_vcs_info.json +5 -5
- data/vendor/rb-sys/Cargo.lock +393 -393
- data/vendor/rb-sys/Cargo.toml +70 -70
- data/vendor/rb-sys/Cargo.toml.orig +57 -57
- data/vendor/rb-sys/LICENSE-APACHE +190 -190
- data/vendor/rb-sys/LICENSE-MIT +21 -21
- data/vendor/rb-sys/build/features.rs +111 -111
- data/vendor/rb-sys/build/main.rs +286 -286
- data/vendor/rb-sys/build/stable_api_config.rs +155 -155
- data/vendor/rb-sys/build/version.rs +50 -50
- data/vendor/rb-sys/readme.md +36 -36
- data/vendor/rb-sys/src/bindings.rs +21 -21
- data/vendor/rb-sys/src/hidden.rs +11 -11
- data/vendor/rb-sys/src/lib.rs +35 -35
- data/vendor/rb-sys/src/macros.rs +371 -371
- data/vendor/rb-sys/src/memory.rs +53 -53
- data/vendor/rb-sys/src/ruby_abi_version.rs +38 -38
- data/vendor/rb-sys/src/special_consts.rs +31 -31
- data/vendor/rb-sys/src/stable_api/compiled.c +179 -179
- data/vendor/rb-sys/src/stable_api/compiled.rs +257 -257
- data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +324 -324
- data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +332 -332
- data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +325 -325
- data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +323 -323
- data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +339 -339
- data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +339 -339
- data/vendor/rb-sys/src/stable_api.rs +260 -260
- data/vendor/rb-sys/src/symbol.rs +31 -31
- data/vendor/rb-sys/src/tracking_allocator.rs +330 -330
- data/vendor/rb-sys/src/utils.rs +89 -89
- data/vendor/rb-sys/src/value_type.rs +7 -7
- metadata +73 -4
- data/vendor/kreuzberg-ffi/kreuzberg-ffi-install.pc +0 -12
|
@@ -1,80 +1,80 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require 'rbconfig'
|
|
4
|
-
require 'open3'
|
|
5
|
-
|
|
6
|
-
module Kreuzberg
|
|
7
|
-
# Configures library paths for dynamic linking on different platforms.
|
|
8
|
-
module SetupLibPath
|
|
9
|
-
module_function
|
|
10
|
-
|
|
11
|
-
def configure
|
|
12
|
-
lib_dir = File.expand_path('..', __dir__ || '.')
|
|
13
|
-
host_os = RbConfig::CONFIG['host_os']
|
|
14
|
-
|
|
15
|
-
case host_os
|
|
16
|
-
when /darwin/
|
|
17
|
-
prepend_env('DYLD_LIBRARY_PATH', lib_dir)
|
|
18
|
-
prepend_env('DYLD_FALLBACK_LIBRARY_PATH', "#{lib_dir}:/usr/local/lib:/usr/lib")
|
|
19
|
-
fix_macos_install_name(lib_dir)
|
|
20
|
-
when /linux/
|
|
21
|
-
prepend_env('LD_LIBRARY_PATH', lib_dir)
|
|
22
|
-
when /mswin|mingw|cygwin/
|
|
23
|
-
prepend_env('PATH', lib_dir, separator: ';')
|
|
24
|
-
end
|
|
25
|
-
end
|
|
26
|
-
|
|
27
|
-
def prepend_env(key, value, separator: ':')
|
|
28
|
-
current = ENV.fetch(key, nil)
|
|
29
|
-
return if current&.split(separator)&.include?(value)
|
|
30
|
-
|
|
31
|
-
ENV[key] = current.nil? || current.empty? ? value : "#{value}#{separator}#{current}"
|
|
32
|
-
end
|
|
33
|
-
private_class_method :prepend_env
|
|
34
|
-
|
|
35
|
-
def fix_macos_install_name(lib_dir)
|
|
36
|
-
bundle = macos_bundle(lib_dir)
|
|
37
|
-
return unless bundle
|
|
38
|
-
|
|
39
|
-
ensure_install_name(bundle)
|
|
40
|
-
ensure_loader_rpath(bundle)
|
|
41
|
-
rescue Errno::ENOENT, IOError
|
|
42
|
-
# Tool not available (e.g., on CI). The dynamic loader can still use the updated env vars.
|
|
43
|
-
end
|
|
44
|
-
private_class_method :fix_macos_install_name
|
|
45
|
-
|
|
46
|
-
def macos_bundle(lib_dir)
|
|
47
|
-
bundle = File.join(lib_dir, 'kreuzberg_rb.bundle')
|
|
48
|
-
pdfium = File.join(lib_dir, 'libpdfium.dylib')
|
|
49
|
-
return unless File.exist?(bundle) && File.exist?(pdfium)
|
|
50
|
-
|
|
51
|
-
bundle
|
|
52
|
-
end
|
|
53
|
-
private_class_method :macos_bundle
|
|
54
|
-
|
|
55
|
-
def ensure_install_name(bundle)
|
|
56
|
-
output, status = Open3.capture2('otool', '-L', bundle)
|
|
57
|
-
return unless status.success?
|
|
58
|
-
|
|
59
|
-
replacements = {
|
|
60
|
-
'./libpdfium.dylib' => '@loader_path/libpdfium.dylib',
|
|
61
|
-
'@rpath/libpdfium.dylib' => '@loader_path/libpdfium.dylib'
|
|
62
|
-
}
|
|
63
|
-
|
|
64
|
-
replacements.each do |current, desired|
|
|
65
|
-
next unless output.include?(current)
|
|
66
|
-
|
|
67
|
-
Open3.capture2('install_name_tool', '-change', current, desired, bundle)
|
|
68
|
-
end
|
|
69
|
-
end
|
|
70
|
-
private_class_method :ensure_install_name
|
|
71
|
-
|
|
72
|
-
def ensure_loader_rpath(bundle)
|
|
73
|
-
rpath_output, rpath_status = Open3.capture2('otool', '-l', bundle)
|
|
74
|
-
return unless rpath_status.success? && !rpath_output.include?('@loader_path')
|
|
75
|
-
|
|
76
|
-
Open3.capture2('install_name_tool', '-add_rpath', '@loader_path', bundle)
|
|
77
|
-
end
|
|
78
|
-
private_class_method :ensure_loader_rpath
|
|
79
|
-
end
|
|
80
|
-
end
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'rbconfig'
|
|
4
|
+
require 'open3'
|
|
5
|
+
|
|
6
|
+
module Kreuzberg
|
|
7
|
+
# Configures library paths for dynamic linking on different platforms.
|
|
8
|
+
module SetupLibPath
|
|
9
|
+
module_function
|
|
10
|
+
|
|
11
|
+
def configure
|
|
12
|
+
lib_dir = File.expand_path('..', __dir__ || '.')
|
|
13
|
+
host_os = RbConfig::CONFIG['host_os']
|
|
14
|
+
|
|
15
|
+
case host_os
|
|
16
|
+
when /darwin/
|
|
17
|
+
prepend_env('DYLD_LIBRARY_PATH', lib_dir)
|
|
18
|
+
prepend_env('DYLD_FALLBACK_LIBRARY_PATH', "#{lib_dir}:/usr/local/lib:/usr/lib")
|
|
19
|
+
fix_macos_install_name(lib_dir)
|
|
20
|
+
when /linux/
|
|
21
|
+
prepend_env('LD_LIBRARY_PATH', lib_dir)
|
|
22
|
+
when /mswin|mingw|cygwin/
|
|
23
|
+
prepend_env('PATH', lib_dir, separator: ';')
|
|
24
|
+
end
|
|
25
|
+
end
|
|
26
|
+
|
|
27
|
+
def prepend_env(key, value, separator: ':')
|
|
28
|
+
current = ENV.fetch(key, nil)
|
|
29
|
+
return if current&.split(separator)&.include?(value)
|
|
30
|
+
|
|
31
|
+
ENV[key] = current.nil? || current.empty? ? value : "#{value}#{separator}#{current}"
|
|
32
|
+
end
|
|
33
|
+
private_class_method :prepend_env
|
|
34
|
+
|
|
35
|
+
def fix_macos_install_name(lib_dir)
|
|
36
|
+
bundle = macos_bundle(lib_dir)
|
|
37
|
+
return unless bundle
|
|
38
|
+
|
|
39
|
+
ensure_install_name(bundle)
|
|
40
|
+
ensure_loader_rpath(bundle)
|
|
41
|
+
rescue Errno::ENOENT, IOError
|
|
42
|
+
# Tool not available (e.g., on CI). The dynamic loader can still use the updated env vars.
|
|
43
|
+
end
|
|
44
|
+
private_class_method :fix_macos_install_name
|
|
45
|
+
|
|
46
|
+
def macos_bundle(lib_dir)
|
|
47
|
+
bundle = File.join(lib_dir, 'kreuzberg_rb.bundle')
|
|
48
|
+
pdfium = File.join(lib_dir, 'libpdfium.dylib')
|
|
49
|
+
return unless File.exist?(bundle) && File.exist?(pdfium)
|
|
50
|
+
|
|
51
|
+
bundle
|
|
52
|
+
end
|
|
53
|
+
private_class_method :macos_bundle
|
|
54
|
+
|
|
55
|
+
def ensure_install_name(bundle)
|
|
56
|
+
output, status = Open3.capture2('otool', '-L', bundle)
|
|
57
|
+
return unless status.success?
|
|
58
|
+
|
|
59
|
+
replacements = {
|
|
60
|
+
'./libpdfium.dylib' => '@loader_path/libpdfium.dylib',
|
|
61
|
+
'@rpath/libpdfium.dylib' => '@loader_path/libpdfium.dylib'
|
|
62
|
+
}
|
|
63
|
+
|
|
64
|
+
replacements.each do |current, desired|
|
|
65
|
+
next unless output.include?(current)
|
|
66
|
+
|
|
67
|
+
Open3.capture2('install_name_tool', '-change', current, desired, bundle)
|
|
68
|
+
end
|
|
69
|
+
end
|
|
70
|
+
private_class_method :ensure_install_name
|
|
71
|
+
|
|
72
|
+
def ensure_loader_rpath(bundle)
|
|
73
|
+
rpath_output, rpath_status = Open3.capture2('otool', '-l', bundle)
|
|
74
|
+
return unless rpath_status.success? && !rpath_output.include?('@loader_path')
|
|
75
|
+
|
|
76
|
+
Open3.capture2('install_name_tool', '-add_rpath', '@loader_path', bundle)
|
|
77
|
+
end
|
|
78
|
+
private_class_method :ensure_loader_rpath
|
|
79
|
+
end
|
|
80
|
+
end
|
|
@@ -1,89 +1,89 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
module Kreuzberg
|
|
4
|
-
# Validator protocol interface.
|
|
5
|
-
#
|
|
6
|
-
# This module defines the protocol that all Ruby validators must implement
|
|
7
|
-
# to be registered with the Rust core via the FFI bridge.
|
|
8
|
-
#
|
|
9
|
-
# Validators are called during extraction to validate results. If validation fails,
|
|
10
|
-
# the validator should raise a Kreuzberg::Errors::ValidationError, which will
|
|
11
|
-
# cause the extraction to fail.
|
|
12
|
-
#
|
|
13
|
-
# @example Implementing a minimum length validator
|
|
14
|
-
# class MinimumLengthValidator
|
|
15
|
-
# include Kreuzberg::ValidatorProtocol
|
|
16
|
-
#
|
|
17
|
-
# def initialize(min_length = 10)
|
|
18
|
-
# @min_length = min_length
|
|
19
|
-
# end
|
|
20
|
-
#
|
|
21
|
-
# def call(result)
|
|
22
|
-
# if result["content"].length < @min_length
|
|
23
|
-
# raise Kreuzberg::Errors::ValidationError.new(
|
|
24
|
-
# "Content too short: #{result["content"].length} < #{@min_length}"
|
|
25
|
-
# )
|
|
26
|
-
# end
|
|
27
|
-
# end
|
|
28
|
-
# end
|
|
29
|
-
#
|
|
30
|
-
# Kreuzberg.register_validator("min_length", MinimumLengthValidator.new(100))
|
|
31
|
-
#
|
|
32
|
-
# @example Implementing a content quality validator
|
|
33
|
-
# class QualityValidator
|
|
34
|
-
# include Kreuzberg::ValidatorProtocol
|
|
35
|
-
#
|
|
36
|
-
# def call(result)
|
|
37
|
-
# # Check if content has sufficient quality
|
|
38
|
-
# if result["content"].strip.empty?
|
|
39
|
-
# raise Kreuzberg::Errors::ValidationError.new("Empty content extracted")
|
|
40
|
-
# end
|
|
41
|
-
#
|
|
42
|
-
# # Check if metadata is present
|
|
43
|
-
# if result["metadata"].empty?
|
|
44
|
-
# raise Kreuzberg::Errors::ValidationError.new("No metadata extracted")
|
|
45
|
-
# end
|
|
46
|
-
# end
|
|
47
|
-
# end
|
|
48
|
-
#
|
|
49
|
-
# Kreuzberg.register_validator("quality", QualityValidator.new)
|
|
50
|
-
#
|
|
51
|
-
# @example Using a Proc as a validator
|
|
52
|
-
# Kreuzberg.register_validator("not_empty", ->(result) {
|
|
53
|
-
# if result["content"].strip.empty?
|
|
54
|
-
# raise Kreuzberg::Errors::ValidationError.new("Content cannot be empty")
|
|
55
|
-
# end
|
|
56
|
-
# })
|
|
57
|
-
#
|
|
58
|
-
module ValidatorProtocol
|
|
59
|
-
# Validate an extraction result.
|
|
60
|
-
#
|
|
61
|
-
# This method is called during extraction to validate results. If validation fails,
|
|
62
|
-
# raise a Kreuzberg::Errors::ValidationError with a descriptive message explaining
|
|
63
|
-
# why validation failed. If validation passes, return without raising.
|
|
64
|
-
#
|
|
65
|
-
# The validator receives the extraction result as a hash with the same structure
|
|
66
|
-
# as post-processors (see PostProcessorProtocol for details).
|
|
67
|
-
#
|
|
68
|
-
# @param result [Hash] Extraction result to validate with the following structure:
|
|
69
|
-
# - "content" [String] - Extracted text content
|
|
70
|
-
# - "mime_type" [String] - MIME type of the source document
|
|
71
|
-
# - "metadata" [Hash] - Document metadata (title, author, etc.)
|
|
72
|
-
# - "tables" [Array<Hash>] - Extracted tables
|
|
73
|
-
# - "detected_languages" [Array<String>, nil] - Detected language codes
|
|
74
|
-
# - "chunks" [Array<String>, nil] - Content chunks (if chunking enabled)
|
|
75
|
-
#
|
|
76
|
-
# @return [void]
|
|
77
|
-
# @raise [Kreuzberg::Errors::ValidationError] if validation fails
|
|
78
|
-
#
|
|
79
|
-
# @example
|
|
80
|
-
# def call(result)
|
|
81
|
-
# if result["content"].length < 10
|
|
82
|
-
# raise Kreuzberg::Errors::ValidationError.new("Content too short")
|
|
83
|
-
# end
|
|
84
|
-
# end
|
|
85
|
-
def call(result)
|
|
86
|
-
raise NotImplementedError, "#{self.class} must implement #call(result)"
|
|
87
|
-
end
|
|
88
|
-
end
|
|
89
|
-
end
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Kreuzberg
|
|
4
|
+
# Validator protocol interface.
|
|
5
|
+
#
|
|
6
|
+
# This module defines the protocol that all Ruby validators must implement
|
|
7
|
+
# to be registered with the Rust core via the FFI bridge.
|
|
8
|
+
#
|
|
9
|
+
# Validators are called during extraction to validate results. If validation fails,
|
|
10
|
+
# the validator should raise a Kreuzberg::Errors::ValidationError, which will
|
|
11
|
+
# cause the extraction to fail.
|
|
12
|
+
#
|
|
13
|
+
# @example Implementing a minimum length validator
|
|
14
|
+
# class MinimumLengthValidator
|
|
15
|
+
# include Kreuzberg::ValidatorProtocol
|
|
16
|
+
#
|
|
17
|
+
# def initialize(min_length = 10)
|
|
18
|
+
# @min_length = min_length
|
|
19
|
+
# end
|
|
20
|
+
#
|
|
21
|
+
# def call(result)
|
|
22
|
+
# if result["content"].length < @min_length
|
|
23
|
+
# raise Kreuzberg::Errors::ValidationError.new(
|
|
24
|
+
# "Content too short: #{result["content"].length} < #{@min_length}"
|
|
25
|
+
# )
|
|
26
|
+
# end
|
|
27
|
+
# end
|
|
28
|
+
# end
|
|
29
|
+
#
|
|
30
|
+
# Kreuzberg.register_validator("min_length", MinimumLengthValidator.new(100))
|
|
31
|
+
#
|
|
32
|
+
# @example Implementing a content quality validator
|
|
33
|
+
# class QualityValidator
|
|
34
|
+
# include Kreuzberg::ValidatorProtocol
|
|
35
|
+
#
|
|
36
|
+
# def call(result)
|
|
37
|
+
# # Check if content has sufficient quality
|
|
38
|
+
# if result["content"].strip.empty?
|
|
39
|
+
# raise Kreuzberg::Errors::ValidationError.new("Empty content extracted")
|
|
40
|
+
# end
|
|
41
|
+
#
|
|
42
|
+
# # Check if metadata is present
|
|
43
|
+
# if result["metadata"].empty?
|
|
44
|
+
# raise Kreuzberg::Errors::ValidationError.new("No metadata extracted")
|
|
45
|
+
# end
|
|
46
|
+
# end
|
|
47
|
+
# end
|
|
48
|
+
#
|
|
49
|
+
# Kreuzberg.register_validator("quality", QualityValidator.new)
|
|
50
|
+
#
|
|
51
|
+
# @example Using a Proc as a validator
|
|
52
|
+
# Kreuzberg.register_validator("not_empty", ->(result) {
|
|
53
|
+
# if result["content"].strip.empty?
|
|
54
|
+
# raise Kreuzberg::Errors::ValidationError.new("Content cannot be empty")
|
|
55
|
+
# end
|
|
56
|
+
# })
|
|
57
|
+
#
|
|
58
|
+
module ValidatorProtocol
|
|
59
|
+
# Validate an extraction result.
|
|
60
|
+
#
|
|
61
|
+
# This method is called during extraction to validate results. If validation fails,
|
|
62
|
+
# raise a Kreuzberg::Errors::ValidationError with a descriptive message explaining
|
|
63
|
+
# why validation failed. If validation passes, return without raising.
|
|
64
|
+
#
|
|
65
|
+
# The validator receives the extraction result as a hash with the same structure
|
|
66
|
+
# as post-processors (see PostProcessorProtocol for details).
|
|
67
|
+
#
|
|
68
|
+
# @param result [Hash] Extraction result to validate with the following structure:
|
|
69
|
+
# - "content" [String] - Extracted text content
|
|
70
|
+
# - "mime_type" [String] - MIME type of the source document
|
|
71
|
+
# - "metadata" [Hash] - Document metadata (title, author, etc.)
|
|
72
|
+
# - "tables" [Array<Hash>] - Extracted tables
|
|
73
|
+
# - "detected_languages" [Array<String>, nil] - Detected language codes
|
|
74
|
+
# - "chunks" [Array<String>, nil] - Content chunks (if chunking enabled)
|
|
75
|
+
#
|
|
76
|
+
# @return [void]
|
|
77
|
+
# @raise [Kreuzberg::Errors::ValidationError] if validation fails
|
|
78
|
+
#
|
|
79
|
+
# @example
|
|
80
|
+
# def call(result)
|
|
81
|
+
# if result["content"].length < 10
|
|
82
|
+
# raise Kreuzberg::Errors::ValidationError.new("Content too short")
|
|
83
|
+
# end
|
|
84
|
+
# end
|
|
85
|
+
def call(result)
|
|
86
|
+
raise NotImplementedError, "#{self.class} must implement #call(result)"
|
|
87
|
+
end
|
|
88
|
+
end
|
|
89
|
+
end
|
data/lib/kreuzberg/version.rb
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
module Kreuzberg
|
|
4
|
-
VERSION = '4.0.0-rc.13'
|
|
5
|
-
end
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Kreuzberg
|
|
4
|
+
VERSION = '4.0.0-rc.14'
|
|
5
|
+
end
|
data/lib/kreuzberg.rb
CHANGED
|
@@ -1,109 +1,109 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require_relative 'kreuzberg/setup_lib_path'
|
|
4
|
-
Kreuzberg::SetupLibPath.configure
|
|
5
|
-
|
|
6
|
-
require_relative 'kreuzberg/version'
|
|
7
|
-
require 'kreuzberg_rb'
|
|
8
|
-
|
|
9
|
-
# Kreuzberg is a Ruby binding for the Rust core library providing document extraction,
|
|
10
|
-
# text extraction, and OCR capabilities.
|
|
11
|
-
module Kreuzberg
|
|
12
|
-
autoload :Config, 'kreuzberg/config'
|
|
13
|
-
autoload :Result, 'kreuzberg/result'
|
|
14
|
-
autoload :CLI, 'kreuzberg/cli'
|
|
15
|
-
autoload :CLIProxy, 'kreuzberg/cli_proxy'
|
|
16
|
-
autoload :APIProxy, 'kreuzberg/api_proxy'
|
|
17
|
-
autoload :MCPProxy, 'kreuzberg/mcp_proxy'
|
|
18
|
-
autoload :Errors, 'kreuzberg/errors'
|
|
19
|
-
autoload :ErrorContext, 'kreuzberg/error_context'
|
|
20
|
-
autoload :PostProcessorProtocol, 'kreuzberg/post_processor_protocol'
|
|
21
|
-
autoload :ValidatorProtocol, 'kreuzberg/validator_protocol'
|
|
22
|
-
autoload :OcrBackendProtocol, 'kreuzberg/ocr_backend_protocol'
|
|
23
|
-
|
|
24
|
-
# Alias for API consistency with other language bindings
|
|
25
|
-
ExtractionConfig = Config::Extraction
|
|
26
|
-
PageConfig = Config::PageConfig
|
|
27
|
-
|
|
28
|
-
module KeywordAlgorithm
|
|
29
|
-
YAKE = :yake
|
|
30
|
-
RAKE = :rake
|
|
31
|
-
end
|
|
32
|
-
|
|
33
|
-
@__cache_tracker = { entries: 0, bytes: 0 }
|
|
34
|
-
|
|
35
|
-
class << self
|
|
36
|
-
# Store native methods as private methods
|
|
37
|
-
alias native_extract_file_sync extract_file_sync
|
|
38
|
-
alias native_extract_bytes_sync extract_bytes_sync
|
|
39
|
-
alias native_batch_extract_files_sync batch_extract_files_sync
|
|
40
|
-
alias native_extract_file extract_file
|
|
41
|
-
alias native_extract_bytes extract_bytes
|
|
42
|
-
alias native_batch_extract_files batch_extract_files
|
|
43
|
-
alias native_batch_extract_bytes_sync batch_extract_bytes_sync
|
|
44
|
-
alias native_batch_extract_bytes batch_extract_bytes
|
|
45
|
-
alias native_clear_cache clear_cache
|
|
46
|
-
alias native_cache_stats cache_stats
|
|
47
|
-
|
|
48
|
-
private :native_extract_file_sync, :native_extract_bytes_sync, :native_batch_extract_files_sync
|
|
49
|
-
private :native_extract_file, :native_extract_bytes, :native_batch_extract_files
|
|
50
|
-
private :native_batch_extract_bytes_sync, :native_batch_extract_bytes
|
|
51
|
-
end
|
|
52
|
-
|
|
53
|
-
# Register a Ruby post-processor that conforms to PostProcessorProtocol.
|
|
54
|
-
module_function :register_post_processor
|
|
55
|
-
|
|
56
|
-
# Remove a post-processor by name.
|
|
57
|
-
module_function :unregister_post_processor
|
|
58
|
-
|
|
59
|
-
# Purge all registered post-processors.
|
|
60
|
-
module_function :clear_post_processors
|
|
61
|
-
|
|
62
|
-
# Register a validator that follows ValidatorProtocol.
|
|
63
|
-
module_function :register_validator
|
|
64
|
-
|
|
65
|
-
# Remove a validator by name.
|
|
66
|
-
module_function :unregister_validator
|
|
67
|
-
|
|
68
|
-
# Purge all validators.
|
|
69
|
-
module_function :clear_validators
|
|
70
|
-
|
|
71
|
-
# List all registered validators.
|
|
72
|
-
module_function :list_validators
|
|
73
|
-
|
|
74
|
-
# List all registered post-processors.
|
|
75
|
-
module_function :list_post_processors
|
|
76
|
-
|
|
77
|
-
# Register an OCR backend instance implementing OcrBackendProtocol.
|
|
78
|
-
module_function :register_ocr_backend
|
|
79
|
-
|
|
80
|
-
# Unregister an OCR backend by name.
|
|
81
|
-
module_function :unregister_ocr_backend
|
|
82
|
-
|
|
83
|
-
# List all registered OCR backends.
|
|
84
|
-
module_function :list_ocr_backends
|
|
85
|
-
|
|
86
|
-
# Detect MIME type from file bytes.
|
|
87
|
-
module_function :detect_mime_type
|
|
88
|
-
|
|
89
|
-
# Detect MIME type from a file path.
|
|
90
|
-
module_function :detect_mime_type_from_path
|
|
91
|
-
|
|
92
|
-
# Validate a MIME type string.
|
|
93
|
-
module_function :validate_mime_type
|
|
94
|
-
|
|
95
|
-
# Get file extensions for a given MIME type.
|
|
96
|
-
module_function :get_extensions_for_mime
|
|
97
|
-
|
|
98
|
-
# List all available embedding presets.
|
|
99
|
-
module_function :list_embedding_presets
|
|
100
|
-
|
|
101
|
-
# Get a specific embedding preset by name.
|
|
102
|
-
module_function :get_embedding_preset
|
|
103
|
-
end
|
|
104
|
-
|
|
105
|
-
require_relative 'kreuzberg/cache_api'
|
|
106
|
-
require_relative 'kreuzberg/extraction_api'
|
|
107
|
-
|
|
108
|
-
Kreuzberg.singleton_class.prepend(Kreuzberg::CacheAPI)
|
|
109
|
-
Kreuzberg.singleton_class.prepend(Kreuzberg::ExtractionAPI)
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative 'kreuzberg/setup_lib_path'
|
|
4
|
+
Kreuzberg::SetupLibPath.configure
|
|
5
|
+
|
|
6
|
+
require_relative 'kreuzberg/version'
|
|
7
|
+
require 'kreuzberg_rb'
|
|
8
|
+
|
|
9
|
+
# Kreuzberg is a Ruby binding for the Rust core library providing document extraction,
|
|
10
|
+
# text extraction, and OCR capabilities.
|
|
11
|
+
module Kreuzberg
|
|
12
|
+
autoload :Config, 'kreuzberg/config'
|
|
13
|
+
autoload :Result, 'kreuzberg/result'
|
|
14
|
+
autoload :CLI, 'kreuzberg/cli'
|
|
15
|
+
autoload :CLIProxy, 'kreuzberg/cli_proxy'
|
|
16
|
+
autoload :APIProxy, 'kreuzberg/api_proxy'
|
|
17
|
+
autoload :MCPProxy, 'kreuzberg/mcp_proxy'
|
|
18
|
+
autoload :Errors, 'kreuzberg/errors'
|
|
19
|
+
autoload :ErrorContext, 'kreuzberg/error_context'
|
|
20
|
+
autoload :PostProcessorProtocol, 'kreuzberg/post_processor_protocol'
|
|
21
|
+
autoload :ValidatorProtocol, 'kreuzberg/validator_protocol'
|
|
22
|
+
autoload :OcrBackendProtocol, 'kreuzberg/ocr_backend_protocol'
|
|
23
|
+
|
|
24
|
+
# Alias for API consistency with other language bindings
|
|
25
|
+
ExtractionConfig = Config::Extraction
|
|
26
|
+
PageConfig = Config::PageConfig
|
|
27
|
+
|
|
28
|
+
module KeywordAlgorithm
|
|
29
|
+
YAKE = :yake
|
|
30
|
+
RAKE = :rake
|
|
31
|
+
end
|
|
32
|
+
|
|
33
|
+
@__cache_tracker = { entries: 0, bytes: 0 }
|
|
34
|
+
|
|
35
|
+
class << self
|
|
36
|
+
# Store native methods as private methods
|
|
37
|
+
alias native_extract_file_sync extract_file_sync
|
|
38
|
+
alias native_extract_bytes_sync extract_bytes_sync
|
|
39
|
+
alias native_batch_extract_files_sync batch_extract_files_sync
|
|
40
|
+
alias native_extract_file extract_file
|
|
41
|
+
alias native_extract_bytes extract_bytes
|
|
42
|
+
alias native_batch_extract_files batch_extract_files
|
|
43
|
+
alias native_batch_extract_bytes_sync batch_extract_bytes_sync
|
|
44
|
+
alias native_batch_extract_bytes batch_extract_bytes
|
|
45
|
+
alias native_clear_cache clear_cache
|
|
46
|
+
alias native_cache_stats cache_stats
|
|
47
|
+
|
|
48
|
+
private :native_extract_file_sync, :native_extract_bytes_sync, :native_batch_extract_files_sync
|
|
49
|
+
private :native_extract_file, :native_extract_bytes, :native_batch_extract_files
|
|
50
|
+
private :native_batch_extract_bytes_sync, :native_batch_extract_bytes
|
|
51
|
+
end
|
|
52
|
+
|
|
53
|
+
# Register a Ruby post-processor that conforms to PostProcessorProtocol.
|
|
54
|
+
module_function :register_post_processor
|
|
55
|
+
|
|
56
|
+
# Remove a post-processor by name.
|
|
57
|
+
module_function :unregister_post_processor
|
|
58
|
+
|
|
59
|
+
# Purge all registered post-processors.
|
|
60
|
+
module_function :clear_post_processors
|
|
61
|
+
|
|
62
|
+
# Register a validator that follows ValidatorProtocol.
|
|
63
|
+
module_function :register_validator
|
|
64
|
+
|
|
65
|
+
# Remove a validator by name.
|
|
66
|
+
module_function :unregister_validator
|
|
67
|
+
|
|
68
|
+
# Purge all validators.
|
|
69
|
+
module_function :clear_validators
|
|
70
|
+
|
|
71
|
+
# List all registered validators.
|
|
72
|
+
module_function :list_validators
|
|
73
|
+
|
|
74
|
+
# List all registered post-processors.
|
|
75
|
+
module_function :list_post_processors
|
|
76
|
+
|
|
77
|
+
# Register an OCR backend instance implementing OcrBackendProtocol.
|
|
78
|
+
module_function :register_ocr_backend
|
|
79
|
+
|
|
80
|
+
# Unregister an OCR backend by name.
|
|
81
|
+
module_function :unregister_ocr_backend
|
|
82
|
+
|
|
83
|
+
# List all registered OCR backends.
|
|
84
|
+
module_function :list_ocr_backends
|
|
85
|
+
|
|
86
|
+
# Detect MIME type from file bytes.
|
|
87
|
+
module_function :detect_mime_type
|
|
88
|
+
|
|
89
|
+
# Detect MIME type from a file path.
|
|
90
|
+
module_function :detect_mime_type_from_path
|
|
91
|
+
|
|
92
|
+
# Validate a MIME type string.
|
|
93
|
+
module_function :validate_mime_type
|
|
94
|
+
|
|
95
|
+
# Get file extensions for a given MIME type.
|
|
96
|
+
module_function :get_extensions_for_mime
|
|
97
|
+
|
|
98
|
+
# List all available embedding presets.
|
|
99
|
+
module_function :list_embedding_presets
|
|
100
|
+
|
|
101
|
+
# Get a specific embedding preset by name.
|
|
102
|
+
module_function :get_embedding_preset
|
|
103
|
+
end
|
|
104
|
+
|
|
105
|
+
require_relative 'kreuzberg/cache_api'
|
|
106
|
+
require_relative 'kreuzberg/extraction_api'
|
|
107
|
+
|
|
108
|
+
Kreuzberg.singleton_class.prepend(Kreuzberg::CacheAPI)
|
|
109
|
+
Kreuzberg.singleton_class.prepend(Kreuzberg::ExtractionAPI)
|
|
Binary file
|