kreuzberg 4.0.0.pre.rc.13 → 4.0.0.pre.rc.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +14 -14
- data/.rspec +3 -3
- data/.rubocop.yaml +1 -1
- data/.rubocop.yml +538 -538
- data/Gemfile +8 -8
- data/Gemfile.lock +104 -2
- data/README.md +454 -454
- data/Rakefile +33 -25
- data/Steepfile +47 -47
- data/examples/async_patterns.rb +341 -341
- data/ext/kreuzberg_rb/extconf.rb +45 -45
- data/ext/kreuzberg_rb/native/.cargo/config.toml +2 -2
- data/ext/kreuzberg_rb/native/Cargo.lock +6750 -6941
- data/ext/kreuzberg_rb/native/Cargo.toml +53 -54
- data/ext/kreuzberg_rb/native/README.md +425 -425
- data/ext/kreuzberg_rb/native/build.rs +52 -15
- data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -11
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -14
- data/ext/kreuzberg_rb/native/include/strings.h +20 -20
- data/ext/kreuzberg_rb/native/include/unistd.h +47 -47
- data/ext/kreuzberg_rb/native/src/lib.rs +3158 -3158
- data/extconf.rb +28 -28
- data/kreuzberg.gemspec +214 -214
- data/lib/kreuzberg/api_proxy.rb +142 -142
- data/lib/kreuzberg/cache_api.rb +81 -81
- data/lib/kreuzberg/cli.rb +55 -55
- data/lib/kreuzberg/cli_proxy.rb +127 -127
- data/lib/kreuzberg/config.rb +724 -724
- data/lib/kreuzberg/error_context.rb +80 -80
- data/lib/kreuzberg/errors.rb +118 -118
- data/lib/kreuzberg/extraction_api.rb +340 -340
- data/lib/kreuzberg/mcp_proxy.rb +186 -186
- data/lib/kreuzberg/ocr_backend_protocol.rb +113 -113
- data/lib/kreuzberg/post_processor_protocol.rb +86 -86
- data/lib/kreuzberg/result.rb +279 -279
- data/lib/kreuzberg/setup_lib_path.rb +80 -80
- data/lib/kreuzberg/validator_protocol.rb +89 -89
- data/lib/kreuzberg/version.rb +5 -5
- data/lib/kreuzberg.rb +109 -109
- data/lib/{pdfium.dll → libpdfium.so} +0 -0
- data/sig/kreuzberg/internal.rbs +184 -184
- data/sig/kreuzberg.rbs +546 -546
- data/spec/binding/cache_spec.rb +227 -227
- data/spec/binding/cli_proxy_spec.rb +85 -85
- data/spec/binding/cli_spec.rb +55 -55
- data/spec/binding/config_spec.rb +345 -345
- data/spec/binding/config_validation_spec.rb +283 -283
- data/spec/binding/error_handling_spec.rb +213 -213
- data/spec/binding/errors_spec.rb +66 -66
- data/spec/binding/plugins/ocr_backend_spec.rb +307 -307
- data/spec/binding/plugins/postprocessor_spec.rb +269 -269
- data/spec/binding/plugins/validator_spec.rb +274 -274
- data/spec/fixtures/config.toml +39 -39
- data/spec/fixtures/config.yaml +41 -41
- data/spec/fixtures/invalid_config.toml +4 -4
- data/spec/smoke/package_spec.rb +178 -178
- data/spec/spec_helper.rb +42 -42
- data/vendor/Cargo.toml +2 -2
- data/vendor/kreuzberg/Cargo.toml +5 -5
- data/vendor/kreuzberg/README.md +230 -230
- data/vendor/kreuzberg/benches/otel_overhead.rs +48 -48
- data/vendor/kreuzberg/build.rs +887 -843
- data/vendor/kreuzberg/src/api/error.rs +81 -81
- data/vendor/kreuzberg/src/api/handlers.rs +199 -199
- data/vendor/kreuzberg/src/api/mod.rs +87 -79
- data/vendor/kreuzberg/src/api/server.rs +353 -353
- data/vendor/kreuzberg/src/api/types.rs +170 -170
- data/vendor/kreuzberg/src/cache/mod.rs +1167 -1167
- data/vendor/kreuzberg/src/chunking/mod.rs +1877 -1877
- data/vendor/kreuzberg/src/chunking/processor.rs +220 -220
- data/vendor/kreuzberg/src/core/batch_mode.rs +95 -95
- data/vendor/kreuzberg/src/core/config.rs +1080 -1080
- data/vendor/kreuzberg/src/core/extractor.rs +1156 -1156
- data/vendor/kreuzberg/src/core/io.rs +329 -329
- data/vendor/kreuzberg/src/core/mime.rs +605 -605
- data/vendor/kreuzberg/src/core/mod.rs +47 -47
- data/vendor/kreuzberg/src/core/pipeline.rs +1184 -1184
- data/vendor/kreuzberg/src/embeddings.rs +500 -500
- data/vendor/kreuzberg/src/error.rs +431 -431
- data/vendor/kreuzberg/src/extraction/archive.rs +954 -954
- data/vendor/kreuzberg/src/extraction/docx.rs +398 -398
- data/vendor/kreuzberg/src/extraction/email.rs +854 -854
- data/vendor/kreuzberg/src/extraction/excel.rs +688 -688
- data/vendor/kreuzberg/src/extraction/html.rs +634 -601
- data/vendor/kreuzberg/src/extraction/image.rs +491 -491
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +574 -574
- data/vendor/kreuzberg/src/extraction/markdown.rs +213 -213
- data/vendor/kreuzberg/src/extraction/mod.rs +81 -81
- data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -398
- data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -247
- data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -240
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -130
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +284 -284
- data/vendor/kreuzberg/src/extraction/pptx.rs +3100 -3100
- data/vendor/kreuzberg/src/extraction/structured.rs +490 -490
- data/vendor/kreuzberg/src/extraction/table.rs +328 -328
- data/vendor/kreuzberg/src/extraction/text.rs +269 -269
- data/vendor/kreuzberg/src/extraction/xml.rs +333 -333
- data/vendor/kreuzberg/src/extractors/archive.rs +447 -447
- data/vendor/kreuzberg/src/extractors/bibtex.rs +470 -470
- data/vendor/kreuzberg/src/extractors/docbook.rs +504 -504
- data/vendor/kreuzberg/src/extractors/docx.rs +400 -400
- data/vendor/kreuzberg/src/extractors/email.rs +157 -157
- data/vendor/kreuzberg/src/extractors/epub.rs +708 -708
- data/vendor/kreuzberg/src/extractors/excel.rs +345 -345
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +492 -492
- data/vendor/kreuzberg/src/extractors/html.rs +407 -407
- data/vendor/kreuzberg/src/extractors/image.rs +219 -219
- data/vendor/kreuzberg/src/extractors/jats.rs +1054 -1054
- data/vendor/kreuzberg/src/extractors/jupyter.rs +368 -368
- data/vendor/kreuzberg/src/extractors/latex.rs +653 -653
- data/vendor/kreuzberg/src/extractors/markdown.rs +701 -701
- data/vendor/kreuzberg/src/extractors/mod.rs +429 -429
- data/vendor/kreuzberg/src/extractors/odt.rs +628 -628
- data/vendor/kreuzberg/src/extractors/opml.rs +635 -635
- data/vendor/kreuzberg/src/extractors/orgmode.rs +529 -529
- data/vendor/kreuzberg/src/extractors/pdf.rs +749 -749
- data/vendor/kreuzberg/src/extractors/pptx.rs +267 -267
- data/vendor/kreuzberg/src/extractors/rst.rs +577 -577
- data/vendor/kreuzberg/src/extractors/rtf.rs +809 -809
- data/vendor/kreuzberg/src/extractors/security.rs +484 -484
- data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -367
- data/vendor/kreuzberg/src/extractors/structured.rs +142 -142
- data/vendor/kreuzberg/src/extractors/text.rs +265 -265
- data/vendor/kreuzberg/src/extractors/typst.rs +651 -651
- data/vendor/kreuzberg/src/extractors/xml.rs +147 -147
- data/vendor/kreuzberg/src/image/dpi.rs +164 -164
- data/vendor/kreuzberg/src/image/mod.rs +6 -6
- data/vendor/kreuzberg/src/image/preprocessing.rs +417 -417
- data/vendor/kreuzberg/src/image/resize.rs +89 -89
- data/vendor/kreuzberg/src/keywords/config.rs +154 -154
- data/vendor/kreuzberg/src/keywords/mod.rs +237 -237
- data/vendor/kreuzberg/src/keywords/processor.rs +275 -275
- data/vendor/kreuzberg/src/keywords/rake.rs +293 -293
- data/vendor/kreuzberg/src/keywords/types.rs +68 -68
- data/vendor/kreuzberg/src/keywords/yake.rs +163 -163
- data/vendor/kreuzberg/src/language_detection/mod.rs +985 -985
- data/vendor/kreuzberg/src/language_detection/processor.rs +219 -219
- data/vendor/kreuzberg/src/lib.rs +113 -113
- data/vendor/kreuzberg/src/mcp/mod.rs +35 -35
- data/vendor/kreuzberg/src/mcp/server.rs +2076 -2076
- data/vendor/kreuzberg/src/ocr/cache.rs +469 -469
- data/vendor/kreuzberg/src/ocr/error.rs +37 -37
- data/vendor/kreuzberg/src/ocr/hocr.rs +216 -216
- data/vendor/kreuzberg/src/ocr/mod.rs +58 -58
- data/vendor/kreuzberg/src/ocr/processor.rs +863 -863
- data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -4
- data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -144
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +452 -452
- data/vendor/kreuzberg/src/ocr/types.rs +393 -393
- data/vendor/kreuzberg/src/ocr/utils.rs +47 -47
- data/vendor/kreuzberg/src/ocr/validation.rs +206 -206
- data/vendor/kreuzberg/src/panic_context.rs +154 -154
- data/vendor/kreuzberg/src/pdf/bindings.rs +44 -44
- data/vendor/kreuzberg/src/pdf/bundled.rs +452 -346
- data/vendor/kreuzberg/src/pdf/error.rs +130 -130
- data/vendor/kreuzberg/src/pdf/images.rs +139 -139
- data/vendor/kreuzberg/src/pdf/metadata.rs +489 -489
- data/vendor/kreuzberg/src/pdf/mod.rs +68 -68
- data/vendor/kreuzberg/src/pdf/rendering.rs +368 -368
- data/vendor/kreuzberg/src/pdf/table.rs +420 -420
- data/vendor/kreuzberg/src/pdf/text.rs +240 -240
- data/vendor/kreuzberg/src/plugins/extractor.rs +1044 -1044
- data/vendor/kreuzberg/src/plugins/mod.rs +212 -212
- data/vendor/kreuzberg/src/plugins/ocr.rs +639 -639
- data/vendor/kreuzberg/src/plugins/processor.rs +650 -650
- data/vendor/kreuzberg/src/plugins/registry.rs +1339 -1339
- data/vendor/kreuzberg/src/plugins/traits.rs +258 -258
- data/vendor/kreuzberg/src/plugins/validator.rs +967 -967
- data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -1470
- data/vendor/kreuzberg/src/text/mod.rs +25 -25
- data/vendor/kreuzberg/src/text/quality.rs +697 -697
- data/vendor/kreuzberg/src/text/quality_processor.rs +219 -219
- data/vendor/kreuzberg/src/text/string_utils.rs +217 -217
- data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -164
- data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -100
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -796
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -902
- data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -160
- data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -619
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -147
- data/vendor/kreuzberg/src/types.rs +1055 -1055
- data/vendor/kreuzberg/src/utils/mod.rs +17 -17
- data/vendor/kreuzberg/src/utils/quality.rs +959 -959
- data/vendor/kreuzberg/src/utils/string_utils.rs +381 -381
- data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -53
- data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -482
- data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -261
- data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -400
- data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -1205
- data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -280
- data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -425
- data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -172
- data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -622
- data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -1300
- data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -175
- data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -734
- data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -37
- data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -100
- data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -801
- data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -693
- data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -111
- data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -162
- data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -226
- data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -41
- data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -196
- data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -227
- data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -181
- data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -791
- data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -47
- data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -760
- data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -634
- data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -136
- data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -84
- data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -681
- data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -64
- data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -51
- data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -476
- data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -163
- data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -1
- data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -101
- data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -477
- data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -490
- data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -415
- data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -223
- data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -331
- data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -562
- data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -436
- data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -561
- data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -193
- data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -448
- data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -32
- data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -33
- data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -76
- data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -129
- data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -54
- data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -118
- data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -149
- data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -506
- data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -75
- data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -519
- data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -647
- data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -62
- data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -796
- data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -31
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -52
- data/vendor/kreuzberg/tests/api_tests.rs +966 -966
- data/vendor/kreuzberg/tests/archive_integration.rs +545 -545
- data/vendor/kreuzberg/tests/batch_orchestration.rs +556 -556
- data/vendor/kreuzberg/tests/batch_processing.rs +318 -318
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -421
- data/vendor/kreuzberg/tests/concurrency_stress.rs +533 -533
- data/vendor/kreuzberg/tests/config_features.rs +612 -612
- data/vendor/kreuzberg/tests/config_loading_tests.rs +416 -416
- data/vendor/kreuzberg/tests/core_integration.rs +510 -510
- data/vendor/kreuzberg/tests/csv_integration.rs +414 -414
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +500 -500
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -122
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -370
- data/vendor/kreuzberg/tests/email_integration.rs +327 -327
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -275
- data/vendor/kreuzberg/tests/error_handling.rs +402 -402
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -228
- data/vendor/kreuzberg/tests/format_integration.rs +165 -164
- data/vendor/kreuzberg/tests/helpers/mod.rs +142 -142
- data/vendor/kreuzberg/tests/html_table_test.rs +551 -551
- data/vendor/kreuzberg/tests/image_integration.rs +255 -255
- data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -139
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -639
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -704
- data/vendor/kreuzberg/tests/keywords_integration.rs +479 -479
- data/vendor/kreuzberg/tests/keywords_quality.rs +509 -509
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -496
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -490
- data/vendor/kreuzberg/tests/mime_detection.rs +429 -429
- data/vendor/kreuzberg/tests/ocr_configuration.rs +514 -514
- data/vendor/kreuzberg/tests/ocr_errors.rs +698 -698
- data/vendor/kreuzberg/tests/ocr_quality.rs +629 -629
- data/vendor/kreuzberg/tests/ocr_stress.rs +469 -469
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +674 -674
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -616
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -822
- data/vendor/kreuzberg/tests/pdf_integration.rs +45 -45
- data/vendor/kreuzberg/tests/pdfium_linking.rs +374 -374
- data/vendor/kreuzberg/tests/pipeline_integration.rs +1436 -1436
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +776 -776
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +560 -560
- data/vendor/kreuzberg/tests/plugin_system.rs +927 -927
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -783
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +587 -587
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +694 -694
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +775 -775
- data/vendor/kreuzberg/tests/security_validation.rs +416 -416
- data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -888
- data/vendor/kreuzberg/tests/test_fastembed.rs +631 -631
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1260 -1260
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +648 -648
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -87
- data/vendor/kreuzberg-tesseract/.commitlintrc.json +13 -13
- data/vendor/kreuzberg-tesseract/.crate-ignore +2 -2
- data/vendor/kreuzberg-tesseract/Cargo.lock +2933 -2933
- data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
- data/vendor/kreuzberg-tesseract/LICENSE +22 -22
- data/vendor/kreuzberg-tesseract/README.md +399 -399
- data/vendor/kreuzberg-tesseract/build.rs +1354 -1354
- data/vendor/kreuzberg-tesseract/patches/README.md +71 -71
- data/vendor/kreuzberg-tesseract/patches/tesseract.diff +199 -199
- data/vendor/kreuzberg-tesseract/src/api.rs +1371 -1371
- data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +77 -77
- data/vendor/kreuzberg-tesseract/src/enums.rs +297 -297
- data/vendor/kreuzberg-tesseract/src/error.rs +81 -81
- data/vendor/kreuzberg-tesseract/src/lib.rs +145 -145
- data/vendor/kreuzberg-tesseract/src/monitor.rs +57 -57
- data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +197 -197
- data/vendor/kreuzberg-tesseract/src/page_iterator.rs +253 -253
- data/vendor/kreuzberg-tesseract/src/result_iterator.rs +286 -286
- data/vendor/kreuzberg-tesseract/src/result_renderer.rs +183 -183
- data/vendor/kreuzberg-tesseract/tests/integration_test.rs +211 -211
- data/vendor/rb-sys/.cargo_vcs_info.json +5 -5
- data/vendor/rb-sys/Cargo.lock +393 -393
- data/vendor/rb-sys/Cargo.toml +70 -70
- data/vendor/rb-sys/Cargo.toml.orig +57 -57
- data/vendor/rb-sys/LICENSE-APACHE +190 -190
- data/vendor/rb-sys/LICENSE-MIT +21 -21
- data/vendor/rb-sys/build/features.rs +111 -111
- data/vendor/rb-sys/build/main.rs +286 -286
- data/vendor/rb-sys/build/stable_api_config.rs +155 -155
- data/vendor/rb-sys/build/version.rs +50 -50
- data/vendor/rb-sys/readme.md +36 -36
- data/vendor/rb-sys/src/bindings.rs +21 -21
- data/vendor/rb-sys/src/hidden.rs +11 -11
- data/vendor/rb-sys/src/lib.rs +35 -35
- data/vendor/rb-sys/src/macros.rs +371 -371
- data/vendor/rb-sys/src/memory.rs +53 -53
- data/vendor/rb-sys/src/ruby_abi_version.rs +38 -38
- data/vendor/rb-sys/src/special_consts.rs +31 -31
- data/vendor/rb-sys/src/stable_api/compiled.c +179 -179
- data/vendor/rb-sys/src/stable_api/compiled.rs +257 -257
- data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +324 -324
- data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +332 -332
- data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +325 -325
- data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +323 -323
- data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +339 -339
- data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +339 -339
- data/vendor/rb-sys/src/stable_api.rs +260 -260
- data/vendor/rb-sys/src/symbol.rs +31 -31
- data/vendor/rb-sys/src/tracking_allocator.rs +330 -330
- data/vendor/rb-sys/src/utils.rs +89 -89
- data/vendor/rb-sys/src/value_type.rs +7 -7
- metadata +81 -22
- data/vendor/kreuzberg-ffi/Cargo.toml +0 -63
- data/vendor/kreuzberg-ffi/README.md +0 -851
- data/vendor/kreuzberg-ffi/build.rs +0 -176
- data/vendor/kreuzberg-ffi/cbindgen.toml +0 -27
- data/vendor/kreuzberg-ffi/kreuzberg-ffi-install.pc +0 -12
- data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +0 -12
- data/vendor/kreuzberg-ffi/kreuzberg.h +0 -1087
- data/vendor/kreuzberg-ffi/src/lib.rs +0 -3616
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +0 -247
- data/vendor/kreuzberg-ffi/tests.disabled/README.md +0 -48
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +0 -299
- data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +0 -346
- data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +0 -232
- data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +0 -470
data/lib/kreuzberg/mcp_proxy.rb
CHANGED
|
@@ -1,186 +1,186 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require 'open3'
|
|
4
|
-
require 'pathname'
|
|
5
|
-
require 'json'
|
|
6
|
-
|
|
7
|
-
module Kreuzberg
|
|
8
|
-
# MCP (Model Context Protocol) server proxy
|
|
9
|
-
#
|
|
10
|
-
# Starts and manages the Kreuzberg MCP server for Claude Desktop integration.
|
|
11
|
-
#
|
|
12
|
-
# @example Start MCP server
|
|
13
|
-
# server = Kreuzberg::MCPProxy.new
|
|
14
|
-
# server.start
|
|
15
|
-
#
|
|
16
|
-
module MCPProxy
|
|
17
|
-
Error = Class.new(Kreuzberg::Errors::Error)
|
|
18
|
-
MissingBinaryError = Class.new(Error)
|
|
19
|
-
ServerError = Class.new(Error)
|
|
20
|
-
|
|
21
|
-
# MCP server instance
|
|
22
|
-
class Server
|
|
23
|
-
attr_reader :pid, :transport
|
|
24
|
-
|
|
25
|
-
# Initialize MCP server
|
|
26
|
-
#
|
|
27
|
-
# @param transport [String] Transport method ("stdio" or "sse")
|
|
28
|
-
#
|
|
29
|
-
def initialize(transport: 'stdio')
|
|
30
|
-
@transport = transport
|
|
31
|
-
@pid = nil
|
|
32
|
-
@stdin = nil
|
|
33
|
-
@stdout = nil
|
|
34
|
-
@stderr = nil
|
|
35
|
-
end
|
|
36
|
-
|
|
37
|
-
# Start the MCP server
|
|
38
|
-
#
|
|
39
|
-
# @return [Integer, nil] Process ID (for SSE) or nil (for stdio)
|
|
40
|
-
#
|
|
41
|
-
def start
|
|
42
|
-
binary = MCPProxy.find_mcp_binary
|
|
43
|
-
|
|
44
|
-
case @transport
|
|
45
|
-
when 'stdio'
|
|
46
|
-
start_stdio(binary)
|
|
47
|
-
when 'sse'
|
|
48
|
-
start_sse(binary)
|
|
49
|
-
else
|
|
50
|
-
raise ServerError, "Unknown transport: #{@transport}"
|
|
51
|
-
end
|
|
52
|
-
end
|
|
53
|
-
|
|
54
|
-
# Stop the server
|
|
55
|
-
#
|
|
56
|
-
# @return [void]
|
|
57
|
-
#
|
|
58
|
-
def stop
|
|
59
|
-
return unless @pid
|
|
60
|
-
|
|
61
|
-
Process.kill('TERM', @pid)
|
|
62
|
-
Process.wait(@pid)
|
|
63
|
-
rescue Errno::ESRCH, Errno::ECHILD
|
|
64
|
-
# Process already dead
|
|
65
|
-
ensure
|
|
66
|
-
@pid = nil
|
|
67
|
-
close_pipes
|
|
68
|
-
end
|
|
69
|
-
|
|
70
|
-
# Send a message to the server (stdio only)
|
|
71
|
-
#
|
|
72
|
-
# @param message [Hash] JSON-RPC message
|
|
73
|
-
# @return [void]
|
|
74
|
-
#
|
|
75
|
-
def send_message(message)
|
|
76
|
-
raise ServerError, 'Can only send messages in stdio mode' unless @transport == 'stdio'
|
|
77
|
-
raise ServerError, 'Server not started' unless @stdin
|
|
78
|
-
|
|
79
|
-
@stdin.puts(JSON.generate(message))
|
|
80
|
-
@stdin.flush
|
|
81
|
-
end
|
|
82
|
-
|
|
83
|
-
# Read a message from the server (stdio only)
|
|
84
|
-
#
|
|
85
|
-
# @return [Hash] JSON-RPC message
|
|
86
|
-
#
|
|
87
|
-
def read_message
|
|
88
|
-
raise ServerError, 'Can only read messages in stdio mode' unless @transport == 'stdio'
|
|
89
|
-
raise ServerError, 'Server not started' unless @stdout
|
|
90
|
-
|
|
91
|
-
line = @stdout.gets
|
|
92
|
-
JSON.parse(line) if line
|
|
93
|
-
end
|
|
94
|
-
|
|
95
|
-
# Check if server is running
|
|
96
|
-
#
|
|
97
|
-
# @return [Boolean]
|
|
98
|
-
#
|
|
99
|
-
def running?
|
|
100
|
-
return false unless @pid
|
|
101
|
-
|
|
102
|
-
Process.kill(0, @pid)
|
|
103
|
-
true
|
|
104
|
-
rescue Errno::ESRCH, Errno::EPERM
|
|
105
|
-
false
|
|
106
|
-
end
|
|
107
|
-
|
|
108
|
-
private
|
|
109
|
-
|
|
110
|
-
def start_stdio(binary)
|
|
111
|
-
@stdin, @stdout, @stderr, wait_thr = Open3.popen3(binary.to_s, 'mcp', '--transport', 'stdio')
|
|
112
|
-
@pid = wait_thr.pid
|
|
113
|
-
nil
|
|
114
|
-
end
|
|
115
|
-
|
|
116
|
-
def start_sse(binary)
|
|
117
|
-
@pid = spawn(
|
|
118
|
-
binary.to_s,
|
|
119
|
-
'mcp',
|
|
120
|
-
'--transport', 'sse',
|
|
121
|
-
out: $stdout,
|
|
122
|
-
err: $stderr
|
|
123
|
-
)
|
|
124
|
-
Process.detach(@pid)
|
|
125
|
-
sleep 1 # Give server time to start
|
|
126
|
-
@pid
|
|
127
|
-
end
|
|
128
|
-
|
|
129
|
-
def close_pipes
|
|
130
|
-
@stdin&.close
|
|
131
|
-
@stdout&.close
|
|
132
|
-
@stderr&.close
|
|
133
|
-
@stdin = @stdout = @stderr = nil
|
|
134
|
-
end
|
|
135
|
-
end
|
|
136
|
-
|
|
137
|
-
module_function
|
|
138
|
-
|
|
139
|
-
# Run MCP server with a block
|
|
140
|
-
#
|
|
141
|
-
# @param transport [String] Transport method
|
|
142
|
-
# @yield [Server] Yields server instance
|
|
143
|
-
# @return [Object] Block result
|
|
144
|
-
#
|
|
145
|
-
# @example
|
|
146
|
-
# Kreuzberg::MCPProxy.run(transport: 'stdio') do |server|
|
|
147
|
-
# server.send_message({ method: 'tools/list' })
|
|
148
|
-
# response = server.read_message
|
|
149
|
-
# end
|
|
150
|
-
#
|
|
151
|
-
def run(transport: 'stdio')
|
|
152
|
-
server = Server.new(transport: transport)
|
|
153
|
-
server.start
|
|
154
|
-
yield server
|
|
155
|
-
ensure
|
|
156
|
-
server&.stop
|
|
157
|
-
end
|
|
158
|
-
|
|
159
|
-
# Find the MCP binary
|
|
160
|
-
#
|
|
161
|
-
# @return [Pathname] Path to binary
|
|
162
|
-
# @raise [MissingBinaryError] If not found
|
|
163
|
-
#
|
|
164
|
-
def find_mcp_binary
|
|
165
|
-
# MCP is served by kreuzberg CLI
|
|
166
|
-
binary_name = Gem.win_platform? ? 'kreuzberg.exe' : 'kreuzberg'
|
|
167
|
-
found = CLIProxy.search_paths(binary_name).find(&:file?)
|
|
168
|
-
return found if found
|
|
169
|
-
|
|
170
|
-
raise MissingBinaryError, missing_binary_message
|
|
171
|
-
end
|
|
172
|
-
|
|
173
|
-
# Error message for missing binary
|
|
174
|
-
#
|
|
175
|
-
# @return [String]
|
|
176
|
-
#
|
|
177
|
-
def missing_binary_message
|
|
178
|
-
<<~MSG.strip
|
|
179
|
-
kreuzberg binary not found for MCP server. Build it with:
|
|
180
|
-
`cargo build --release --package kreuzberg-cli`
|
|
181
|
-
|
|
182
|
-
Or ensure kreuzberg is installed with MCP support.
|
|
183
|
-
MSG
|
|
184
|
-
end
|
|
185
|
-
end
|
|
186
|
-
end
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'open3'
|
|
4
|
+
require 'pathname'
|
|
5
|
+
require 'json'
|
|
6
|
+
|
|
7
|
+
module Kreuzberg
|
|
8
|
+
# MCP (Model Context Protocol) server proxy
|
|
9
|
+
#
|
|
10
|
+
# Starts and manages the Kreuzberg MCP server for Claude Desktop integration.
|
|
11
|
+
#
|
|
12
|
+
# @example Start MCP server
|
|
13
|
+
# server = Kreuzberg::MCPProxy.new
|
|
14
|
+
# server.start
|
|
15
|
+
#
|
|
16
|
+
module MCPProxy
|
|
17
|
+
Error = Class.new(Kreuzberg::Errors::Error)
|
|
18
|
+
MissingBinaryError = Class.new(Error)
|
|
19
|
+
ServerError = Class.new(Error)
|
|
20
|
+
|
|
21
|
+
# MCP server instance
|
|
22
|
+
class Server
|
|
23
|
+
attr_reader :pid, :transport
|
|
24
|
+
|
|
25
|
+
# Initialize MCP server
|
|
26
|
+
#
|
|
27
|
+
# @param transport [String] Transport method ("stdio" or "sse")
|
|
28
|
+
#
|
|
29
|
+
def initialize(transport: 'stdio')
|
|
30
|
+
@transport = transport
|
|
31
|
+
@pid = nil
|
|
32
|
+
@stdin = nil
|
|
33
|
+
@stdout = nil
|
|
34
|
+
@stderr = nil
|
|
35
|
+
end
|
|
36
|
+
|
|
37
|
+
# Start the MCP server
|
|
38
|
+
#
|
|
39
|
+
# @return [Integer, nil] Process ID (for SSE) or nil (for stdio)
|
|
40
|
+
#
|
|
41
|
+
def start
|
|
42
|
+
binary = MCPProxy.find_mcp_binary
|
|
43
|
+
|
|
44
|
+
case @transport
|
|
45
|
+
when 'stdio'
|
|
46
|
+
start_stdio(binary)
|
|
47
|
+
when 'sse'
|
|
48
|
+
start_sse(binary)
|
|
49
|
+
else
|
|
50
|
+
raise ServerError, "Unknown transport: #{@transport}"
|
|
51
|
+
end
|
|
52
|
+
end
|
|
53
|
+
|
|
54
|
+
# Stop the server
|
|
55
|
+
#
|
|
56
|
+
# @return [void]
|
|
57
|
+
#
|
|
58
|
+
def stop
|
|
59
|
+
return unless @pid
|
|
60
|
+
|
|
61
|
+
Process.kill('TERM', @pid)
|
|
62
|
+
Process.wait(@pid)
|
|
63
|
+
rescue Errno::ESRCH, Errno::ECHILD
|
|
64
|
+
# Process already dead
|
|
65
|
+
ensure
|
|
66
|
+
@pid = nil
|
|
67
|
+
close_pipes
|
|
68
|
+
end
|
|
69
|
+
|
|
70
|
+
# Send a message to the server (stdio only)
|
|
71
|
+
#
|
|
72
|
+
# @param message [Hash] JSON-RPC message
|
|
73
|
+
# @return [void]
|
|
74
|
+
#
|
|
75
|
+
def send_message(message)
|
|
76
|
+
raise ServerError, 'Can only send messages in stdio mode' unless @transport == 'stdio'
|
|
77
|
+
raise ServerError, 'Server not started' unless @stdin
|
|
78
|
+
|
|
79
|
+
@stdin.puts(JSON.generate(message))
|
|
80
|
+
@stdin.flush
|
|
81
|
+
end
|
|
82
|
+
|
|
83
|
+
# Read a message from the server (stdio only)
|
|
84
|
+
#
|
|
85
|
+
# @return [Hash] JSON-RPC message
|
|
86
|
+
#
|
|
87
|
+
def read_message
|
|
88
|
+
raise ServerError, 'Can only read messages in stdio mode' unless @transport == 'stdio'
|
|
89
|
+
raise ServerError, 'Server not started' unless @stdout
|
|
90
|
+
|
|
91
|
+
line = @stdout.gets
|
|
92
|
+
JSON.parse(line) if line
|
|
93
|
+
end
|
|
94
|
+
|
|
95
|
+
# Check if server is running
|
|
96
|
+
#
|
|
97
|
+
# @return [Boolean]
|
|
98
|
+
#
|
|
99
|
+
def running?
|
|
100
|
+
return false unless @pid
|
|
101
|
+
|
|
102
|
+
Process.kill(0, @pid)
|
|
103
|
+
true
|
|
104
|
+
rescue Errno::ESRCH, Errno::EPERM
|
|
105
|
+
false
|
|
106
|
+
end
|
|
107
|
+
|
|
108
|
+
private
|
|
109
|
+
|
|
110
|
+
def start_stdio(binary)
|
|
111
|
+
@stdin, @stdout, @stderr, wait_thr = Open3.popen3(binary.to_s, 'mcp', '--transport', 'stdio')
|
|
112
|
+
@pid = wait_thr.pid
|
|
113
|
+
nil
|
|
114
|
+
end
|
|
115
|
+
|
|
116
|
+
def start_sse(binary)
|
|
117
|
+
@pid = spawn(
|
|
118
|
+
binary.to_s,
|
|
119
|
+
'mcp',
|
|
120
|
+
'--transport', 'sse',
|
|
121
|
+
out: $stdout,
|
|
122
|
+
err: $stderr
|
|
123
|
+
)
|
|
124
|
+
Process.detach(@pid)
|
|
125
|
+
sleep 1 # Give server time to start
|
|
126
|
+
@pid
|
|
127
|
+
end
|
|
128
|
+
|
|
129
|
+
def close_pipes
|
|
130
|
+
@stdin&.close
|
|
131
|
+
@stdout&.close
|
|
132
|
+
@stderr&.close
|
|
133
|
+
@stdin = @stdout = @stderr = nil
|
|
134
|
+
end
|
|
135
|
+
end
|
|
136
|
+
|
|
137
|
+
module_function
|
|
138
|
+
|
|
139
|
+
# Run MCP server with a block
|
|
140
|
+
#
|
|
141
|
+
# @param transport [String] Transport method
|
|
142
|
+
# @yield [Server] Yields server instance
|
|
143
|
+
# @return [Object] Block result
|
|
144
|
+
#
|
|
145
|
+
# @example
|
|
146
|
+
# Kreuzberg::MCPProxy.run(transport: 'stdio') do |server|
|
|
147
|
+
# server.send_message({ method: 'tools/list' })
|
|
148
|
+
# response = server.read_message
|
|
149
|
+
# end
|
|
150
|
+
#
|
|
151
|
+
def run(transport: 'stdio')
|
|
152
|
+
server = Server.new(transport: transport)
|
|
153
|
+
server.start
|
|
154
|
+
yield server
|
|
155
|
+
ensure
|
|
156
|
+
server&.stop
|
|
157
|
+
end
|
|
158
|
+
|
|
159
|
+
# Find the MCP binary
|
|
160
|
+
#
|
|
161
|
+
# @return [Pathname] Path to binary
|
|
162
|
+
# @raise [MissingBinaryError] If not found
|
|
163
|
+
#
|
|
164
|
+
def find_mcp_binary
|
|
165
|
+
# MCP is served by kreuzberg CLI
|
|
166
|
+
binary_name = Gem.win_platform? ? 'kreuzberg.exe' : 'kreuzberg'
|
|
167
|
+
found = CLIProxy.search_paths(binary_name).find(&:file?)
|
|
168
|
+
return found if found
|
|
169
|
+
|
|
170
|
+
raise MissingBinaryError, missing_binary_message
|
|
171
|
+
end
|
|
172
|
+
|
|
173
|
+
# Error message for missing binary
|
|
174
|
+
#
|
|
175
|
+
# @return [String]
|
|
176
|
+
#
|
|
177
|
+
def missing_binary_message
  # One entry per output line; the empty entry yields the blank separator line.
  lines = [
    'kreuzberg binary not found for MCP server. Build it with:',
    '`cargo build --release --package kreuzberg-cli`',
    '',
    'Or ensure kreuzberg is installed with MCP support.'
  ]

  lines.join("\n")
end
|
|
185
|
+
end
|
|
186
|
+
end
|
|
@@ -1,113 +1,113 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
module Kreuzberg
  # Contract for OCR backends written in Ruby.
  #
  # Any Ruby OCR backend that is registered with the Rust core through the FFI
  # bridge must implement the methods declared in this module.
  #
  # An OCR backend performs optical character recognition on images and scanned
  # documents. It is invoked when OCR is enabled in the extraction configuration.
  #
  # @example Implementing a custom OCR backend
  #   class CustomOcrBackend
  #     include Kreuzberg::OcrBackendProtocol
  #
  #     def name
  #       "custom-ocr"
  #     end
  #
  #     def process_image(image_bytes, config)
  #       # Perform OCR on image_bytes
  #       # This is a placeholder - integrate with a real OCR engine
  #       text = my_ocr_engine.recognize(image_bytes, language: config["language"])
  #       text
  #     end
  #   end
  #
  #   backend = CustomOcrBackend.new
  #   Kreuzberg.register_ocr_backend(backend.name, backend)
  #
  #   # Use in extraction
  #   result = Kreuzberg.extract_file_sync(
  #     "scanned.pdf",
  #     config: { ocr: { backend: "custom-ocr", language: "eng" } }
  #   )
  #
  # @example Implementing an OCR backend with initialization
  #   class ModelBasedOcr
  #     include Kreuzberg::OcrBackendProtocol
  #
  #     def initialize
  #       @model = nil
  #     end
  #
  #     def name
  #       "model-ocr"
  #     end
  #
  #     def process_image(image_bytes, config)
  #       # Load model on first use (lazy initialization)
  #       @model ||= load_model
  #
  #       # Run OCR
  #       @model.recognize(image_bytes, config)
  #     end
  #
  #     private
  #
  #     def load_model
  #       # Load ML model for OCR
  #       MyOcrModel.load("path/to/model")
  #     end
  #   end
  #
  #   Kreuzberg.register_ocr_backend("model-ocr", ModelBasedOcr.new)
  #
  module OcrBackendProtocol
    # Unique identifier of this OCR backend.
    #
    # ExtractionConfig refers to the backend by this name, for example:
    #
    #   config = { ocr: { backend: "custom-ocr", language: "eng" } }
    #
    # Use a lowercase, hyphen-separated string (e.g., "custom-ocr", "tesseract").
    # The default implementation only raises; including classes must override it.
    #
    # @return [String] Unique backend identifier
    # @raise [NotImplementedError] when the including class does not override it
    #
    # @example
    #   def name
    #     "custom-ocr"
    #   end
    def name
      message = "#{self.class} must implement #name"
      raise NotImplementedError, message
    end

    # Run OCR on raw image bytes and return the recognized text.
    #
    # Receives the binary image data (PNG, JPEG, TIFF, etc.) together with an
    # OCR configuration hash and must return the extracted text as a string.
    # The default implementation only raises; including classes must override it.
    #
    # The config hash contains OCR settings such as:
    # - "language" [String] - Language code (e.g., "eng", "deu", "fra")
    # - "backend" [String] - Backend name (same as #name)
    # - Additional backend-specific settings
    #
    # @param image_bytes [String] Binary image data (PNG, JPEG, TIFF, etc.)
    # @param config [Hash] OCR configuration with the following keys:
    #   - "language" [String] - Language code for OCR (e.g., "eng", "deu")
    #   - "backend" [String] - Backend name
    #
    # @return [String] Extracted text content
    # @raise [NotImplementedError] when the including class does not override it
    #
    # @example
    #   def process_image(image_bytes, config)
    #     language = config["language"] || "eng"
    #     text = my_ocr_engine.recognize(image_bytes, language: language)
    #     text
    #   end
    def process_image(image_bytes, config)
      message = "#{self.class} must implement #process_image(image_bytes, config)"
      raise NotImplementedError, message
    end
  end
end
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Kreuzberg
  # Contract for OCR backends written in Ruby.
  #
  # Any Ruby OCR backend that is registered with the Rust core through the FFI
  # bridge must implement the methods declared in this module.
  #
  # An OCR backend performs optical character recognition on images and scanned
  # documents. It is invoked when OCR is enabled in the extraction configuration.
  #
  # @example Implementing a custom OCR backend
  #   class CustomOcrBackend
  #     include Kreuzberg::OcrBackendProtocol
  #
  #     def name
  #       "custom-ocr"
  #     end
  #
  #     def process_image(image_bytes, config)
  #       # Perform OCR on image_bytes
  #       # This is a placeholder - integrate with a real OCR engine
  #       text = my_ocr_engine.recognize(image_bytes, language: config["language"])
  #       text
  #     end
  #   end
  #
  #   backend = CustomOcrBackend.new
  #   Kreuzberg.register_ocr_backend(backend.name, backend)
  #
  #   # Use in extraction
  #   result = Kreuzberg.extract_file_sync(
  #     "scanned.pdf",
  #     config: { ocr: { backend: "custom-ocr", language: "eng" } }
  #   )
  #
  # @example Implementing an OCR backend with initialization
  #   class ModelBasedOcr
  #     include Kreuzberg::OcrBackendProtocol
  #
  #     def initialize
  #       @model = nil
  #     end
  #
  #     def name
  #       "model-ocr"
  #     end
  #
  #     def process_image(image_bytes, config)
  #       # Load model on first use (lazy initialization)
  #       @model ||= load_model
  #
  #       # Run OCR
  #       @model.recognize(image_bytes, config)
  #     end
  #
  #     private
  #
  #     def load_model
  #       # Load ML model for OCR
  #       MyOcrModel.load("path/to/model")
  #     end
  #   end
  #
  #   Kreuzberg.register_ocr_backend("model-ocr", ModelBasedOcr.new)
  #
  module OcrBackendProtocol
    # Unique identifier of this OCR backend.
    #
    # ExtractionConfig refers to the backend by this name, for example:
    #
    #   config = { ocr: { backend: "custom-ocr", language: "eng" } }
    #
    # Use a lowercase, hyphen-separated string (e.g., "custom-ocr", "tesseract").
    # The default implementation only raises; including classes must override it.
    #
    # @return [String] Unique backend identifier
    # @raise [NotImplementedError] when the including class does not override it
    #
    # @example
    #   def name
    #     "custom-ocr"
    #   end
    def name
      message = "#{self.class} must implement #name"
      raise NotImplementedError, message
    end

    # Run OCR on raw image bytes and return the recognized text.
    #
    # Receives the binary image data (PNG, JPEG, TIFF, etc.) together with an
    # OCR configuration hash and must return the extracted text as a string.
    # The default implementation only raises; including classes must override it.
    #
    # The config hash contains OCR settings such as:
    # - "language" [String] - Language code (e.g., "eng", "deu", "fra")
    # - "backend" [String] - Backend name (same as #name)
    # - Additional backend-specific settings
    #
    # @param image_bytes [String] Binary image data (PNG, JPEG, TIFF, etc.)
    # @param config [Hash] OCR configuration with the following keys:
    #   - "language" [String] - Language code for OCR (e.g., "eng", "deu")
    #   - "backend" [String] - Backend name
    #
    # @return [String] Extracted text content
    # @raise [NotImplementedError] when the including class does not override it
    #
    # @example
    #   def process_image(image_bytes, config)
    #     language = config["language"] || "eng"
    #     text = my_ocr_engine.recognize(image_bytes, language: language)
    #     text
    #   end
    def process_image(image_bytes, config)
      message = "#{self.class} must implement #process_image(image_bytes, config)"
      raise NotImplementedError, message
    end
  end
end
|