kreuzberg 4.0.0.rc2 → 4.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +14 -14
- data/.rspec +3 -3
- data/.rubocop.yaml +1 -1
- data/.rubocop.yml +543 -538
- data/Gemfile +8 -8
- data/Gemfile.lock +194 -6
- data/README.md +391 -426
- data/Rakefile +34 -25
- data/Steepfile +51 -47
- data/examples/async_patterns.rb +283 -341
- data/ext/kreuzberg_rb/extconf.rb +65 -45
- data/ext/kreuzberg_rb/native/.cargo/config.toml +23 -0
- data/ext/kreuzberg_rb/native/Cargo.lock +7619 -6535
- data/ext/kreuzberg_rb/native/Cargo.toml +75 -44
- data/ext/kreuzberg_rb/native/README.md +425 -425
- data/ext/kreuzberg_rb/native/build.rs +15 -15
- data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -11
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -14
- data/ext/kreuzberg_rb/native/include/strings.h +20 -20
- data/ext/kreuzberg_rb/native/include/unistd.h +47 -47
- data/ext/kreuzberg_rb/native/src/lib.rs +3802 -2998
- data/extconf.rb +60 -28
- data/kreuzberg.gemspec +199 -148
- data/lib/kreuzberg/api_proxy.rb +126 -142
- data/lib/kreuzberg/cache_api.rb +67 -46
- data/lib/kreuzberg/cli.rb +47 -55
- data/lib/kreuzberg/cli_proxy.rb +117 -127
- data/lib/kreuzberg/config.rb +936 -691
- data/lib/kreuzberg/error_context.rb +136 -32
- data/lib/kreuzberg/errors.rb +116 -118
- data/lib/kreuzberg/extraction_api.rb +313 -85
- data/lib/kreuzberg/mcp_proxy.rb +177 -186
- data/lib/kreuzberg/ocr_backend_protocol.rb +40 -113
- data/lib/kreuzberg/post_processor_protocol.rb +15 -86
- data/lib/kreuzberg/result.rb +334 -216
- data/lib/kreuzberg/setup_lib_path.rb +99 -80
- data/lib/kreuzberg/types.rb +170 -0
- data/lib/kreuzberg/validator_protocol.rb +16 -89
- data/lib/kreuzberg/version.rb +5 -5
- data/lib/kreuzberg.rb +96 -103
- data/lib/libpdfium.so +0 -0
- data/sig/kreuzberg/internal.rbs +184 -184
- data/sig/kreuzberg.rbs +561 -520
- data/spec/binding/async_operations_spec.rb +473 -0
- data/spec/binding/batch_operations_spec.rb +595 -0
- data/spec/binding/batch_spec.rb +359 -0
- data/spec/binding/cache_spec.rb +227 -227
- data/spec/binding/cli_proxy_spec.rb +85 -85
- data/spec/binding/cli_spec.rb +55 -55
- data/spec/binding/config_result_spec.rb +377 -0
- data/spec/binding/config_spec.rb +419 -345
- data/spec/binding/config_validation_spec.rb +377 -283
- data/spec/binding/embeddings_spec.rb +816 -0
- data/spec/binding/error_handling_spec.rb +399 -213
- data/spec/binding/error_recovery_spec.rb +488 -0
- data/spec/binding/errors_spec.rb +66 -66
- data/spec/binding/font_config_spec.rb +220 -0
- data/spec/binding/images_spec.rb +738 -0
- data/spec/binding/keywords_extraction_spec.rb +600 -0
- data/spec/binding/metadata_types_spec.rb +1228 -0
- data/spec/binding/pages_extraction_spec.rb +471 -0
- data/spec/binding/plugins/ocr_backend_spec.rb +307 -307
- data/spec/binding/plugins/postprocessor_spec.rb +269 -269
- data/spec/binding/plugins/validator_spec.rb +273 -274
- data/spec/binding/tables_spec.rb +641 -0
- data/spec/fixtures/config.toml +38 -39
- data/spec/fixtures/config.yaml +41 -41
- data/spec/fixtures/invalid_config.toml +3 -4
- data/spec/smoke/package_spec.rb +177 -178
- data/spec/spec_helper.rb +40 -42
- data/spec/unit/config/chunking_config_spec.rb +213 -0
- data/spec/unit/config/embedding_config_spec.rb +343 -0
- data/spec/unit/config/extraction_config_spec.rb +438 -0
- data/spec/unit/config/font_config_spec.rb +285 -0
- data/spec/unit/config/hierarchy_config_spec.rb +314 -0
- data/spec/unit/config/image_extraction_config_spec.rb +209 -0
- data/spec/unit/config/image_preprocessing_config_spec.rb +249 -0
- data/spec/unit/config/keyword_config_spec.rb +229 -0
- data/spec/unit/config/language_detection_config_spec.rb +258 -0
- data/spec/unit/config/ocr_config_spec.rb +171 -0
- data/spec/unit/config/page_config_spec.rb +221 -0
- data/spec/unit/config/pdf_config_spec.rb +267 -0
- data/spec/unit/config/postprocessor_config_spec.rb +290 -0
- data/spec/unit/config/tesseract_config_spec.rb +181 -0
- data/spec/unit/config/token_reduction_config_spec.rb +251 -0
- data/test/metadata_types_test.rb +959 -0
- data/vendor/Cargo.toml +61 -0
- data/vendor/kreuzberg/Cargo.toml +259 -204
- data/vendor/kreuzberg/README.md +263 -175
- data/vendor/kreuzberg/build.rs +782 -474
- data/vendor/kreuzberg/examples/bench_fixes.rs +71 -0
- data/vendor/kreuzberg/examples/test_pdfium_fork.rs +62 -0
- data/vendor/kreuzberg/src/api/error.rs +81 -81
- data/vendor/kreuzberg/src/api/handlers.rs +320 -199
- data/vendor/kreuzberg/src/api/mod.rs +94 -79
- data/vendor/kreuzberg/src/api/server.rs +518 -353
- data/vendor/kreuzberg/src/api/types.rs +206 -170
- data/vendor/kreuzberg/src/cache/mod.rs +1167 -1167
- data/vendor/kreuzberg/src/chunking/mod.rs +2303 -677
- data/vendor/kreuzberg/src/chunking/processor.rs +219 -0
- data/vendor/kreuzberg/src/core/batch_mode.rs +95 -95
- data/vendor/kreuzberg/src/core/batch_optimizations.rs +385 -0
- data/vendor/kreuzberg/src/core/config.rs +1914 -1032
- data/vendor/kreuzberg/src/core/config_validation.rs +949 -0
- data/vendor/kreuzberg/src/core/extractor.rs +1200 -1024
- data/vendor/kreuzberg/src/core/formats.rs +235 -0
- data/vendor/kreuzberg/src/core/io.rs +329 -329
- data/vendor/kreuzberg/src/core/mime.rs +605 -605
- data/vendor/kreuzberg/src/core/mod.rs +61 -45
- data/vendor/kreuzberg/src/core/pipeline.rs +1223 -984
- data/vendor/kreuzberg/src/core/server_config.rs +1220 -0
- data/vendor/kreuzberg/src/embeddings.rs +471 -432
- data/vendor/kreuzberg/src/error.rs +431 -431
- data/vendor/kreuzberg/src/extraction/archive.rs +959 -954
- data/vendor/kreuzberg/src/extraction/capacity.rs +263 -0
- data/vendor/kreuzberg/src/extraction/docx.rs +404 -40
- data/vendor/kreuzberg/src/extraction/email.rs +855 -854
- data/vendor/kreuzberg/src/extraction/excel.rs +697 -688
- data/vendor/kreuzberg/src/extraction/html.rs +1830 -553
- data/vendor/kreuzberg/src/extraction/image.rs +492 -368
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +574 -563
- data/vendor/kreuzberg/src/extraction/markdown.rs +216 -213
- data/vendor/kreuzberg/src/extraction/mod.rs +93 -81
- data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -398
- data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -247
- data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -240
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -130
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +284 -287
- data/vendor/kreuzberg/src/extraction/pptx.rs +3102 -3000
- data/vendor/kreuzberg/src/extraction/structured.rs +491 -490
- data/vendor/kreuzberg/src/extraction/table.rs +329 -328
- data/vendor/kreuzberg/src/extraction/text.rs +277 -269
- data/vendor/kreuzberg/src/extraction/xml.rs +333 -333
- data/vendor/kreuzberg/src/extractors/archive.rs +447 -446
- data/vendor/kreuzberg/src/extractors/bibtex.rs +470 -469
- data/vendor/kreuzberg/src/extractors/docbook.rs +504 -502
- data/vendor/kreuzberg/src/extractors/docx.rs +400 -367
- data/vendor/kreuzberg/src/extractors/email.rs +157 -143
- data/vendor/kreuzberg/src/extractors/epub.rs +696 -707
- data/vendor/kreuzberg/src/extractors/excel.rs +385 -343
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +492 -491
- data/vendor/kreuzberg/src/extractors/html.rs +419 -393
- data/vendor/kreuzberg/src/extractors/image.rs +219 -198
- data/vendor/kreuzberg/src/extractors/jats.rs +1054 -1051
- data/vendor/kreuzberg/src/extractors/jupyter.rs +368 -367
- data/vendor/kreuzberg/src/extractors/latex.rs +653 -652
- data/vendor/kreuzberg/src/extractors/markdown.rs +701 -700
- data/vendor/kreuzberg/src/extractors/mod.rs +429 -365
- data/vendor/kreuzberg/src/extractors/odt.rs +628 -628
- data/vendor/kreuzberg/src/extractors/opml.rs +635 -634
- data/vendor/kreuzberg/src/extractors/orgmode.rs +529 -528
- data/vendor/kreuzberg/src/extractors/pdf.rs +761 -493
- data/vendor/kreuzberg/src/extractors/pptx.rs +279 -248
- data/vendor/kreuzberg/src/extractors/rst.rs +577 -576
- data/vendor/kreuzberg/src/extractors/rtf.rs +809 -810
- data/vendor/kreuzberg/src/extractors/security.rs +484 -484
- data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -367
- data/vendor/kreuzberg/src/extractors/structured.rs +142 -140
- data/vendor/kreuzberg/src/extractors/text.rs +265 -260
- data/vendor/kreuzberg/src/extractors/typst.rs +651 -650
- data/vendor/kreuzberg/src/extractors/xml.rs +147 -135
- data/vendor/kreuzberg/src/image/dpi.rs +164 -164
- data/vendor/kreuzberg/src/image/mod.rs +6 -6
- data/vendor/kreuzberg/src/image/preprocessing.rs +417 -417
- data/vendor/kreuzberg/src/image/resize.rs +89 -89
- data/vendor/kreuzberg/src/keywords/config.rs +154 -154
- data/vendor/kreuzberg/src/keywords/mod.rs +237 -237
- data/vendor/kreuzberg/src/keywords/processor.rs +275 -267
- data/vendor/kreuzberg/src/keywords/rake.rs +293 -293
- data/vendor/kreuzberg/src/keywords/types.rs +68 -68
- data/vendor/kreuzberg/src/keywords/yake.rs +163 -163
- data/vendor/kreuzberg/src/language_detection/mod.rs +985 -942
- data/vendor/kreuzberg/src/language_detection/processor.rs +218 -0
- data/vendor/kreuzberg/src/lib.rs +114 -105
- data/vendor/kreuzberg/src/mcp/mod.rs +35 -32
- data/vendor/kreuzberg/src/mcp/server.rs +2090 -1968
- data/vendor/kreuzberg/src/ocr/cache.rs +469 -469
- data/vendor/kreuzberg/src/ocr/error.rs +37 -37
- data/vendor/kreuzberg/src/ocr/hocr.rs +216 -216
- data/vendor/kreuzberg/src/ocr/language_registry.rs +520 -0
- data/vendor/kreuzberg/src/ocr/mod.rs +60 -58
- data/vendor/kreuzberg/src/ocr/processor.rs +858 -863
- data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -4
- data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -144
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +456 -450
- data/vendor/kreuzberg/src/ocr/types.rs +393 -393
- data/vendor/kreuzberg/src/ocr/utils.rs +47 -47
- data/vendor/kreuzberg/src/ocr/validation.rs +206 -206
- data/vendor/kreuzberg/src/panic_context.rs +154 -154
- data/vendor/kreuzberg/src/pdf/bindings.rs +306 -0
- data/vendor/kreuzberg/src/pdf/bundled.rs +408 -0
- data/vendor/kreuzberg/src/pdf/error.rs +214 -122
- data/vendor/kreuzberg/src/pdf/fonts.rs +358 -0
- data/vendor/kreuzberg/src/pdf/hierarchy.rs +903 -0
- data/vendor/kreuzberg/src/pdf/images.rs +139 -139
- data/vendor/kreuzberg/src/pdf/metadata.rs +509 -346
- data/vendor/kreuzberg/src/pdf/mod.rs +81 -50
- data/vendor/kreuzberg/src/pdf/rendering.rs +369 -369
- data/vendor/kreuzberg/src/pdf/table.rs +417 -393
- data/vendor/kreuzberg/src/pdf/text.rs +553 -158
- data/vendor/kreuzberg/src/plugins/extractor.rs +1042 -1013
- data/vendor/kreuzberg/src/plugins/mod.rs +212 -209
- data/vendor/kreuzberg/src/plugins/ocr.rs +637 -620
- data/vendor/kreuzberg/src/plugins/processor.rs +650 -642
- data/vendor/kreuzberg/src/plugins/registry.rs +1339 -1337
- data/vendor/kreuzberg/src/plugins/traits.rs +258 -258
- data/vendor/kreuzberg/src/plugins/validator.rs +967 -956
- data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -1470
- data/vendor/kreuzberg/src/text/mod.rs +27 -19
- data/vendor/kreuzberg/src/text/quality.rs +710 -697
- data/vendor/kreuzberg/src/text/quality_processor.rs +231 -0
- data/vendor/kreuzberg/src/text/string_utils.rs +229 -217
- data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -164
- data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -100
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +832 -796
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +923 -902
- data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -160
- data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -619
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +148 -147
- data/vendor/kreuzberg/src/text/utf8_validation.rs +193 -0
- data/vendor/kreuzberg/src/types.rs +1713 -903
- data/vendor/kreuzberg/src/utils/mod.rs +31 -17
- data/vendor/kreuzberg/src/utils/pool.rs +503 -0
- data/vendor/kreuzberg/src/utils/pool_sizing.rs +364 -0
- data/vendor/kreuzberg/src/utils/quality.rs +968 -959
- data/vendor/kreuzberg/src/utils/string_pool.rs +761 -0
- data/vendor/kreuzberg/src/utils/string_utils.rs +381 -381
- data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -53
- data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -482
- data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -261
- data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -400
- data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -1205
- data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -280
- data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -425
- data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -172
- data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -622
- data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -1300
- data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -175
- data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -734
- data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -37
- data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -100
- data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -801
- data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -693
- data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -111
- data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -162
- data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -226
- data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -41
- data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -196
- data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -227
- data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -181
- data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -791
- data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -47
- data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -760
- data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -634
- data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -136
- data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -84
- data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -681
- data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -64
- data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -51
- data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -476
- data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -163
- data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -1
- data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -101
- data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -477
- data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -490
- data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -415
- data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -223
- data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -331
- data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -562
- data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -436
- data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -561
- data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -193
- data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -448
- data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -32
- data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -33
- data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -76
- data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -129
- data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -54
- data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -118
- data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -149
- data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -506
- data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -75
- data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -519
- data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -647
- data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -62
- data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -796
- data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -31
- data/vendor/kreuzberg/tests/api_embed.rs +360 -0
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -52
- data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +471 -0
- data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +289 -0
- data/vendor/kreuzberg/tests/api_tests.rs +1472 -966
- data/vendor/kreuzberg/tests/archive_integration.rs +545 -543
- data/vendor/kreuzberg/tests/batch_orchestration.rs +587 -556
- data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +154 -0
- data/vendor/kreuzberg/tests/batch_processing.rs +328 -316
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -421
- data/vendor/kreuzberg/tests/concurrency_stress.rs +541 -525
- data/vendor/kreuzberg/tests/config_features.rs +612 -598
- data/vendor/kreuzberg/tests/config_integration_test.rs +753 -0
- data/vendor/kreuzberg/tests/config_loading_tests.rs +416 -415
- data/vendor/kreuzberg/tests/core_integration.rs +519 -510
- data/vendor/kreuzberg/tests/csv_integration.rs +414 -414
- data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +294 -0
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +500 -498
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -122
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -370
- data/vendor/kreuzberg/tests/email_integration.rs +327 -325
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -275
- data/vendor/kreuzberg/tests/error_handling.rs +402 -393
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -228
- data/vendor/kreuzberg/tests/format_integration.rs +165 -159
- data/vendor/kreuzberg/tests/helpers/mod.rs +202 -142
- data/vendor/kreuzberg/tests/html_table_test.rs +551 -551
- data/vendor/kreuzberg/tests/image_integration.rs +255 -253
- data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -139
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -639
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -704
- data/vendor/kreuzberg/tests/keywords_integration.rs +479 -479
- data/vendor/kreuzberg/tests/keywords_quality.rs +509 -509
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -496
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -490
- data/vendor/kreuzberg/tests/mime_detection.rs +429 -428
- data/vendor/kreuzberg/tests/ocr_configuration.rs +514 -510
- data/vendor/kreuzberg/tests/ocr_errors.rs +698 -676
- data/vendor/kreuzberg/tests/ocr_language_registry.rs +191 -0
- data/vendor/kreuzberg/tests/ocr_quality.rs +629 -627
- data/vendor/kreuzberg/tests/ocr_stress.rs +469 -469
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +674 -695
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -616
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -822
- data/vendor/kreuzberg/tests/page_markers.rs +297 -0
- data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +301 -0
- data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +589 -0
- data/vendor/kreuzberg/tests/pdf_integration.rs +45 -43
- data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +301 -0
- data/vendor/kreuzberg/tests/pdf_text_merging.rs +475 -0
- data/vendor/kreuzberg/tests/pdfium_linking.rs +340 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +1446 -1411
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +776 -771
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +577 -560
- data/vendor/kreuzberg/tests/plugin_system.rs +927 -921
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -783
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +587 -586
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +694 -692
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +775 -776
- data/vendor/kreuzberg/tests/security_validation.rs +416 -415
- data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -888
- data/vendor/kreuzberg/tests/test_fastembed.rs +631 -609
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1260 -1259
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +648 -647
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -87
- data/vendor/kreuzberg-ffi/Cargo.toml +67 -0
- data/vendor/kreuzberg-ffi/README.md +851 -0
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +227 -0
- data/vendor/kreuzberg-ffi/build.rs +168 -0
- data/vendor/kreuzberg-ffi/cbindgen.toml +37 -0
- data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +12 -0
- data/vendor/kreuzberg-ffi/kreuzberg.h +3012 -0
- data/vendor/kreuzberg-ffi/src/batch_streaming.rs +588 -0
- data/vendor/kreuzberg-ffi/src/config.rs +1341 -0
- data/vendor/kreuzberg-ffi/src/error.rs +901 -0
- data/vendor/kreuzberg-ffi/src/extraction.rs +555 -0
- data/vendor/kreuzberg-ffi/src/helpers.rs +879 -0
- data/vendor/kreuzberg-ffi/src/lib.rs +977 -0
- data/vendor/kreuzberg-ffi/src/memory.rs +493 -0
- data/vendor/kreuzberg-ffi/src/mime.rs +329 -0
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +265 -0
- data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +442 -0
- data/vendor/kreuzberg-ffi/src/plugins/mod.rs +14 -0
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +628 -0
- data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +438 -0
- data/vendor/kreuzberg-ffi/src/plugins/validator.rs +329 -0
- data/vendor/kreuzberg-ffi/src/result.rs +510 -0
- data/vendor/kreuzberg-ffi/src/result_pool.rs +639 -0
- data/vendor/kreuzberg-ffi/src/result_view.rs +773 -0
- data/vendor/kreuzberg-ffi/src/string_intern.rs +568 -0
- data/vendor/kreuzberg-ffi/src/types.rs +363 -0
- data/vendor/kreuzberg-ffi/src/util.rs +210 -0
- data/vendor/kreuzberg-ffi/src/validation.rs +848 -0
- data/vendor/kreuzberg-ffi/tests.disabled/README.md +48 -0
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +299 -0
- data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +346 -0
- data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +232 -0
- data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +470 -0
- data/vendor/kreuzberg-tesseract/.commitlintrc.json +13 -0
- data/vendor/kreuzberg-tesseract/.crate-ignore +2 -0
- data/vendor/kreuzberg-tesseract/Cargo.lock +2933 -0
- data/vendor/kreuzberg-tesseract/Cargo.toml +57 -0
- data/vendor/{rb-sys/LICENSE-MIT → kreuzberg-tesseract/LICENSE} +22 -21
- data/vendor/kreuzberg-tesseract/README.md +399 -0
- data/vendor/kreuzberg-tesseract/build.rs +1127 -0
- data/vendor/kreuzberg-tesseract/patches/README.md +71 -0
- data/vendor/kreuzberg-tesseract/patches/tesseract.diff +199 -0
- data/vendor/kreuzberg-tesseract/src/api.rs +1371 -0
- data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +77 -0
- data/vendor/kreuzberg-tesseract/src/enums.rs +297 -0
- data/vendor/kreuzberg-tesseract/src/error.rs +81 -0
- data/vendor/kreuzberg-tesseract/src/lib.rs +145 -0
- data/vendor/kreuzberg-tesseract/src/monitor.rs +57 -0
- data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +197 -0
- data/vendor/kreuzberg-tesseract/src/page_iterator.rs +253 -0
- data/vendor/kreuzberg-tesseract/src/result_iterator.rs +286 -0
- data/vendor/kreuzberg-tesseract/src/result_renderer.rs +183 -0
- data/vendor/kreuzberg-tesseract/tests/integration_test.rs +211 -0
- metadata +196 -45
- data/vendor/kreuzberg/benches/otel_overhead.rs +0 -48
- data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +0 -738
- data/vendor/rb-sys/.cargo-ok +0 -1
- data/vendor/rb-sys/.cargo_vcs_info.json +0 -6
- data/vendor/rb-sys/Cargo.lock +0 -393
- data/vendor/rb-sys/Cargo.toml +0 -70
- data/vendor/rb-sys/Cargo.toml.orig +0 -57
- data/vendor/rb-sys/LICENSE-APACHE +0 -190
- data/vendor/rb-sys/bin/release.sh +0 -21
- data/vendor/rb-sys/build/features.rs +0 -108
- data/vendor/rb-sys/build/main.rs +0 -246
- data/vendor/rb-sys/build/stable_api_config.rs +0 -153
- data/vendor/rb-sys/build/version.rs +0 -48
- data/vendor/rb-sys/readme.md +0 -36
- data/vendor/rb-sys/src/bindings.rs +0 -21
- data/vendor/rb-sys/src/hidden.rs +0 -11
- data/vendor/rb-sys/src/lib.rs +0 -34
- data/vendor/rb-sys/src/macros.rs +0 -371
- data/vendor/rb-sys/src/memory.rs +0 -53
- data/vendor/rb-sys/src/ruby_abi_version.rs +0 -38
- data/vendor/rb-sys/src/special_consts.rs +0 -31
- data/vendor/rb-sys/src/stable_api/compiled.c +0 -179
- data/vendor/rb-sys/src/stable_api/compiled.rs +0 -257
- data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +0 -316
- data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +0 -316
- data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +0 -324
- data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +0 -317
- data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +0 -315
- data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +0 -326
- data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +0 -327
- data/vendor/rb-sys/src/stable_api.rs +0 -261
- data/vendor/rb-sys/src/symbol.rs +0 -31
- data/vendor/rb-sys/src/tracking_allocator.rs +0 -332
- data/vendor/rb-sys/src/utils.rs +0 -89
- data/vendor/rb-sys/src/value_type.rs +0 -7
data/lib/kreuzberg/mcp_proxy.rb
CHANGED
|
@@ -1,186 +1,177 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require 'open3'
|
|
4
|
-
require 'pathname'
|
|
5
|
-
require 'json'
|
|
6
|
-
|
|
7
|
-
module Kreuzberg
|
|
8
|
-
#
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
#
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
@
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
end
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
#
|
|
140
|
-
#
|
|
141
|
-
#
|
|
142
|
-
#
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
<<~MSG.strip
|
|
179
|
-
kreuzberg binary not found for MCP server. Build it with:
|
|
180
|
-
`cargo build --release --package kreuzberg-cli`
|
|
181
|
-
|
|
182
|
-
Or ensure kreuzberg is installed with MCP support.
|
|
183
|
-
MSG
|
|
184
|
-
end
|
|
185
|
-
end
|
|
186
|
-
end
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'open3'
|
|
4
|
+
require 'pathname'
|
|
5
|
+
require 'json'
|
|
6
|
+
|
|
7
|
+
module Kreuzberg
|
|
8
|
+
# @example Start MCP server
|
|
9
|
+
module MCPProxy
|
|
10
|
+
Error = Class.new(Kreuzberg::Errors::Error)
|
|
11
|
+
MissingBinaryError = Class.new(Error)
|
|
12
|
+
ServerError = Class.new(Error)
|
|
13
|
+
|
|
14
|
+
# MCP server instance
|
|
15
|
+
class Server
|
|
16
|
+
attr_reader :pid, :transport
|
|
17
|
+
|
|
18
|
+
# Initialize MCP server
|
|
19
|
+
#
|
|
20
|
+
# @param transport [String] Transport method ("stdio" or "sse")
|
|
21
|
+
#
|
|
22
|
+
def initialize(transport: 'stdio')
|
|
23
|
+
@transport = transport
|
|
24
|
+
@pid = nil
|
|
25
|
+
@stdin = nil
|
|
26
|
+
@stdout = nil
|
|
27
|
+
@stderr = nil
|
|
28
|
+
end
|
|
29
|
+
|
|
30
|
+
# Start the MCP server
|
|
31
|
+
#
|
|
32
|
+
# @return [Integer, nil] Process ID (for SSE) or nil (for stdio)
|
|
33
|
+
#
|
|
34
|
+
def start
|
|
35
|
+
binary = MCPProxy.find_mcp_binary
|
|
36
|
+
|
|
37
|
+
case @transport
|
|
38
|
+
when 'stdio'
|
|
39
|
+
start_stdio(binary)
|
|
40
|
+
when 'sse'
|
|
41
|
+
start_sse(binary)
|
|
42
|
+
else
|
|
43
|
+
raise ServerError, "Unknown transport: #{@transport}"
|
|
44
|
+
end
|
|
45
|
+
end
|
|
46
|
+
|
|
47
|
+
# Stop the server
|
|
48
|
+
#
|
|
49
|
+
# @return [void]
|
|
50
|
+
#
|
|
51
|
+
def stop
|
|
52
|
+
return unless @pid
|
|
53
|
+
|
|
54
|
+
Process.kill('TERM', @pid)
|
|
55
|
+
Process.wait(@pid)
|
|
56
|
+
rescue Errno::ESRCH, Errno::ECHILD # rubocop:disable Lint/SuppressedException
|
|
57
|
+
ensure
|
|
58
|
+
@pid = nil
|
|
59
|
+
close_pipes
|
|
60
|
+
end
|
|
61
|
+
|
|
62
|
+
# Send a message to the server (stdio only)
|
|
63
|
+
#
|
|
64
|
+
# @param message [Hash] JSON-RPC message
|
|
65
|
+
# @return [void]
|
|
66
|
+
#
|
|
67
|
+
def send_message(message)
|
|
68
|
+
raise ServerError, 'Can only send messages in stdio mode' unless @transport == 'stdio'
|
|
69
|
+
raise ServerError, 'Server not started' unless @stdin
|
|
70
|
+
|
|
71
|
+
@stdin.puts(JSON.generate(message))
|
|
72
|
+
@stdin.flush
|
|
73
|
+
end
|
|
74
|
+
|
|
75
|
+
# Read a message from the server (stdio only)
|
|
76
|
+
#
|
|
77
|
+
# @return [Hash] JSON-RPC message
|
|
78
|
+
#
|
|
79
|
+
def read_message
|
|
80
|
+
raise ServerError, 'Can only read messages in stdio mode' unless @transport == 'stdio'
|
|
81
|
+
raise ServerError, 'Server not started' unless @stdout
|
|
82
|
+
|
|
83
|
+
line = @stdout.gets
|
|
84
|
+
JSON.parse(line) if line
|
|
85
|
+
end
|
|
86
|
+
|
|
87
|
+
# Check if server is running
|
|
88
|
+
#
|
|
89
|
+
# @return [Boolean]
|
|
90
|
+
#
|
|
91
|
+
def running?
|
|
92
|
+
return false unless @pid
|
|
93
|
+
|
|
94
|
+
Process.kill(0, @pid)
|
|
95
|
+
true
|
|
96
|
+
rescue Errno::ESRCH, Errno::EPERM
|
|
97
|
+
false
|
|
98
|
+
end
|
|
99
|
+
|
|
100
|
+
private
|
|
101
|
+
|
|
102
|
+
def start_stdio(binary)
|
|
103
|
+
@stdin, @stdout, @stderr, wait_thr = Open3.popen3(binary.to_s, 'mcp', '--transport', 'stdio')
|
|
104
|
+
@pid = wait_thr.pid
|
|
105
|
+
nil
|
|
106
|
+
end
|
|
107
|
+
|
|
108
|
+
def start_sse(binary)
|
|
109
|
+
@pid = spawn(
|
|
110
|
+
binary.to_s,
|
|
111
|
+
'mcp',
|
|
112
|
+
'--transport', 'sse',
|
|
113
|
+
out: $stdout,
|
|
114
|
+
err: $stderr
|
|
115
|
+
)
|
|
116
|
+
Process.detach(@pid)
|
|
117
|
+
sleep 1
|
|
118
|
+
@pid
|
|
119
|
+
end
|
|
120
|
+
|
|
121
|
+
def close_pipes
|
|
122
|
+
@stdin&.close
|
|
123
|
+
@stdout&.close
|
|
124
|
+
@stderr&.close
|
|
125
|
+
@stdin = @stdout = @stderr = nil
|
|
126
|
+
end
|
|
127
|
+
end
|
|
128
|
+
|
|
129
|
+
module_function
|
|
130
|
+
|
|
131
|
+
# Run MCP server with a block
|
|
132
|
+
#
|
|
133
|
+
# @param transport [String] Transport method
|
|
134
|
+
# @yield [Server] Yields server instance
|
|
135
|
+
# @return [Object] Block result
|
|
136
|
+
#
|
|
137
|
+
# @example
|
|
138
|
+
# Kreuzberg::MCPProxy.run(transport: 'stdio') do |server|
|
|
139
|
+
# server.send_message({ method: 'tools/list' })
|
|
140
|
+
# response = server.read_message
|
|
141
|
+
# end
|
|
142
|
+
#
|
|
143
|
+
def run(transport: 'stdio')
|
|
144
|
+
server = Server.new(transport: transport)
|
|
145
|
+
server.start
|
|
146
|
+
yield server
|
|
147
|
+
ensure
|
|
148
|
+
server&.stop
|
|
149
|
+
end
|
|
150
|
+
|
|
151
|
+
# Find the MCP binary
|
|
152
|
+
#
|
|
153
|
+
# @return [Pathname] Path to binary
|
|
154
|
+
# @raise [MissingBinaryError] If not found
|
|
155
|
+
#
|
|
156
|
+
def find_mcp_binary
|
|
157
|
+
binary_name = Gem.win_platform? ? 'kreuzberg.exe' : 'kreuzberg'
|
|
158
|
+
found = CLIProxy.search_paths(binary_name).find(&:file?)
|
|
159
|
+
return found if found
|
|
160
|
+
|
|
161
|
+
raise MissingBinaryError, missing_binary_message
|
|
162
|
+
end
|
|
163
|
+
|
|
164
|
+
# Error message for missing binary
|
|
165
|
+
#
|
|
166
|
+
# @return [String]
|
|
167
|
+
#
|
|
168
|
+
def missing_binary_message
|
|
169
|
+
<<~MSG.strip
|
|
170
|
+
kreuzberg binary not found for MCP server. Build it with:
|
|
171
|
+
`cargo build --release --package kreuzberg-cli`
|
|
172
|
+
|
|
173
|
+
Or ensure kreuzberg is installed with MCP support.
|
|
174
|
+
MSG
|
|
175
|
+
end
|
|
176
|
+
end
|
|
177
|
+
end
|
|
@@ -1,113 +1,40 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
module Kreuzberg
|
|
4
|
-
# OCR backend
|
|
5
|
-
#
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
# def initialize
|
|
42
|
-
# @model = nil
|
|
43
|
-
# end
|
|
44
|
-
#
|
|
45
|
-
# def name
|
|
46
|
-
# "model-ocr"
|
|
47
|
-
# end
|
|
48
|
-
#
|
|
49
|
-
# def process_image(image_bytes, config)
|
|
50
|
-
# # Load model on first use (lazy initialization)
|
|
51
|
-
# @model ||= load_model
|
|
52
|
-
#
|
|
53
|
-
# # Run OCR
|
|
54
|
-
# @model.recognize(image_bytes, config)
|
|
55
|
-
# end
|
|
56
|
-
#
|
|
57
|
-
# private
|
|
58
|
-
#
|
|
59
|
-
# def load_model
|
|
60
|
-
# # Load ML model for OCR
|
|
61
|
-
# MyOcrModel.load("path/to/model")
|
|
62
|
-
# end
|
|
63
|
-
# end
|
|
64
|
-
#
|
|
65
|
-
# Kreuzberg.register_ocr_backend("model-ocr", ModelBasedOcr.new)
|
|
66
|
-
#
|
|
67
|
-
module OcrBackendProtocol
|
|
68
|
-
# Return the unique name of this OCR backend.
|
|
69
|
-
#
|
|
70
|
-
# This name is used in ExtractionConfig to select the backend:
|
|
71
|
-
#
|
|
72
|
-
# config = { ocr: { backend: "custom-ocr", language: "eng" } }
|
|
73
|
-
#
|
|
74
|
-
# The name should be a lowercase string with hyphens (e.g., "custom-ocr", "tesseract").
|
|
75
|
-
#
|
|
76
|
-
# @return [String] Unique backend identifier
|
|
77
|
-
#
|
|
78
|
-
# @example
|
|
79
|
-
# def name
|
|
80
|
-
# "custom-ocr"
|
|
81
|
-
# end
|
|
82
|
-
def name
|
|
83
|
-
raise NotImplementedError, "#{self.class} must implement #name"
|
|
84
|
-
end
|
|
85
|
-
|
|
86
|
-
# Process image bytes and extract text via OCR.
|
|
87
|
-
#
|
|
88
|
-
# This method receives raw image data (PNG, JPEG, TIFF, etc.) and an OCR configuration
|
|
89
|
-
# hash. It must return the extracted text as a string.
|
|
90
|
-
#
|
|
91
|
-
# The config hash contains OCR settings such as:
|
|
92
|
-
# - "language" [String] - Language code (e.g., "eng", "deu", "fra")
|
|
93
|
-
# - "backend" [String] - Backend name (same as #name)
|
|
94
|
-
# - Additional backend-specific settings
|
|
95
|
-
#
|
|
96
|
-
# @param image_bytes [String] Binary image data (PNG, JPEG, TIFF, etc.)
|
|
97
|
-
# @param config [Hash] OCR configuration with the following keys:
|
|
98
|
-
# - "language" [String] - Language code for OCR (e.g., "eng", "deu")
|
|
99
|
-
# - "backend" [String] - Backend name
|
|
100
|
-
#
|
|
101
|
-
# @return [String] Extracted text content
|
|
102
|
-
#
|
|
103
|
-
# @example
|
|
104
|
-
# def process_image(image_bytes, config)
|
|
105
|
-
# language = config["language"] || "eng"
|
|
106
|
-
# text = my_ocr_engine.recognize(image_bytes, language: language)
|
|
107
|
-
# text
|
|
108
|
-
# end
|
|
109
|
-
def process_image(image_bytes, config)
|
|
110
|
-
raise NotImplementedError, "#{self.class} must implement #process_image(image_bytes, config)"
|
|
111
|
-
end
|
|
112
|
-
end
|
|
113
|
-
end
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Kreuzberg
|
|
4
|
+
# @example Implementing a custom OCR backend
|
|
5
|
+
# @example Implementing an OCR backend with initialization
|
|
6
|
+
module OcrBackendProtocol
|
|
7
|
+
# @return [String] Unique backend identifier
|
|
8
|
+
# @example
|
|
9
|
+
def name
|
|
10
|
+
raise NotImplementedError, "#{self.class} must implement #name"
|
|
11
|
+
end
|
|
12
|
+
|
|
13
|
+
# Process image bytes and extract text via OCR.
|
|
14
|
+
#
|
|
15
|
+
# This method receives raw image data (PNG, JPEG, TIFF, etc.) and an OCR configuration
|
|
16
|
+
# hash. It must return the extracted text as a string.
|
|
17
|
+
#
|
|
18
|
+
# The config hash contains OCR settings such as:
|
|
19
|
+
# - "language" [String] - Language code (e.g., "eng", "deu", "fra")
|
|
20
|
+
# - "backend" [String] - Backend name (same as #name)
|
|
21
|
+
# - Additional backend-specific settings
|
|
22
|
+
#
|
|
23
|
+
# @param image_bytes [String] Binary image data (PNG, JPEG, TIFF, etc.)
|
|
24
|
+
# @param config [Hash] OCR configuration with the following keys:
|
|
25
|
+
# - "language" [String] - Language code for OCR (e.g., "eng", "deu")
|
|
26
|
+
# - "backend" [String] - Backend name
|
|
27
|
+
#
|
|
28
|
+
# @return [String] Extracted text content
|
|
29
|
+
#
|
|
30
|
+
# @example
|
|
31
|
+
# def process_image(image_bytes, config)
|
|
32
|
+
# language = config["language"] || "eng"
|
|
33
|
+
# text = my_ocr_engine.recognize(image_bytes, language: language)
|
|
34
|
+
# text
|
|
35
|
+
# end
|
|
36
|
+
def process_image(image_bytes, config)
|
|
37
|
+
raise NotImplementedError, "#{self.class} must implement #process_image(image_bytes, config)"
|
|
38
|
+
end
|
|
39
|
+
end
|
|
40
|
+
end
|
|
@@ -1,86 +1,15 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
module Kreuzberg
|
|
4
|
-
#
|
|
5
|
-
#
|
|
6
|
-
#
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
# def call(result)
|
|
17
|
-
# result["content"] = result["content"].upcase
|
|
18
|
-
# result
|
|
19
|
-
# end
|
|
20
|
-
# end
|
|
21
|
-
#
|
|
22
|
-
# Kreuzberg.register_post_processor("upcase", UpcaseProcessor.new)
|
|
23
|
-
#
|
|
24
|
-
# @example Implementing a post-processor that adds metadata
|
|
25
|
-
# class EntityExtractor
|
|
26
|
-
# include Kreuzberg::PostProcessorProtocol
|
|
27
|
-
#
|
|
28
|
-
# def call(result)
|
|
29
|
-
# entities = extract_entities(result["content"])
|
|
30
|
-
# result["metadata"]["entities"] = entities
|
|
31
|
-
# result
|
|
32
|
-
# end
|
|
33
|
-
#
|
|
34
|
-
# private
|
|
35
|
-
#
|
|
36
|
-
# def extract_entities(text)
|
|
37
|
-
# # Extract named entities from text
|
|
38
|
-
# # This is a placeholder - use a real NER library in production
|
|
39
|
-
# text.scan(/[A-Z][a-z]+(?:\s[A-Z][a-z]+)*/)
|
|
40
|
-
# end
|
|
41
|
-
# end
|
|
42
|
-
#
|
|
43
|
-
# Kreuzberg.register_post_processor("entities", EntityExtractor.new)
|
|
44
|
-
#
|
|
45
|
-
# @example Using a Proc as a post-processor
|
|
46
|
-
# Kreuzberg.register_post_processor("word_count", ->(result) {
|
|
47
|
-
# word_count = result["content"].split.length
|
|
48
|
-
# result["metadata"]["word_count"] = word_count
|
|
49
|
-
# result
|
|
50
|
-
# })
|
|
51
|
-
#
|
|
52
|
-
module PostProcessorProtocol
|
|
53
|
-
# Process and enrich an extraction result.
|
|
54
|
-
#
|
|
55
|
-
# This method is called after extraction completes. It receives the extraction result
|
|
56
|
-
# as a hash and must return the modified hash. The processor can:
|
|
57
|
-
# - Add new keys to result["metadata"]
|
|
58
|
-
# - Transform result["content"]
|
|
59
|
-
# - Add entries to result["tables"]
|
|
60
|
-
# - Modify any other result fields
|
|
61
|
-
#
|
|
62
|
-
# Existing metadata keys will not be overwritten by the FFI bridge, so it's safe
|
|
63
|
-
# to add new keys without worrying about conflicts.
|
|
64
|
-
#
|
|
65
|
-
# @param result [Hash] Extraction result with the following structure:
|
|
66
|
-
# - "content" [String] - Extracted text content
|
|
67
|
-
# - "mime_type" [String] - MIME type of the source document
|
|
68
|
-
# - "metadata" [Hash] - Document metadata (title, author, etc.)
|
|
69
|
-
# - "tables" [Array<Hash>] - Extracted tables
|
|
70
|
-
# - "detected_languages" [Array<String>, nil] - Detected language codes
|
|
71
|
-
# - "chunks" [Array<String>, nil] - Content chunks (if chunking enabled)
|
|
72
|
-
#
|
|
73
|
-
# @return [Hash] Modified extraction result with enriched metadata
|
|
74
|
-
#
|
|
75
|
-
# @example
|
|
76
|
-
# def call(result)
|
|
77
|
-
# text = result["content"]
|
|
78
|
-
# entities = extract_entities(text)
|
|
79
|
-
# result["metadata"]["entities"] = entities
|
|
80
|
-
# result
|
|
81
|
-
# end
|
|
82
|
-
def call(result)
|
|
83
|
-
raise NotImplementedError, "#{self.class} must implement #call(result)"
|
|
84
|
-
end
|
|
85
|
-
end
|
|
86
|
-
end
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Kreuzberg
|
|
4
|
+
# @example Implementing a simple post-processor
|
|
5
|
+
# @example Implementing a post-processor that adds metadata
|
|
6
|
+
# @example Using a Proc as a post-processor
|
|
7
|
+
module PostProcessorProtocol
|
|
8
|
+
# @param result [Hash] Extraction result with the following structure:
|
|
9
|
+
# @return [Hash] Modified extraction result with enriched metadata
|
|
10
|
+
# @example
|
|
11
|
+
def call(result)
|
|
12
|
+
raise NotImplementedError, "#{self.class} must implement #call(result)"
|
|
13
|
+
end
|
|
14
|
+
end
|
|
15
|
+
end
|