kreuzberg 4.0.0.pre.rc.11 → 4.0.0.pre.rc.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +14 -14
- data/.rspec +3 -3
- data/.rubocop.yaml +1 -1
- data/.rubocop.yml +538 -538
- data/Gemfile +8 -8
- data/Gemfile.lock +2 -105
- data/README.md +454 -454
- data/Rakefile +25 -25
- data/Steepfile +47 -47
- data/examples/async_patterns.rb +341 -341
- data/ext/kreuzberg_rb/extconf.rb +45 -45
- data/ext/kreuzberg_rb/native/.cargo/config.toml +2 -2
- data/ext/kreuzberg_rb/native/Cargo.lock +6941 -6941
- data/ext/kreuzberg_rb/native/Cargo.toml +54 -54
- data/ext/kreuzberg_rb/native/README.md +425 -425
- data/ext/kreuzberg_rb/native/build.rs +15 -15
- data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -11
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -14
- data/ext/kreuzberg_rb/native/include/strings.h +20 -20
- data/ext/kreuzberg_rb/native/include/unistd.h +47 -47
- data/ext/kreuzberg_rb/native/src/lib.rs +3158 -3158
- data/extconf.rb +28 -28
- data/kreuzberg.gemspec +214 -214
- data/lib/kreuzberg/api_proxy.rb +142 -142
- data/lib/kreuzberg/cache_api.rb +81 -81
- data/lib/kreuzberg/cli.rb +55 -55
- data/lib/kreuzberg/cli_proxy.rb +127 -127
- data/lib/kreuzberg/config.rb +724 -724
- data/lib/kreuzberg/error_context.rb +80 -80
- data/lib/kreuzberg/errors.rb +118 -118
- data/lib/kreuzberg/extraction_api.rb +340 -340
- data/lib/kreuzberg/mcp_proxy.rb +186 -186
- data/lib/kreuzberg/ocr_backend_protocol.rb +113 -113
- data/lib/kreuzberg/post_processor_protocol.rb +86 -86
- data/lib/kreuzberg/result.rb +279 -279
- data/lib/kreuzberg/setup_lib_path.rb +80 -80
- data/lib/kreuzberg/validator_protocol.rb +89 -89
- data/lib/kreuzberg/version.rb +5 -5
- data/lib/kreuzberg.rb +109 -109
- data/lib/{libpdfium.dylib → pdfium.dll} +0 -0
- data/sig/kreuzberg/internal.rbs +184 -184
- data/sig/kreuzberg.rbs +546 -546
- data/spec/binding/cache_spec.rb +227 -227
- data/spec/binding/cli_proxy_spec.rb +85 -85
- data/spec/binding/cli_spec.rb +55 -55
- data/spec/binding/config_spec.rb +345 -345
- data/spec/binding/config_validation_spec.rb +283 -283
- data/spec/binding/error_handling_spec.rb +213 -213
- data/spec/binding/errors_spec.rb +66 -66
- data/spec/binding/plugins/ocr_backend_spec.rb +307 -307
- data/spec/binding/plugins/postprocessor_spec.rb +269 -269
- data/spec/binding/plugins/validator_spec.rb +274 -274
- data/spec/fixtures/config.toml +39 -39
- data/spec/fixtures/config.yaml +41 -41
- data/spec/fixtures/invalid_config.toml +4 -4
- data/spec/smoke/package_spec.rb +178 -178
- data/spec/spec_helper.rb +42 -42
- data/vendor/Cargo.toml +2 -1
- data/vendor/kreuzberg/Cargo.toml +2 -2
- data/vendor/kreuzberg/README.md +230 -230
- data/vendor/kreuzberg/benches/otel_overhead.rs +48 -48
- data/vendor/kreuzberg/build.rs +843 -843
- data/vendor/kreuzberg/src/api/error.rs +81 -81
- data/vendor/kreuzberg/src/api/handlers.rs +199 -199
- data/vendor/kreuzberg/src/api/mod.rs +79 -79
- data/vendor/kreuzberg/src/api/server.rs +353 -353
- data/vendor/kreuzberg/src/api/types.rs +170 -170
- data/vendor/kreuzberg/src/cache/mod.rs +1167 -1167
- data/vendor/kreuzberg/src/chunking/mod.rs +1877 -1877
- data/vendor/kreuzberg/src/chunking/processor.rs +220 -220
- data/vendor/kreuzberg/src/core/batch_mode.rs +95 -95
- data/vendor/kreuzberg/src/core/config.rs +1080 -1080
- data/vendor/kreuzberg/src/core/extractor.rs +1156 -1156
- data/vendor/kreuzberg/src/core/io.rs +329 -329
- data/vendor/kreuzberg/src/core/mime.rs +605 -605
- data/vendor/kreuzberg/src/core/mod.rs +47 -47
- data/vendor/kreuzberg/src/core/pipeline.rs +1184 -1184
- data/vendor/kreuzberg/src/embeddings.rs +500 -500
- data/vendor/kreuzberg/src/error.rs +431 -431
- data/vendor/kreuzberg/src/extraction/archive.rs +954 -954
- data/vendor/kreuzberg/src/extraction/docx.rs +398 -398
- data/vendor/kreuzberg/src/extraction/email.rs +854 -854
- data/vendor/kreuzberg/src/extraction/excel.rs +688 -688
- data/vendor/kreuzberg/src/extraction/html.rs +601 -601
- data/vendor/kreuzberg/src/extraction/image.rs +491 -491
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +574 -562
- data/vendor/kreuzberg/src/extraction/markdown.rs +213 -213
- data/vendor/kreuzberg/src/extraction/mod.rs +81 -81
- data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -398
- data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -247
- data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -240
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -130
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +284 -284
- data/vendor/kreuzberg/src/extraction/pptx.rs +3100 -3100
- data/vendor/kreuzberg/src/extraction/structured.rs +490 -490
- data/vendor/kreuzberg/src/extraction/table.rs +328 -328
- data/vendor/kreuzberg/src/extraction/text.rs +269 -269
- data/vendor/kreuzberg/src/extraction/xml.rs +333 -333
- data/vendor/kreuzberg/src/extractors/archive.rs +447 -447
- data/vendor/kreuzberg/src/extractors/bibtex.rs +470 -470
- data/vendor/kreuzberg/src/extractors/docbook.rs +504 -504
- data/vendor/kreuzberg/src/extractors/docx.rs +400 -400
- data/vendor/kreuzberg/src/extractors/email.rs +157 -157
- data/vendor/kreuzberg/src/extractors/epub.rs +708 -708
- data/vendor/kreuzberg/src/extractors/excel.rs +345 -345
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +492 -492
- data/vendor/kreuzberg/src/extractors/html.rs +407 -407
- data/vendor/kreuzberg/src/extractors/image.rs +219 -219
- data/vendor/kreuzberg/src/extractors/jats.rs +1054 -1054
- data/vendor/kreuzberg/src/extractors/jupyter.rs +368 -368
- data/vendor/kreuzberg/src/extractors/latex.rs +653 -653
- data/vendor/kreuzberg/src/extractors/markdown.rs +701 -701
- data/vendor/kreuzberg/src/extractors/mod.rs +429 -429
- data/vendor/kreuzberg/src/extractors/odt.rs +628 -628
- data/vendor/kreuzberg/src/extractors/opml.rs +635 -635
- data/vendor/kreuzberg/src/extractors/orgmode.rs +529 -529
- data/vendor/kreuzberg/src/extractors/pdf.rs +749 -722
- data/vendor/kreuzberg/src/extractors/pptx.rs +267 -267
- data/vendor/kreuzberg/src/extractors/rst.rs +577 -577
- data/vendor/kreuzberg/src/extractors/rtf.rs +809 -809
- data/vendor/kreuzberg/src/extractors/security.rs +484 -484
- data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -367
- data/vendor/kreuzberg/src/extractors/structured.rs +142 -142
- data/vendor/kreuzberg/src/extractors/text.rs +265 -265
- data/vendor/kreuzberg/src/extractors/typst.rs +651 -651
- data/vendor/kreuzberg/src/extractors/xml.rs +147 -147
- data/vendor/kreuzberg/src/image/dpi.rs +164 -164
- data/vendor/kreuzberg/src/image/mod.rs +6 -6
- data/vendor/kreuzberg/src/image/preprocessing.rs +417 -417
- data/vendor/kreuzberg/src/image/resize.rs +89 -89
- data/vendor/kreuzberg/src/keywords/config.rs +154 -154
- data/vendor/kreuzberg/src/keywords/mod.rs +237 -237
- data/vendor/kreuzberg/src/keywords/processor.rs +275 -275
- data/vendor/kreuzberg/src/keywords/rake.rs +293 -293
- data/vendor/kreuzberg/src/keywords/types.rs +68 -68
- data/vendor/kreuzberg/src/keywords/yake.rs +163 -163
- data/vendor/kreuzberg/src/language_detection/mod.rs +985 -985
- data/vendor/kreuzberg/src/language_detection/processor.rs +219 -219
- data/vendor/kreuzberg/src/lib.rs +113 -113
- data/vendor/kreuzberg/src/mcp/mod.rs +35 -35
- data/vendor/kreuzberg/src/mcp/server.rs +2076 -2076
- data/vendor/kreuzberg/src/ocr/cache.rs +469 -469
- data/vendor/kreuzberg/src/ocr/error.rs +37 -37
- data/vendor/kreuzberg/src/ocr/hocr.rs +216 -216
- data/vendor/kreuzberg/src/ocr/mod.rs +58 -58
- data/vendor/kreuzberg/src/ocr/processor.rs +863 -863
- data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -4
- data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -144
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +452 -452
- data/vendor/kreuzberg/src/ocr/types.rs +393 -393
- data/vendor/kreuzberg/src/ocr/utils.rs +47 -47
- data/vendor/kreuzberg/src/ocr/validation.rs +206 -206
- data/vendor/kreuzberg/src/panic_context.rs +154 -154
- data/vendor/kreuzberg/src/pdf/bindings.rs +44 -44
- data/vendor/kreuzberg/src/pdf/bundled.rs +346 -346
- data/vendor/kreuzberg/src/pdf/error.rs +130 -130
- data/vendor/kreuzberg/src/pdf/images.rs +139 -139
- data/vendor/kreuzberg/src/pdf/metadata.rs +489 -489
- data/vendor/kreuzberg/src/pdf/mod.rs +68 -68
- data/vendor/kreuzberg/src/pdf/rendering.rs +368 -368
- data/vendor/kreuzberg/src/pdf/table.rs +420 -420
- data/vendor/kreuzberg/src/pdf/text.rs +240 -240
- data/vendor/kreuzberg/src/plugins/extractor.rs +1044 -1044
- data/vendor/kreuzberg/src/plugins/mod.rs +212 -212
- data/vendor/kreuzberg/src/plugins/ocr.rs +639 -639
- data/vendor/kreuzberg/src/plugins/processor.rs +650 -650
- data/vendor/kreuzberg/src/plugins/registry.rs +1339 -1339
- data/vendor/kreuzberg/src/plugins/traits.rs +258 -258
- data/vendor/kreuzberg/src/plugins/validator.rs +967 -967
- data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -1470
- data/vendor/kreuzberg/src/text/mod.rs +25 -25
- data/vendor/kreuzberg/src/text/quality.rs +697 -697
- data/vendor/kreuzberg/src/text/quality_processor.rs +219 -219
- data/vendor/kreuzberg/src/text/string_utils.rs +217 -217
- data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -164
- data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -100
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -796
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -902
- data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -160
- data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -619
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -147
- data/vendor/kreuzberg/src/types.rs +1055 -1055
- data/vendor/kreuzberg/src/utils/mod.rs +17 -17
- data/vendor/kreuzberg/src/utils/quality.rs +959 -959
- data/vendor/kreuzberg/src/utils/string_utils.rs +381 -381
- data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -53
- data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -482
- data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -261
- data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -400
- data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -1205
- data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -280
- data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -425
- data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -172
- data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -622
- data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -1300
- data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -175
- data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -734
- data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -37
- data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -100
- data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -801
- data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -693
- data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -111
- data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -162
- data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -226
- data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -41
- data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -196
- data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -227
- data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -181
- data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -791
- data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -47
- data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -760
- data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -634
- data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -136
- data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -84
- data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -681
- data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -64
- data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -51
- data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -476
- data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -163
- data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -1
- data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -101
- data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -477
- data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -490
- data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -415
- data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -223
- data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -331
- data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -562
- data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -436
- data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -561
- data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -193
- data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -448
- data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -32
- data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -33
- data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -76
- data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -129
- data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -54
- data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -118
- data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -149
- data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -506
- data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -75
- data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -519
- data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -647
- data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -62
- data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -796
- data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -31
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -52
- data/vendor/kreuzberg/tests/api_tests.rs +966 -966
- data/vendor/kreuzberg/tests/archive_integration.rs +545 -545
- data/vendor/kreuzberg/tests/batch_orchestration.rs +556 -556
- data/vendor/kreuzberg/tests/batch_processing.rs +318 -318
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -421
- data/vendor/kreuzberg/tests/concurrency_stress.rs +533 -533
- data/vendor/kreuzberg/tests/config_features.rs +612 -612
- data/vendor/kreuzberg/tests/config_loading_tests.rs +416 -416
- data/vendor/kreuzberg/tests/core_integration.rs +510 -510
- data/vendor/kreuzberg/tests/csv_integration.rs +414 -414
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +500 -500
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -122
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -370
- data/vendor/kreuzberg/tests/email_integration.rs +327 -327
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -275
- data/vendor/kreuzberg/tests/error_handling.rs +402 -402
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -228
- data/vendor/kreuzberg/tests/format_integration.rs +164 -164
- data/vendor/kreuzberg/tests/helpers/mod.rs +142 -142
- data/vendor/kreuzberg/tests/html_table_test.rs +551 -551
- data/vendor/kreuzberg/tests/image_integration.rs +255 -255
- data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -139
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -639
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -704
- data/vendor/kreuzberg/tests/keywords_integration.rs +479 -479
- data/vendor/kreuzberg/tests/keywords_quality.rs +509 -509
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -496
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -490
- data/vendor/kreuzberg/tests/mime_detection.rs +429 -429
- data/vendor/kreuzberg/tests/ocr_configuration.rs +514 -514
- data/vendor/kreuzberg/tests/ocr_errors.rs +698 -698
- data/vendor/kreuzberg/tests/ocr_quality.rs +629 -629
- data/vendor/kreuzberg/tests/ocr_stress.rs +469 -469
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +674 -674
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -616
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -822
- data/vendor/kreuzberg/tests/pdf_integration.rs +45 -45
- data/vendor/kreuzberg/tests/pdfium_linking.rs +374 -374
- data/vendor/kreuzberg/tests/pipeline_integration.rs +1436 -1436
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +776 -776
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +560 -560
- data/vendor/kreuzberg/tests/plugin_system.rs +927 -927
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -783
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +587 -587
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +694 -694
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +775 -775
- data/vendor/kreuzberg/tests/security_validation.rs +416 -416
- data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -888
- data/vendor/kreuzberg/tests/test_fastembed.rs +631 -631
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1260 -1260
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +648 -648
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -87
- data/vendor/kreuzberg-ffi/Cargo.toml +3 -3
- data/vendor/kreuzberg-ffi/README.md +851 -851
- data/vendor/kreuzberg-ffi/build.rs +176 -176
- data/vendor/kreuzberg-ffi/cbindgen.toml +27 -27
- data/vendor/kreuzberg-ffi/kreuzberg-ffi-install.pc +12 -12
- data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +12 -12
- data/vendor/kreuzberg-ffi/kreuzberg.h +1087 -1087
- data/vendor/kreuzberg-ffi/src/lib.rs +3616 -3616
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +247 -247
- data/vendor/kreuzberg-ffi/tests.disabled/README.md +48 -48
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +299 -299
- data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +346 -346
- data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +232 -232
- data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +470 -470
- data/vendor/kreuzberg-tesseract/.commitlintrc.json +13 -13
- data/vendor/kreuzberg-tesseract/.crate-ignore +2 -2
- data/vendor/kreuzberg-tesseract/Cargo.lock +2933 -2933
- data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
- data/vendor/kreuzberg-tesseract/LICENSE +22 -22
- data/vendor/kreuzberg-tesseract/README.md +399 -399
- data/vendor/kreuzberg-tesseract/build.rs +1354 -1354
- data/vendor/kreuzberg-tesseract/patches/README.md +71 -71
- data/vendor/kreuzberg-tesseract/patches/tesseract.diff +199 -199
- data/vendor/kreuzberg-tesseract/src/api.rs +1371 -1371
- data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +77 -77
- data/vendor/kreuzberg-tesseract/src/enums.rs +297 -297
- data/vendor/kreuzberg-tesseract/src/error.rs +81 -81
- data/vendor/kreuzberg-tesseract/src/lib.rs +145 -145
- data/vendor/kreuzberg-tesseract/src/monitor.rs +57 -57
- data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +197 -197
- data/vendor/kreuzberg-tesseract/src/page_iterator.rs +253 -253
- data/vendor/kreuzberg-tesseract/src/result_iterator.rs +286 -286
- data/vendor/kreuzberg-tesseract/src/result_renderer.rs +183 -183
- data/vendor/kreuzberg-tesseract/tests/integration_test.rs +211 -211
- data/vendor/rb-sys/.cargo_vcs_info.json +5 -5
- data/vendor/rb-sys/Cargo.lock +393 -393
- data/vendor/rb-sys/Cargo.toml +70 -70
- data/vendor/rb-sys/Cargo.toml.orig +57 -57
- data/vendor/rb-sys/LICENSE-APACHE +190 -190
- data/vendor/rb-sys/LICENSE-MIT +21 -21
- data/vendor/rb-sys/build/features.rs +111 -111
- data/vendor/rb-sys/build/main.rs +286 -286
- data/vendor/rb-sys/build/stable_api_config.rs +155 -155
- data/vendor/rb-sys/build/version.rs +50 -50
- data/vendor/rb-sys/readme.md +36 -36
- data/vendor/rb-sys/src/bindings.rs +21 -21
- data/vendor/rb-sys/src/hidden.rs +11 -11
- data/vendor/rb-sys/src/lib.rs +35 -35
- data/vendor/rb-sys/src/macros.rs +371 -371
- data/vendor/rb-sys/src/memory.rs +53 -53
- data/vendor/rb-sys/src/ruby_abi_version.rs +38 -38
- data/vendor/rb-sys/src/special_consts.rs +31 -31
- data/vendor/rb-sys/src/stable_api/compiled.c +179 -179
- data/vendor/rb-sys/src/stable_api/compiled.rs +257 -257
- data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +324 -324
- data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +332 -332
- data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +325 -325
- data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +323 -323
- data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +339 -339
- data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +339 -339
- data/vendor/rb-sys/src/stable_api.rs +260 -260
- data/vendor/rb-sys/src/symbol.rs +31 -31
- data/vendor/rb-sys/src/tracking_allocator.rs +330 -330
- data/vendor/rb-sys/src/utils.rs +89 -89
- data/vendor/rb-sys/src/value_type.rs +7 -7
- metadata +7 -80
|
@@ -1,425 +1,425 @@
|
|
|
1
|
-
# kreuzberg-rb
|
|
2
|
-
|
|
3
|
-
Magnus bindings for the Kreuzberg document intelligence library.
|
|
4
|
-
|
|
5
|
-
## Overview
|
|
6
|
-
|
|
7
|
-
This crate provides Ruby bindings to the Rust core library (`crates/kreuzberg`) using Magnus. It exposes extraction functions, configuration types, and plugin registration APIs to Ruby.
|
|
8
|
-
|
|
9
|
-
## Architecture
|
|
10
|
-
|
|
11
|
-
### Binding Layers
|
|
12
|
-
|
|
13
|
-
```
|
|
14
|
-
Ruby Package (packages/ruby/)
|
|
15
|
-
↓
|
|
16
|
-
Magnus Bindings (packages/ruby/ext/kreuzberg_rb/native) ← This crate
|
|
17
|
-
↓
|
|
18
|
-
Rust Core (crates/kreuzberg)
|
|
19
|
-
```
|
|
20
|
-
|
|
21
|
-
### Key Components
|
|
22
|
-
|
|
23
|
-
- **Core API** (`src/lib.rs`): Extraction functions (sync & async variants)
|
|
24
|
-
- **Configuration Parsing**: Ruby Hash to Rust config conversion
|
|
25
|
-
- **Type Conversion**: Rust results to Ruby hashes
|
|
26
|
-
- **Plugin Bridges**: Ruby plugin registration (PostProcessor, Validator, OcrBackend)
|
|
27
|
-
- **Cache Management**: Cache utilities
|
|
28
|
-
|
|
29
|
-
## Async Runtime Implementation
|
|
30
|
-
|
|
31
|
-
### Current State: Limited Async Support
|
|
32
|
-
|
|
33
|
-
Unlike NAPI-RS (Node.js), which has built-in async support, and PyO3 (Python), which has `pyo3-async-runtimes`, Magnus **does not have an equivalent async runtime integration**. Ruby bindings use a different async pattern:
|
|
34
|
-
|
|
35
|
-
#### Async Functions Use Tokio Runtime with GVL Blocking
|
|
36
|
-
|
|
37
|
-
**Implementation** (from `src/lib.rs:584-602`):
|
|
38
|
-
```rust
|
|
39
|
-
fn extract_file(args: &[Value]) -> Result<RHash, Error> {
|
|
40
|
-
let ruby = Ruby::get().expect("Ruby not initialized");
|
|
41
|
-
let args = scan_args::<(String,), (Option<String>,), (), (), RHash, ()>(args)?;
|
|
42
|
-
let (path,) = args.required;
|
|
43
|
-
let (mime_type,) = args.optional;
|
|
44
|
-
let opts = Some(args.keywords);
|
|
45
|
-
|
|
46
|
-
let config = parse_extraction_config(&ruby, opts)?;
|
|
47
|
-
|
|
48
|
-
// Use Tokio runtime to block on async function
|
|
49
|
-
let runtime = tokio::runtime::Runtime::new()
|
|
50
|
-
.map_err(|e| runtime_error(format!("Failed to create Tokio runtime: {}", e)))?;
|
|
51
|
-
|
|
52
|
-
let result = runtime
|
|
53
|
-
.block_on(async { kreuzberg::extract_file(&path, mime_type.as_deref(), &config).await })
|
|
54
|
-
.map_err(kreuzberg_error)?;
|
|
55
|
-
|
|
56
|
-
extraction_result_to_ruby(&ruby, result)
|
|
57
|
-
}
|
|
58
|
-
```
|
|
59
|
-
|
|
60
|
-
**What This Means**:
|
|
61
|
-
- ✅ **Works correctly** - Executes async Rust code successfully
|
|
62
|
-
- ⚠️ **Blocks Ruby thread** - Ruby thread frozen during async operations
|
|
63
|
-
- ❌ **No concurrency** - No performance benefit over synchronous calls from Ruby's perspective
|
|
64
|
-
- ❌ **GVL held** - Global VM Lock held during entire async operation
|
|
65
|
-
|
|
66
|
-
**Ruby Usage**:
|
|
67
|
-
```ruby
|
|
68
|
-
# This looks like it might be async, but it blocks the Ruby thread
|
|
69
|
-
result = Kreuzberg.extract_file("document.pdf")
|
|
70
|
-
```
|
|
71
|
-
|
|
72
|
-
### Why Magnus Differs from PyO3
|
|
73
|
-
|
|
74
|
-
| Feature | Magnus (Ruby) | PyO3 (Python) | NAPI-RS (Node.js) |
|
|
75
|
-
|---------|---------------|---------------|-------------------|
|
|
76
|
-
| Async Method Support | ❌ No | ✅ Yes | ✅ Yes |
|
|
77
|
-
| Runtime Integration | Manual `block_on()` | `pyo3_async_runtimes` | Built-in |
|
|
78
|
-
| GVL/GIL Release | ❌ Not available | ✅ Automatic | ✅ N/A (no GIL) |
|
|
79
|
-
| Coroutine Detection | ❌ N/A | ✅ `__await__` check | ✅ N/A (Promises) |
|
|
80
|
-
| Performance Optimization | ❌ Not possible | ✅ ~28x overhead reduction | ✅ ~0ms overhead |
|
|
81
|
-
|
|
82
|
-
### GVL Management
|
|
83
|
-
|
|
84
|
-
**Current State** (from Magnus maintainer matsadler):
|
|
85
|
-
> "Ruby does have a function to release the GVL called `rb_thread_call_without_gvl`, but it's hard to use correctly and Magnus doesn't expose it yet."
|
|
86
|
-
|
|
87
|
-
**Implications**:
|
|
88
|
-
- Async Rust operations **block the Ruby GVL**
|
|
89
|
-
- No concurrent Ruby execution during async Rust calls
|
|
90
|
-
- Performance similar to synchronous operations from Ruby's perspective
|
|
91
|
-
|
|
92
|
-
**Future Enhancement**: Safe `rb_thread_call_without_gvl` integration could enable:
|
|
93
|
-
- True concurrent async operations
|
|
94
|
-
- GVL release during Rust async waits
|
|
95
|
-
- Performance improvements for I/O-bound operations
|
|
96
|
-
|
|
97
|
-
### Comparison: Sync vs Async Functions
|
|
98
|
-
|
|
99
|
-
Both approaches currently have **equivalent performance** from Ruby's perspective:
|
|
100
|
-
|
|
101
|
-
**Synchronous Function**:
|
|
102
|
-
```rust
|
|
103
|
-
fn extract_file_sync(args: &[Value]) -> Result<RHash, Error> {
|
|
104
|
-
let config = parse_extraction_config(&ruby, opts)?;
|
|
105
|
-
let result = kreuzberg::extract_file_sync(&path, mime_type.as_deref(), &config)
|
|
106
|
-
.map_err(kreuzberg_error)?;
|
|
107
|
-
extraction_result_to_ruby(&ruby, result)
|
|
108
|
-
}
|
|
109
|
-
```
|
|
110
|
-
|
|
111
|
-
**Asynchronous Function**:
|
|
112
|
-
```rust
|
|
113
|
-
fn extract_file(args: &[Value]) -> Result<RHash, Error> {
|
|
114
|
-
let config = parse_extraction_config(&ruby, opts)?;
|
|
115
|
-
let runtime = tokio::runtime::Runtime::new()?;
|
|
116
|
-
let result = runtime
|
|
117
|
-
.block_on(async { kreuzberg::extract_file(&path, mime_type.as_deref(), &config).await })
|
|
118
|
-
.map_err(kreuzberg_error)?;
|
|
119
|
-
extraction_result_to_ruby(&ruby, result)
|
|
120
|
-
}
|
|
121
|
-
```
|
|
122
|
-
|
|
123
|
-
**Performance**: Both block the Ruby thread for the same duration. Use the `_sync` variants for clarity.
|
|
124
|
-
|
|
125
|
-
## Plugin System
|
|
126
|
-
|
|
127
|
-
The bindings support Ruby-based plugins through the trait-based plugin system:
|
|
128
|
-
|
|
129
|
-
### Ruby PostProcessor Plugin
|
|
130
|
-
|
|
131
|
-
```ruby
|
|
132
|
-
Kreuzberg.register_post_processor("uppercase", ->(result) {
|
|
133
|
-
result[:content] = result[:content].upcase
|
|
134
|
-
result
|
|
135
|
-
}, 100)
|
|
136
|
-
```
|
|
137
|
-
|
|
138
|
-
**Implementation** (`src/lib.rs:831-939`):
|
|
139
|
-
- Wraps Ruby Proc in `RubyPostProcessor` struct
|
|
140
|
-
- Implements `PostProcessor` trait
|
|
141
|
-
- Marked `unsafe impl Send + Sync` (safe due to Ruby GVL)
|
|
142
|
-
- Converts Rust result → Ruby hash → calls Proc → converts back
|
|
143
|
-
|
|
144
|
-
### Ruby Validator Plugin
|
|
145
|
-
|
|
146
|
-
```ruby
|
|
147
|
-
Kreuzberg.register_validator("min_length", ->(result) {
|
|
148
|
-
raise "Content too short" if result[:content].length < 100
|
|
149
|
-
}, 100)
|
|
150
|
-
```
|
|
151
|
-
|
|
152
|
-
**Implementation** (`src/lib.rs:954-1047`):
|
|
153
|
-
- Wraps Ruby Proc in `RubyValidator` struct
|
|
154
|
-
- Implements `Validator` trait
|
|
155
|
-
- Validates extraction results
|
|
156
|
-
- Can raise Ruby exceptions for validation failures
|
|
157
|
-
|
|
158
|
-
### Ruby OCR Backend Plugin
|
|
159
|
-
|
|
160
|
-
```ruby
|
|
161
|
-
class CustomOcr
|
|
162
|
-
def process_image(image_bytes, language)
|
|
163
|
-
# Return extracted text
|
|
164
|
-
"Extracted text"
|
|
165
|
-
end
|
|
166
|
-
|
|
167
|
-
def supports_language?(lang)
|
|
168
|
-
%w[eng deu fra].include?(lang)
|
|
169
|
-
end
|
|
170
|
-
end
|
|
171
|
-
|
|
172
|
-
Kreuzberg.register_ocr_backend("custom", CustomOcr.new)
|
|
173
|
-
```
|
|
174
|
-
|
|
175
|
-
**Implementation** (`src/lib.rs:1070-1169`):
|
|
176
|
-
- Wraps Ruby object in `RubyOcrBackend` struct
|
|
177
|
-
- Implements `OcrBackend` trait
|
|
178
|
-
- Calls Ruby methods for OCR processing
|
|
179
|
-
- **Blocks GVL during OCR** (no async support)
|
|
180
|
-
|
|
181
|
-
**Note**: Ruby OCR backends will block the GVL during processing. For I/O-bound OCR operations, consider using Ruby threads or background jobs.
|
|
182
|
-
|
|
183
|
-
## Thread Safety
|
|
184
|
-
|
|
185
|
-
All Ruby plugin wrappers are marked `unsafe impl Send + Sync`:
|
|
186
|
-
|
|
187
|
-
```rust
|
|
188
|
-
// SAFETY: We mark this as Send+Sync because Ruby Global VM Lock (GVL)
|
|
189
|
-
// ensures thread safety. magnus::Value is thread-safe under GVL.
|
|
190
|
-
struct RubyPostProcessor {
|
|
191
|
-
name: String,
|
|
192
|
-
processor: magnus::Value,
|
|
193
|
-
}
|
|
194
|
-
|
|
195
|
-
// SAFETY: Ruby operations are protected by the Global VM Lock
|
|
196
|
-
unsafe impl Send for RubyPostProcessor {}
|
|
197
|
-
unsafe impl Sync for RubyPostProcessor {}
|
|
198
|
-
```
|
|
199
|
-
|
|
200
|
-
**Justification**:
|
|
201
|
-
- Ruby's Global VM Lock ensures thread safety
|
|
202
|
-
- `magnus::Value` is thread-safe under GVL
|
|
203
|
-
- Rust async runtime can safely schedule Ruby callbacks
|
|
204
|
-
- GVL prevents concurrent Ruby execution
|
|
205
|
-
|
|
206
|
-
## Building
|
|
207
|
-
|
|
208
|
-
### Development Build
|
|
209
|
-
|
|
210
|
-
```bash
|
|
211
|
-
cd packages/ruby
|
|
212
|
-
bundle exec rake compile
|
|
213
|
-
```
|
|
214
|
-
|
|
215
|
-
### Testing
|
|
216
|
-
|
|
217
|
-
Run Ruby tests that exercise the bindings:
|
|
218
|
-
|
|
219
|
-
```bash
|
|
220
|
-
bundle exec rspec
|
|
221
|
-
```
|
|
222
|
-
|
|
223
|
-
## Features
|
|
224
|
-
|
|
225
|
-
### Default Features
|
|
226
|
-
|
|
227
|
-
- None currently
|
|
228
|
-
|
|
229
|
-
### Optional Features
|
|
230
|
-
|
|
231
|
-
All features are passed through from the `kreuzberg` crate via `features = ["full"]`.
|
|
232
|
-
|
|
233
|
-
## Dependencies
|
|
234
|
-
|
|
235
|
-
- `magnus` - Git dependency (specific rev: `f6db117`)
|
|
236
|
-
- `tokio = "1.48"` with `rt` and `macros` features
|
|
237
|
-
- `async-trait = "0.1"` for async trait methods
|
|
238
|
-
- `serde_json = "1.0"` for metadata serialization
|
|
239
|
-
|
|
240
|
-
## Key Files
|
|
241
|
-
|
|
242
|
-
- `src/lib.rs`: All bindings code (extraction API, config parsing, plugin registration)
|
|
243
|
-
- `build.rs`: Build script for Rust extension compilation
|
|
244
|
-
|
|
245
|
-
## References
|
|
246
|
-
|
|
247
|
-
- **Magnus Documentation**: https://docs.rs/magnus
|
|
248
|
-
- **Magnus GitHub**: https://github.com/matsadler/magnus
|
|
249
|
-
- **Kreuzberg Core**: `../kreuzberg/`
|
|
250
|
-
- **Ruby Package**: `../../packages/ruby/`
|
|
251
|
-
|
|
252
|
-
## Performance Considerations
|
|
253
|
-
|
|
254
|
-
### For Plugin Authors
|
|
255
|
-
|
|
256
|
-
1. **Sync is currently equivalent to async**: Both block the Ruby GVL
|
|
257
|
-
```ruby
|
|
258
|
-
# These have equivalent performance
|
|
259
|
-
result = Kreuzberg.extract_file_sync("document.pdf")
|
|
260
|
-
result = Kreuzberg.extract_file("document.pdf")
|
|
261
|
-
```
|
|
262
|
-
|
|
263
|
-
2. **Use Ruby threads for concurrency**:
|
|
264
|
-
```ruby
|
|
265
|
-
# Process multiple files concurrently
|
|
266
|
-
files = ["doc1.pdf", "doc2.pdf", "doc3.pdf"]
|
|
267
|
-
threads = files.map do |file|
|
|
268
|
-
Thread.new { Kreuzberg.extract_file_sync(file) }
|
|
269
|
-
end
|
|
270
|
-
results = threads.map(&:value)
|
|
271
|
-
```
|
|
272
|
-
|
|
273
|
-
3. **Batch API is more efficient**:
|
|
274
|
-
```ruby
|
|
275
|
-
# Prefer batch API for multiple files
|
|
276
|
-
results = Kreuzberg.batch_extract_files_sync(["doc1.pdf", "doc2.pdf"])
|
|
277
|
-
```
|
|
278
|
-
|
|
279
|
-
### For Contributors
|
|
280
|
-
|
|
281
|
-
1. **Prefer `_sync` variants** - Clearer intent, same performance
|
|
282
|
-
2. **Async functions exist** for API compatibility with other language bindings
|
|
283
|
-
3. **Do NOT use `spawn_blocking`** - GVL already blocks, no benefit
|
|
284
|
-
4. **Create new Runtime per call** - Safe under GVL, no overhead from reuse
|
|
285
|
-
5. **Monitor Magnus development** for `rb_thread_call_without_gvl` exposure
|
|
286
|
-
6. **Thread safety via GVL** - `unsafe impl Send + Sync` is safe for Ruby callbacks
|
|
287
|
-
|
|
288
|
-
## Comparison with Other Language Bindings
|
|
289
|
-
|
|
290
|
-
### Async Support Ranking
|
|
291
|
-
|
|
292
|
-
1. **NAPI-RS (TypeScript/Node.js)**: ⭐⭐⭐⭐⭐
|
|
293
|
-
- Built-in async support
|
|
294
|
-
- Zero configuration
|
|
295
|
-
- ~0ms overhead
|
|
296
|
-
- Natural Promise integration
|
|
297
|
-
|
|
298
|
-
2. **PyO3 (Python)**: ⭐⭐⭐⭐
|
|
299
|
-
- `pyo3_async_runtimes` library
|
|
300
|
-
- Automatic async detection
|
|
301
|
-
- ~0.17ms overhead (optimized)
|
|
302
|
-
- GIL release during await
|
|
303
|
-
|
|
304
|
-
3. **Magnus (Ruby)**: ⭐⭐
|
|
305
|
-
- Manual `block_on()` pattern
|
|
306
|
-
- GVL blocks during async operations
|
|
307
|
-
- Same overhead as sync
|
|
308
|
-
- Limited concurrency
|
|
309
|
-
|
|
310
|
-
### When to Use Ruby Bindings
|
|
311
|
-
|
|
312
|
-
**Ruby bindings are best for**:
|
|
313
|
-
- ✅ **Rails applications** (ActiveJob for background processing)
|
|
314
|
-
- ✅ **Ruby scripts** (existing Ruby codebases)
|
|
315
|
-
- ✅ **Simple extraction** (single-file processing)
|
|
316
|
-
- ✅ **Batch processing** (batch API handles concurrency)
|
|
317
|
-
|
|
318
|
-
**Consider other bindings for**:
|
|
319
|
-
- ❌ **High concurrency** (use Node.js/NAPI-RS instead)
|
|
320
|
-
- ❌ **Real-time processing** (use Node.js/NAPI-RS instead)
|
|
321
|
-
- ❌ **I/O-bound workloads** (use Python/PyO3 or Node.js/NAPI-RS)
|
|
322
|
-
|
|
323
|
-
## GVL Release with Lucchetto (Experimental)
|
|
324
|
-
|
|
325
|
-
### Discovery: lucchetto Crate
|
|
326
|
-
|
|
327
|
-
**Lucchetto** (v0.4.0) is a third-party crate that enables calling Rust functions without holding the GVL:
|
|
328
|
-
|
|
329
|
-
```rust
|
|
330
|
-
use lucchetto::without_gvl;
|
|
331
|
-
|
|
332
|
-
#[without_gvl]
|
|
333
|
-
fn process_document(path: String) -> String {
|
|
334
|
-
// GVL released during execution!
|
|
335
|
-
// Other Ruby threads can run concurrently
|
|
336
|
-
std::thread::sleep(std::time::Duration::from_secs(2));
|
|
337
|
-
format!("Processed: {}", path)
|
|
338
|
-
}
|
|
339
|
-
```
|
|
340
|
-
|
|
341
|
-
**How it works**:
|
|
342
|
-
- Uses `rb_thread_call_without_gvl` internally (the hard-to-use-correctly function)
|
|
343
|
-
- Provides `#[without_gvl]` attribute macro for safe GVL release
|
|
344
|
-
- Enforces safety via `GvlSafe` trait (similar to `Send` + `Sync`)
|
|
345
|
-
- Functions can only accept/return types implementing `GvlSafe`
|
|
346
|
-
|
|
347
|
-
**Dependencies**:
|
|
348
|
-
```toml
|
|
349
|
-
[dependencies]
|
|
350
|
-
lucchetto = "0.4.0"
|
|
351
|
-
lucchetto-macros = "0.2.0"
|
|
352
|
-
rb-sys = "0"
|
|
353
|
-
```
|
|
354
|
-
|
|
355
|
-
**Safety Model**:
|
|
356
|
-
- `GvlSafe` trait prevents accessing Ruby objects from GVL-free code
|
|
357
|
-
- Custom types can implement `GvlSafe` if they don't interact with Ruby VM
|
|
358
|
-
- Compile-time verification via trait bounds
|
|
359
|
-
|
|
360
|
-
**Limitations**:
|
|
361
|
-
- ⚠️ **Experimental**: Author notes potential memory bugs and unsafe code
|
|
362
|
-
- ⚠️ **0.4.0 version**: Early stage, API may change
|
|
363
|
-
- ⚠️ **Documentation**: 0% coverage, review source code before use
|
|
364
|
-
- ⚠️ **Cannot access Ruby objects** during GVL-free execution
|
|
365
|
-
|
|
366
|
-
**Potential Integration**:
|
|
367
|
-
|
|
368
|
-
```rust
|
|
369
|
-
use lucchetto::without_gvl;
|
|
370
|
-
|
|
371
|
-
// Long-running extraction that doesn't need Ruby access
|
|
372
|
-
#[without_gvl]
|
|
373
|
-
fn extract_large_pdf_internal(path: String) -> Result<String, String> {
|
|
374
|
-
// GVL released - other Ruby threads can run!
|
|
375
|
-
match kreuzberg::extract_file_sync(&path, None, &Default::default()) {
|
|
376
|
-
Ok(result) => Ok(result.content),
|
|
377
|
-
Err(e) => Err(e.to_string()),
|
|
378
|
-
}
|
|
379
|
-
}
|
|
380
|
-
|
|
381
|
-
// Ruby-facing wrapper
|
|
382
|
-
fn extract_file_sync(args: &[Value]) -> Result<RHash, Error> {
|
|
383
|
-
let ruby = Ruby::get().expect("Ruby not initialized");
|
|
384
|
-
let path: String = args.get(0).unwrap().try_convert()?;
|
|
385
|
-
|
|
386
|
-
// Call GVL-free function
|
|
387
|
-
let content = extract_large_pdf_internal(path)
|
|
388
|
-
.map_err(|e| runtime_error(e))?;
|
|
389
|
-
|
|
390
|
-
// Convert to Ruby (GVL held)
|
|
391
|
-
let hash = ruby.hash_new();
|
|
392
|
-
hash.aset(ruby.intern("content"), content)?;
|
|
393
|
-
Ok(hash)
|
|
394
|
-
}
|
|
395
|
-
```
|
|
396
|
-
|
|
397
|
-
**Performance Impact**:
|
|
398
|
-
- ✅ **Enables true concurrency**: Ruby threads can run during Rust operations
|
|
399
|
-
- ✅ **No GVL blocking**: Long operations don't freeze Ruby runtime
|
|
400
|
-
- ✅ **Thread-level parallelism**: Multiple Ruby threads can process different files
|
|
401
|
-
|
|
402
|
-
**Recommendation**:
|
|
403
|
-
- **Monitor lucchetto development** before production use
|
|
404
|
-
- **Test thoroughly** in development environment
|
|
405
|
-
- **Consider for CPU-bound operations** (PDF extraction, OCR, image processing)
|
|
406
|
-
- **Not recommended yet** for production due to experimental status
|
|
407
|
-
|
|
408
|
-
## Future Improvements
|
|
409
|
-
|
|
410
|
-
Potential areas for async enhancement:
|
|
411
|
-
|
|
412
|
-
1. **Lucchetto integration** - Evaluate for GVL-free extraction (experimental)
|
|
413
|
-
2. **Ruby Fiber integration** - Map Rust futures to Ruby Fibers
|
|
414
|
-
3. **Async OCR backends** - Non-blocking OCR processing with GVL release
|
|
415
|
-
4. **Streaming results** - Chunked extraction without blocking GVL
|
|
416
|
-
|
|
417
|
-
**Contributing**: If you're interested in improving async support, check:
|
|
418
|
-
- Lucchetto crate: https://github.com/Maaarcocr/lucchetto
|
|
419
|
-
- Magnus GitHub Issues: https://github.com/matsadler/magnus/issues
|
|
420
|
-
- `rb_thread_call_without_gvl` discussions
|
|
421
|
-
- Ruby Fiber-based async patterns
|
|
422
|
-
|
|
423
|
-
## Contributing
|
|
424
|
-
|
|
425
|
-
See the main Kreuzberg repository for contribution guidelines.
|
|
1
|
+
# kreuzberg-rb
|
|
2
|
+
|
|
3
|
+
Magnus bindings for the Kreuzberg document intelligence library.
|
|
4
|
+
|
|
5
|
+
## Overview
|
|
6
|
+
|
|
7
|
+
This crate provides Ruby bindings to the Rust core library (`crates/kreuzberg`) using Magnus. It exposes extraction functions, configuration types, and plugin registration APIs to Ruby.
|
|
8
|
+
|
|
9
|
+
## Architecture
|
|
10
|
+
|
|
11
|
+
### Binding Layers
|
|
12
|
+
|
|
13
|
+
```
|
|
14
|
+
Ruby Package (packages/ruby/)
|
|
15
|
+
↓
|
|
16
|
+
Magnus Bindings (packages/ruby/ext/kreuzberg_rb/native) ← This crate
|
|
17
|
+
↓
|
|
18
|
+
Rust Core (crates/kreuzberg)
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
### Key Components
|
|
22
|
+
|
|
23
|
+
- **Core API** (`src/lib.rs`): Extraction functions (sync & async variants)
|
|
24
|
+
- **Configuration Parsing**: Ruby Hash to Rust config conversion
|
|
25
|
+
- **Type Conversion**: Rust results to Ruby hashes
|
|
26
|
+
- **Plugin Bridges**: Ruby plugin registration (PostProcessor, Validator, OcrBackend)
|
|
27
|
+
- **Cache Management**: Cache utilities
|
|
28
|
+
|
|
29
|
+
## Async Runtime Implementation
|
|
30
|
+
|
|
31
|
+
### Current State: Limited Async Support
|
|
32
|
+
|
|
33
|
+
Unlike NAPI-RS (Node.js) and PyO3 (Python), Magnus **does not have a pyo3-async-runtimes equivalent**. Ruby bindings use a different async pattern:
|
|
34
|
+
|
|
35
|
+
#### Async Functions Use Tokio Runtime with GVL Blocking
|
|
36
|
+
|
|
37
|
+
**Implementation** (from `src/lib.rs:584-602`):
|
|
38
|
+
```rust
|
|
39
|
+
fn extract_file(args: &[Value]) -> Result<RHash, Error> {
|
|
40
|
+
let ruby = Ruby::get().expect("Ruby not initialized");
|
|
41
|
+
let args = scan_args::<(String,), (Option<String>,), (), (), RHash, ()>(args)?;
|
|
42
|
+
let (path,) = args.required;
|
|
43
|
+
let (mime_type,) = args.optional;
|
|
44
|
+
let opts = Some(args.keywords);
|
|
45
|
+
|
|
46
|
+
let config = parse_extraction_config(&ruby, opts)?;
|
|
47
|
+
|
|
48
|
+
// Use Tokio runtime to block on async function
|
|
49
|
+
let runtime = tokio::runtime::Runtime::new()
|
|
50
|
+
.map_err(|e| runtime_error(format!("Failed to create Tokio runtime: {}", e)))?;
|
|
51
|
+
|
|
52
|
+
let result = runtime
|
|
53
|
+
.block_on(async { kreuzberg::extract_file(&path, mime_type.as_deref(), &config).await })
|
|
54
|
+
.map_err(kreuzberg_error)?;
|
|
55
|
+
|
|
56
|
+
extraction_result_to_ruby(&ruby, result)
|
|
57
|
+
}
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
**What This Means**:
|
|
61
|
+
- ✅ **Works correctly** - Executes async Rust code successfully
|
|
62
|
+
- ⚠️ **Blocks Ruby thread** - The calling Ruby thread is frozen for the duration of the async operation
|
|
63
|
+
- ❌ **No concurrency** - No performance benefit over synchronous calls from Ruby's perspective
|
|
64
|
+
- ❌ **GVL held** - Global VM Lock held during entire async operation
|
|
65
|
+
|
|
66
|
+
**Ruby Usage**:
|
|
67
|
+
```ruby
|
|
68
|
+
# This looks like it might be async, but it blocks the Ruby thread
|
|
69
|
+
result = Kreuzberg.extract_file("document.pdf")
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
### Why Magnus Differs from PyO3
|
|
73
|
+
|
|
74
|
+
| Feature | Magnus (Ruby) | PyO3 (Python) | NAPI-RS (Node.js) |
|
|
75
|
+
|---------|---------------|---------------|-------------------|
|
|
76
|
+
| Async Method Support | ❌ No | ✅ Yes | ✅ Yes |
|
|
77
|
+
| Runtime Integration | Manual `block_on()` | `pyo3_async_runtimes` | Built-in |
|
|
78
|
+
| GVL/GIL Release | ❌ Not available | ✅ Automatic | ✅ N/A (no GIL) |
|
|
79
|
+
| Coroutine Detection | ❌ N/A | ✅ `__await__` check | ✅ N/A (Promises) |
|
|
80
|
+
| Performance Optimization | ❌ Not possible | ✅ ~28x overhead reduction | ✅ ~0ms overhead |
|
|
81
|
+
|
|
82
|
+
### GVL Management
|
|
83
|
+
|
|
84
|
+
**Current State** (from Magnus maintainer matsadler):
|
|
85
|
+
> "Ruby does have a function to release the GVL called `rb_thread_call_without_gvl`, but it's hard to use correctly and Magnus doesn't expose it yet."
|
|
86
|
+
|
|
87
|
+
**Implications**:
|
|
88
|
+
- Async Rust operations **block the Ruby GVL**
|
|
89
|
+
- No concurrent Ruby execution during async Rust calls
|
|
90
|
+
- Performance similar to synchronous operations from Ruby's perspective
|
|
91
|
+
|
|
92
|
+
**Future Enhancement**: Safe `rb_thread_call_without_gvl` integration could enable:
|
|
93
|
+
- True concurrent async operations
|
|
94
|
+
- GVL release during Rust async waits
|
|
95
|
+
- Performance improvements for I/O-bound operations
|
|
96
|
+
|
|
97
|
+
### Comparison: Sync vs Async Functions
|
|
98
|
+
|
|
99
|
+
Both approaches currently have **equivalent performance** from Ruby's perspective:
|
|
100
|
+
|
|
101
|
+
**Synchronous Function**:
|
|
102
|
+
```rust
|
|
103
|
+
fn extract_file_sync(args: &[Value]) -> Result<RHash, Error> {
|
|
104
|
+
let config = parse_extraction_config(&ruby, opts)?;
|
|
105
|
+
let result = kreuzberg::extract_file_sync(&path, mime_type.as_deref(), &config)
|
|
106
|
+
.map_err(kreuzberg_error)?;
|
|
107
|
+
extraction_result_to_ruby(&ruby, result)
|
|
108
|
+
}
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
**Asynchronous Function**:
|
|
112
|
+
```rust
|
|
113
|
+
fn extract_file(args: &[Value]) -> Result<RHash, Error> {
|
|
114
|
+
let config = parse_extraction_config(&ruby, opts)?;
|
|
115
|
+
let runtime = tokio::runtime::Runtime::new()?;
|
|
116
|
+
let result = runtime
|
|
117
|
+
.block_on(async { kreuzberg::extract_file(&path, mime_type.as_deref(), &config).await })
|
|
118
|
+
.map_err(kreuzberg_error)?;
|
|
119
|
+
extraction_result_to_ruby(&ruby, result)
|
|
120
|
+
}
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
**Performance**: Both variants block the Ruby thread for the same duration. Use the `_sync` variants for clarity.
|
|
124
|
+
|
|
125
|
+
## Plugin System
|
|
126
|
+
|
|
127
|
+
The bindings support Ruby-based plugins through the trait-based plugin system:
|
|
128
|
+
|
|
129
|
+
### Ruby PostProcessor Plugin
|
|
130
|
+
|
|
131
|
+
```ruby
|
|
132
|
+
Kreuzberg.register_post_processor("uppercase", ->(result) {
|
|
133
|
+
result[:content] = result[:content].upcase
|
|
134
|
+
result
|
|
135
|
+
}, 100)
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
**Implementation** (`src/lib.rs:831-939`):
|
|
139
|
+
- Wraps Ruby Proc in `RubyPostProcessor` struct
|
|
140
|
+
- Implements `PostProcessor` trait
|
|
141
|
+
- Marked `unsafe impl Send + Sync` (safe due to Ruby GVL)
|
|
142
|
+
- Converts Rust result → Ruby hash → calls Proc → converts back
|
|
143
|
+
|
|
144
|
+
### Ruby Validator Plugin
|
|
145
|
+
|
|
146
|
+
```ruby
|
|
147
|
+
Kreuzberg.register_validator("min_length", ->(result) {
|
|
148
|
+
raise "Content too short" if result[:content].length < 100
|
|
149
|
+
}, 100)
|
|
150
|
+
```
|
|
151
|
+
|
|
152
|
+
**Implementation** (`src/lib.rs:954-1047`):
|
|
153
|
+
- Wraps Ruby Proc in `RubyValidator` struct
|
|
154
|
+
- Implements `Validator` trait
|
|
155
|
+
- Validates extraction results
|
|
156
|
+
- Can raise Ruby exceptions for validation failures
|
|
157
|
+
|
|
158
|
+
### Ruby OCR Backend Plugin
|
|
159
|
+
|
|
160
|
+
```ruby
|
|
161
|
+
class CustomOcr
|
|
162
|
+
def process_image(image_bytes, language)
|
|
163
|
+
# Return extracted text
|
|
164
|
+
"Extracted text"
|
|
165
|
+
end
|
|
166
|
+
|
|
167
|
+
def supports_language?(lang)
|
|
168
|
+
%w[eng deu fra].include?(lang)
|
|
169
|
+
end
|
|
170
|
+
end
|
|
171
|
+
|
|
172
|
+
Kreuzberg.register_ocr_backend("custom", CustomOcr.new)
|
|
173
|
+
```
|
|
174
|
+
|
|
175
|
+
**Implementation** (`src/lib.rs:1070-1169`):
|
|
176
|
+
- Wraps Ruby object in `RubyOcrBackend` struct
|
|
177
|
+
- Implements `OcrBackend` trait
|
|
178
|
+
- Calls Ruby methods for OCR processing
|
|
179
|
+
- **Blocks GVL during OCR** (no async support)
|
|
180
|
+
|
|
181
|
+
**Note**: Ruby OCR backends will block the GVL during processing. For I/O-bound OCR operations, consider using Ruby threads or background jobs.
|
|
182
|
+
|
|
183
|
+
## Thread Safety
|
|
184
|
+
|
|
185
|
+
All Ruby plugin wrappers are marked `unsafe impl Send + Sync`:
|
|
186
|
+
|
|
187
|
+
```rust
|
|
188
|
+
// SAFETY: We mark this as Send+Sync because Ruby Global VM Lock (GVL)
|
|
189
|
+
// ensures thread safety. Magnus::Value is thread-safe under GVL.
|
|
190
|
+
struct RubyPostProcessor {
|
|
191
|
+
name: String,
|
|
192
|
+
processor: magnus::Value,
|
|
193
|
+
}
|
|
194
|
+
|
|
195
|
+
// SAFETY: Ruby operations are protected by the Global VM Lock
|
|
196
|
+
unsafe impl Send for RubyPostProcessor {}
|
|
197
|
+
unsafe impl Sync for RubyPostProcessor {}
|
|
198
|
+
```
|
|
199
|
+
|
|
200
|
+
**Justification**:
|
|
201
|
+
- Ruby's Global VM Lock ensures thread safety
|
|
202
|
+
- `magnus::Value` is thread-safe under GVL
|
|
203
|
+
- Rust async runtime can safely schedule Ruby callbacks
|
|
204
|
+
- GVL prevents concurrent Ruby execution
|
|
205
|
+
|
|
206
|
+
## Building
|
|
207
|
+
|
|
208
|
+
### Development Build
|
|
209
|
+
|
|
210
|
+
```bash
|
|
211
|
+
cd packages/ruby
|
|
212
|
+
bundle exec rake compile
|
|
213
|
+
```
|
|
214
|
+
|
|
215
|
+
### Testing
|
|
216
|
+
|
|
217
|
+
Run Ruby tests that exercise the bindings:
|
|
218
|
+
|
|
219
|
+
```bash
|
|
220
|
+
bundle exec rspec
|
|
221
|
+
```
|
|
222
|
+
|
|
223
|
+
## Features
|
|
224
|
+
|
|
225
|
+
### Default Features
|
|
226
|
+
|
|
227
|
+
- None currently
|
|
228
|
+
|
|
229
|
+
### Optional Features
|
|
230
|
+
|
|
231
|
+
All features are passed through from the `kreuzberg` crate via `features = ["full"]`.
|
|
232
|
+
|
|
233
|
+
## Dependencies
|
|
234
|
+
|
|
235
|
+
- `magnus` - Git dependency (specific rev: `f6db117`)
|
|
236
|
+
- `tokio = "1.48"` with `rt` and `macros` features
|
|
237
|
+
- `async-trait = "0.1"` for async trait methods
|
|
238
|
+
- `serde_json = "1.0"` for metadata serialization
|
|
239
|
+
|
|
240
|
+
## Key Files
|
|
241
|
+
|
|
242
|
+
- `src/lib.rs`: All bindings code (extraction API, config parsing, plugin registration)
|
|
243
|
+
- `build.rs`: Build script for Rust extension compilation
|
|
244
|
+
|
|
245
|
+
## References
|
|
246
|
+
|
|
247
|
+
- **Magnus Documentation**: https://docs.rs/magnus
|
|
248
|
+
- **Magnus GitHub**: https://github.com/matsadler/magnus
|
|
249
|
+
- **Kreuzberg Core**: `../kreuzberg/`
|
|
250
|
+
- **Ruby Package**: `../../packages/ruby/`
|
|
251
|
+
|
|
252
|
+
## Performance Considerations
|
|
253
|
+
|
|
254
|
+
### For Plugin Authors
|
|
255
|
+
|
|
256
|
+
1. **Sync is currently equivalent to async**: Both block the Ruby GVL
|
|
257
|
+
```ruby
|
|
258
|
+
# These have equivalent performance
|
|
259
|
+
result = Kreuzberg.extract_file_sync("document.pdf")
|
|
260
|
+
result = Kreuzberg.extract_file("document.pdf")
|
|
261
|
+
```
|
|
262
|
+
|
|
263
|
+
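2. **Use Ruby threads for concurrency** (each extraction call still holds the GVL while it runs, so the extractions themselves execute one at a time):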
|
|
264
|
+
```ruby
|
|
265
|
+
# Process multiple files concurrently
|
|
266
|
+
files = ["doc1.pdf", "doc2.pdf", "doc3.pdf"]
|
|
267
|
+
threads = files.map do |file|
|
|
268
|
+
Thread.new { Kreuzberg.extract_file_sync(file) }
|
|
269
|
+
end
|
|
270
|
+
results = threads.map(&:value)
|
|
271
|
+
```
|
|
272
|
+
|
|
273
|
+
3. **Batch API is more efficient**:
|
|
274
|
+
```ruby
|
|
275
|
+
# Prefer batch API for multiple files
|
|
276
|
+
results = Kreuzberg.batch_extract_files_sync(["doc1.pdf", "doc2.pdf"])
|
|
277
|
+
```
|
|
278
|
+
|
|
279
|
+
### For Contributors
|
|
280
|
+
|
|
281
|
+
1. **Prefer `_sync` variants** - Clearer intent, same performance
|
|
282
|
+
2. **Async functions exist** for API compatibility with other language bindings
|
|
283
|
+
3. **Do NOT use `spawn_blocking`** - GVL already blocks, no benefit
|
|
284
|
+
4. **Create new Runtime per call** - Safe under GVL, no overhead from reuse
|
|
285
|
+
5. **Monitor Magnus development** for `rb_thread_call_without_gvl` exposure
|
|
286
|
+
6. **Thread safety via GVL** - `unsafe impl Send + Sync` is safe for Ruby callbacks
|
|
287
|
+
|
|
288
|
+
## Comparison with Other Language Bindings
|
|
289
|
+
|
|
290
|
+
### Async Support Ranking
|
|
291
|
+
|
|
292
|
+
1. **NAPI-RS (TypeScript/Node.js)**: ⭐⭐⭐⭐⭐
|
|
293
|
+
- Built-in async support
|
|
294
|
+
- Zero configuration
|
|
295
|
+
- ~0ms overhead
|
|
296
|
+
- Natural Promise integration
|
|
297
|
+
|
|
298
|
+
2. **PyO3 (Python)**: ⭐⭐⭐⭐
|
|
299
|
+
- `pyo3_async_runtimes` library
|
|
300
|
+
- Automatic async detection
|
|
301
|
+
- ~0.17ms overhead (optimized)
|
|
302
|
+
- GIL release during await
|
|
303
|
+
|
|
304
|
+
3. **Magnus (Ruby)**: ⭐⭐
|
|
305
|
+
- Manual `block_on()` pattern
|
|
306
|
+
- GVL blocks during async operations
|
|
307
|
+
- Same overhead as sync
|
|
308
|
+
- Limited concurrency
|
|
309
|
+
|
|
310
|
+
### When to Use Ruby Bindings
|
|
311
|
+
|
|
312
|
+
**Ruby bindings are best for**:
|
|
313
|
+
- ✅ **Rails applications** (ActiveJob for background processing)
|
|
314
|
+
- ✅ **Ruby scripts** (existing Ruby codebases)
|
|
315
|
+
- ✅ **Simple extraction** (single-file processing)
|
|
316
|
+
- ✅ **Batch processing** (batch API handles concurrency)
|
|
317
|
+
|
|
318
|
+
**Consider other bindings for**:
|
|
319
|
+
- ❌ **High concurrency** (use Node.js/NAPI-RS instead)
|
|
320
|
+
- ❌ **Real-time processing** (use Node.js/NAPI-RS instead)
|
|
321
|
+
- ❌ **I/O-bound workloads** (use Python/PyO3 or Node.js/NAPI-RS)
|
|
322
|
+
|
|
323
|
+
## GVL Release with Lucchetto (Experimental)
|
|
324
|
+
|
|
325
|
+
### Discovery: lucchetto Crate
|
|
326
|
+
|
|
327
|
+
**Lucchetto** (v0.4.0) is a third-party crate that enables calling Rust functions without holding the GVL:
|
|
328
|
+
|
|
329
|
+
```rust
|
|
330
|
+
use lucchetto::without_gvl;
|
|
331
|
+
|
|
332
|
+
#[without_gvl]
|
|
333
|
+
fn process_document(path: String) -> String {
|
|
334
|
+
// GVL released during execution!
|
|
335
|
+
// Other Ruby threads can run concurrently
|
|
336
|
+
std::thread::sleep(std::time::Duration::from_secs(2));
|
|
337
|
+
format!("Processed: {}", path)
|
|
338
|
+
}
|
|
339
|
+
```
|
|
340
|
+
|
|
341
|
+
**How it works**:
|
|
342
|
+
- Uses `rb_thread_call_without_gvl` internally (the same function the Magnus maintainer describes as hard to use correctly)
|
|
343
|
+
- Provides `#[without_gvl]` attribute macro for safe GVL release
|
|
344
|
+
- Enforces safety via `GvlSafe` trait (similar to `Send` + `Sync`)
|
|
345
|
+
- Functions can only accept/return types implementing `GvlSafe`
|
|
346
|
+
|
|
347
|
+
**Dependencies**:
|
|
348
|
+
```toml
|
|
349
|
+
[dependencies]
|
|
350
|
+
lucchetto = "0.4.0"
|
|
351
|
+
lucchetto-macros = "0.2.0"
|
|
352
|
+
rb-sys = "0"
|
|
353
|
+
```
|
|
354
|
+
|
|
355
|
+
**Safety Model**:
|
|
356
|
+
- `GvlSafe` trait prevents accessing Ruby objects from GVL-free code
|
|
357
|
+
- Custom types can implement `GvlSafe` if they don't interact with Ruby VM
|
|
358
|
+
- Compile-time verification via trait bounds
|
|
359
|
+
|
|
360
|
+
**Limitations**:
|
|
361
|
+
- ⚠️ **Experimental**: Author notes potential memory bugs and unsafe code
|
|
362
|
+
- ⚠️ **Version 0.4.0**: Early-stage release; the API may change
|
|
363
|
+
- ⚠️ **Documentation**: 0% coverage; review the source code before use
|
|
364
|
+
- ⚠️ **Cannot access Ruby objects** during GVL-free execution
|
|
365
|
+
|
|
366
|
+
**Potential Integration**:
|
|
367
|
+
|
|
368
|
+
```rust
|
|
369
|
+
use lucchetto::without_gvl;
|
|
370
|
+
|
|
371
|
+
// Long-running extraction that doesn't need Ruby access
|
|
372
|
+
#[without_gvl]
|
|
373
|
+
fn extract_large_pdf_internal(path: String) -> Result<String, String> {
|
|
374
|
+
// GVL released - other Ruby threads can run!
|
|
375
|
+
match kreuzberg::extract_file_sync(&path, None, &Default::default()) {
|
|
376
|
+
Ok(result) => Ok(result.content),
|
|
377
|
+
Err(e) => Err(e.to_string()),
|
|
378
|
+
}
|
|
379
|
+
}
|
|
380
|
+
|
|
381
|
+
// Ruby-facing wrapper
|
|
382
|
+
fn extract_file_sync(args: &[Value]) -> Result<RHash, Error> {
|
|
383
|
+
let ruby = Ruby::get().expect("Ruby not initialized");
|
|
384
|
+
let path: String = args.get(0).unwrap().try_convert()?;
|
|
385
|
+
|
|
386
|
+
// Call GVL-free function
|
|
387
|
+
let content = extract_large_pdf_internal(path)
|
|
388
|
+
.map_err(|e| runtime_error(e))?;
|
|
389
|
+
|
|
390
|
+
// Convert to Ruby (GVL held)
|
|
391
|
+
let hash = ruby.hash_new();
|
|
392
|
+
hash.aset(ruby.intern("content"), content)?;
|
|
393
|
+
Ok(hash)
|
|
394
|
+
}
|
|
395
|
+
```
|
|
396
|
+
|
|
397
|
+
**Performance Impact**:
|
|
398
|
+
- ✅ **Enables true concurrency**: Ruby threads can run during Rust operations
|
|
399
|
+
- ✅ **No GVL blocking**: Long operations don't freeze Ruby runtime
|
|
400
|
+
- ✅ **Thread-level parallelism**: Multiple Ruby threads can process different files
|
|
401
|
+
|
|
402
|
+
**Recommendation**:
|
|
403
|
+
- **Monitor lucchetto development** before production use
|
|
404
|
+
- **Test thoroughly** in development environment
|
|
405
|
+
- **Consider for CPU-bound operations** (PDF extraction, OCR, image processing)
|
|
406
|
+
- **Not recommended yet** for production due to experimental status
|
|
407
|
+
|
|
408
|
+
## Future Improvements
|
|
409
|
+
|
|
410
|
+
Potential areas for async enhancement:
|
|
411
|
+
|
|
412
|
+
1. **Lucchetto integration** - Evaluate for GVL-free extraction (experimental)
|
|
413
|
+
2. **Ruby Fiber integration** - Map Rust futures to Ruby Fibers
|
|
414
|
+
3. **Async OCR backends** - Non-blocking OCR processing with GVL release
|
|
415
|
+
4. **Streaming results** - Chunked extraction without blocking GVL
|
|
416
|
+
|
|
417
|
+
**Contributing**: If you're interested in improving async support, check:
|
|
418
|
+
- Lucchetto crate: https://github.com/Maaarcocr/lucchetto
|
|
419
|
+
- Magnus GitHub Issues: https://github.com/matsadler/magnus/issues
|
|
420
|
+
- `rb_thread_call_without_gvl` discussions
|
|
421
|
+
- Ruby Fiber-based async patterns
|
|
422
|
+
|
|
423
|
+
## Contributing
|
|
424
|
+
|
|
425
|
+
See the main Kreuzberg repository for contribution guidelines.
|