kreuzberg 4.0.0.pre.rc.11 → 4.0.0.pre.rc.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +14 -14
- data/.rspec +3 -3
- data/.rubocop.yaml +1 -1
- data/.rubocop.yml +538 -538
- data/Gemfile +8 -8
- data/Gemfile.lock +2 -105
- data/README.md +454 -454
- data/Rakefile +25 -25
- data/Steepfile +47 -47
- data/examples/async_patterns.rb +341 -341
- data/ext/kreuzberg_rb/extconf.rb +45 -45
- data/ext/kreuzberg_rb/native/.cargo/config.toml +2 -2
- data/ext/kreuzberg_rb/native/Cargo.lock +6941 -6941
- data/ext/kreuzberg_rb/native/Cargo.toml +54 -54
- data/ext/kreuzberg_rb/native/README.md +425 -425
- data/ext/kreuzberg_rb/native/build.rs +15 -15
- data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -11
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -14
- data/ext/kreuzberg_rb/native/include/strings.h +20 -20
- data/ext/kreuzberg_rb/native/include/unistd.h +47 -47
- data/ext/kreuzberg_rb/native/src/lib.rs +3158 -3158
- data/extconf.rb +28 -28
- data/kreuzberg.gemspec +214 -214
- data/lib/kreuzberg/api_proxy.rb +142 -142
- data/lib/kreuzberg/cache_api.rb +81 -81
- data/lib/kreuzberg/cli.rb +55 -55
- data/lib/kreuzberg/cli_proxy.rb +127 -127
- data/lib/kreuzberg/config.rb +724 -724
- data/lib/kreuzberg/error_context.rb +80 -80
- data/lib/kreuzberg/errors.rb +118 -118
- data/lib/kreuzberg/extraction_api.rb +340 -340
- data/lib/kreuzberg/mcp_proxy.rb +186 -186
- data/lib/kreuzberg/ocr_backend_protocol.rb +113 -113
- data/lib/kreuzberg/post_processor_protocol.rb +86 -86
- data/lib/kreuzberg/result.rb +279 -279
- data/lib/kreuzberg/setup_lib_path.rb +80 -80
- data/lib/kreuzberg/validator_protocol.rb +89 -89
- data/lib/kreuzberg/version.rb +5 -5
- data/lib/kreuzberg.rb +109 -109
- data/lib/{libpdfium.dylib → pdfium.dll} +0 -0
- data/sig/kreuzberg/internal.rbs +184 -184
- data/sig/kreuzberg.rbs +546 -546
- data/spec/binding/cache_spec.rb +227 -227
- data/spec/binding/cli_proxy_spec.rb +85 -85
- data/spec/binding/cli_spec.rb +55 -55
- data/spec/binding/config_spec.rb +345 -345
- data/spec/binding/config_validation_spec.rb +283 -283
- data/spec/binding/error_handling_spec.rb +213 -213
- data/spec/binding/errors_spec.rb +66 -66
- data/spec/binding/plugins/ocr_backend_spec.rb +307 -307
- data/spec/binding/plugins/postprocessor_spec.rb +269 -269
- data/spec/binding/plugins/validator_spec.rb +274 -274
- data/spec/fixtures/config.toml +39 -39
- data/spec/fixtures/config.yaml +41 -41
- data/spec/fixtures/invalid_config.toml +4 -4
- data/spec/smoke/package_spec.rb +178 -178
- data/spec/spec_helper.rb +42 -42
- data/vendor/Cargo.toml +2 -1
- data/vendor/kreuzberg/Cargo.toml +2 -2
- data/vendor/kreuzberg/README.md +230 -230
- data/vendor/kreuzberg/benches/otel_overhead.rs +48 -48
- data/vendor/kreuzberg/build.rs +843 -843
- data/vendor/kreuzberg/src/api/error.rs +81 -81
- data/vendor/kreuzberg/src/api/handlers.rs +199 -199
- data/vendor/kreuzberg/src/api/mod.rs +79 -79
- data/vendor/kreuzberg/src/api/server.rs +353 -353
- data/vendor/kreuzberg/src/api/types.rs +170 -170
- data/vendor/kreuzberg/src/cache/mod.rs +1167 -1167
- data/vendor/kreuzberg/src/chunking/mod.rs +1877 -1877
- data/vendor/kreuzberg/src/chunking/processor.rs +220 -220
- data/vendor/kreuzberg/src/core/batch_mode.rs +95 -95
- data/vendor/kreuzberg/src/core/config.rs +1080 -1080
- data/vendor/kreuzberg/src/core/extractor.rs +1156 -1156
- data/vendor/kreuzberg/src/core/io.rs +329 -329
- data/vendor/kreuzberg/src/core/mime.rs +605 -605
- data/vendor/kreuzberg/src/core/mod.rs +47 -47
- data/vendor/kreuzberg/src/core/pipeline.rs +1184 -1184
- data/vendor/kreuzberg/src/embeddings.rs +500 -500
- data/vendor/kreuzberg/src/error.rs +431 -431
- data/vendor/kreuzberg/src/extraction/archive.rs +954 -954
- data/vendor/kreuzberg/src/extraction/docx.rs +398 -398
- data/vendor/kreuzberg/src/extraction/email.rs +854 -854
- data/vendor/kreuzberg/src/extraction/excel.rs +688 -688
- data/vendor/kreuzberg/src/extraction/html.rs +601 -601
- data/vendor/kreuzberg/src/extraction/image.rs +491 -491
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +574 -562
- data/vendor/kreuzberg/src/extraction/markdown.rs +213 -213
- data/vendor/kreuzberg/src/extraction/mod.rs +81 -81
- data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -398
- data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -247
- data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -240
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -130
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +284 -284
- data/vendor/kreuzberg/src/extraction/pptx.rs +3100 -3100
- data/vendor/kreuzberg/src/extraction/structured.rs +490 -490
- data/vendor/kreuzberg/src/extraction/table.rs +328 -328
- data/vendor/kreuzberg/src/extraction/text.rs +269 -269
- data/vendor/kreuzberg/src/extraction/xml.rs +333 -333
- data/vendor/kreuzberg/src/extractors/archive.rs +447 -447
- data/vendor/kreuzberg/src/extractors/bibtex.rs +470 -470
- data/vendor/kreuzberg/src/extractors/docbook.rs +504 -504
- data/vendor/kreuzberg/src/extractors/docx.rs +400 -400
- data/vendor/kreuzberg/src/extractors/email.rs +157 -157
- data/vendor/kreuzberg/src/extractors/epub.rs +708 -708
- data/vendor/kreuzberg/src/extractors/excel.rs +345 -345
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +492 -492
- data/vendor/kreuzberg/src/extractors/html.rs +407 -407
- data/vendor/kreuzberg/src/extractors/image.rs +219 -219
- data/vendor/kreuzberg/src/extractors/jats.rs +1054 -1054
- data/vendor/kreuzberg/src/extractors/jupyter.rs +368 -368
- data/vendor/kreuzberg/src/extractors/latex.rs +653 -653
- data/vendor/kreuzberg/src/extractors/markdown.rs +701 -701
- data/vendor/kreuzberg/src/extractors/mod.rs +429 -429
- data/vendor/kreuzberg/src/extractors/odt.rs +628 -628
- data/vendor/kreuzberg/src/extractors/opml.rs +635 -635
- data/vendor/kreuzberg/src/extractors/orgmode.rs +529 -529
- data/vendor/kreuzberg/src/extractors/pdf.rs +749 -722
- data/vendor/kreuzberg/src/extractors/pptx.rs +267 -267
- data/vendor/kreuzberg/src/extractors/rst.rs +577 -577
- data/vendor/kreuzberg/src/extractors/rtf.rs +809 -809
- data/vendor/kreuzberg/src/extractors/security.rs +484 -484
- data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -367
- data/vendor/kreuzberg/src/extractors/structured.rs +142 -142
- data/vendor/kreuzberg/src/extractors/text.rs +265 -265
- data/vendor/kreuzberg/src/extractors/typst.rs +651 -651
- data/vendor/kreuzberg/src/extractors/xml.rs +147 -147
- data/vendor/kreuzberg/src/image/dpi.rs +164 -164
- data/vendor/kreuzberg/src/image/mod.rs +6 -6
- data/vendor/kreuzberg/src/image/preprocessing.rs +417 -417
- data/vendor/kreuzberg/src/image/resize.rs +89 -89
- data/vendor/kreuzberg/src/keywords/config.rs +154 -154
- data/vendor/kreuzberg/src/keywords/mod.rs +237 -237
- data/vendor/kreuzberg/src/keywords/processor.rs +275 -275
- data/vendor/kreuzberg/src/keywords/rake.rs +293 -293
- data/vendor/kreuzberg/src/keywords/types.rs +68 -68
- data/vendor/kreuzberg/src/keywords/yake.rs +163 -163
- data/vendor/kreuzberg/src/language_detection/mod.rs +985 -985
- data/vendor/kreuzberg/src/language_detection/processor.rs +219 -219
- data/vendor/kreuzberg/src/lib.rs +113 -113
- data/vendor/kreuzberg/src/mcp/mod.rs +35 -35
- data/vendor/kreuzberg/src/mcp/server.rs +2076 -2076
- data/vendor/kreuzberg/src/ocr/cache.rs +469 -469
- data/vendor/kreuzberg/src/ocr/error.rs +37 -37
- data/vendor/kreuzberg/src/ocr/hocr.rs +216 -216
- data/vendor/kreuzberg/src/ocr/mod.rs +58 -58
- data/vendor/kreuzberg/src/ocr/processor.rs +863 -863
- data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -4
- data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -144
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +452 -452
- data/vendor/kreuzberg/src/ocr/types.rs +393 -393
- data/vendor/kreuzberg/src/ocr/utils.rs +47 -47
- data/vendor/kreuzberg/src/ocr/validation.rs +206 -206
- data/vendor/kreuzberg/src/panic_context.rs +154 -154
- data/vendor/kreuzberg/src/pdf/bindings.rs +44 -44
- data/vendor/kreuzberg/src/pdf/bundled.rs +346 -346
- data/vendor/kreuzberg/src/pdf/error.rs +130 -130
- data/vendor/kreuzberg/src/pdf/images.rs +139 -139
- data/vendor/kreuzberg/src/pdf/metadata.rs +489 -489
- data/vendor/kreuzberg/src/pdf/mod.rs +68 -68
- data/vendor/kreuzberg/src/pdf/rendering.rs +368 -368
- data/vendor/kreuzberg/src/pdf/table.rs +420 -420
- data/vendor/kreuzberg/src/pdf/text.rs +240 -240
- data/vendor/kreuzberg/src/plugins/extractor.rs +1044 -1044
- data/vendor/kreuzberg/src/plugins/mod.rs +212 -212
- data/vendor/kreuzberg/src/plugins/ocr.rs +639 -639
- data/vendor/kreuzberg/src/plugins/processor.rs +650 -650
- data/vendor/kreuzberg/src/plugins/registry.rs +1339 -1339
- data/vendor/kreuzberg/src/plugins/traits.rs +258 -258
- data/vendor/kreuzberg/src/plugins/validator.rs +967 -967
- data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -1470
- data/vendor/kreuzberg/src/text/mod.rs +25 -25
- data/vendor/kreuzberg/src/text/quality.rs +697 -697
- data/vendor/kreuzberg/src/text/quality_processor.rs +219 -219
- data/vendor/kreuzberg/src/text/string_utils.rs +217 -217
- data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -164
- data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -100
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -796
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -902
- data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -160
- data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -619
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -147
- data/vendor/kreuzberg/src/types.rs +1055 -1055
- data/vendor/kreuzberg/src/utils/mod.rs +17 -17
- data/vendor/kreuzberg/src/utils/quality.rs +959 -959
- data/vendor/kreuzberg/src/utils/string_utils.rs +381 -381
- data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -53
- data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -482
- data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -261
- data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -400
- data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -1205
- data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -280
- data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -425
- data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -172
- data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -622
- data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -1300
- data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -175
- data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -734
- data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -37
- data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -100
- data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -801
- data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -693
- data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -111
- data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -162
- data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -226
- data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -41
- data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -196
- data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -227
- data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -181
- data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -791
- data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -47
- data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -760
- data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -634
- data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -136
- data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -84
- data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -681
- data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -64
- data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -51
- data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -476
- data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -163
- data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -1
- data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -101
- data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -477
- data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -490
- data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -415
- data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -223
- data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -331
- data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -562
- data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -436
- data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -561
- data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -193
- data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -448
- data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -32
- data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -33
- data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -76
- data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -129
- data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -54
- data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -118
- data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -149
- data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -506
- data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -75
- data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -519
- data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -647
- data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -62
- data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -796
- data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -31
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -52
- data/vendor/kreuzberg/tests/api_tests.rs +966 -966
- data/vendor/kreuzberg/tests/archive_integration.rs +545 -545
- data/vendor/kreuzberg/tests/batch_orchestration.rs +556 -556
- data/vendor/kreuzberg/tests/batch_processing.rs +318 -318
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -421
- data/vendor/kreuzberg/tests/concurrency_stress.rs +533 -533
- data/vendor/kreuzberg/tests/config_features.rs +612 -612
- data/vendor/kreuzberg/tests/config_loading_tests.rs +416 -416
- data/vendor/kreuzberg/tests/core_integration.rs +510 -510
- data/vendor/kreuzberg/tests/csv_integration.rs +414 -414
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +500 -500
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -122
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -370
- data/vendor/kreuzberg/tests/email_integration.rs +327 -327
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -275
- data/vendor/kreuzberg/tests/error_handling.rs +402 -402
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -228
- data/vendor/kreuzberg/tests/format_integration.rs +164 -164
- data/vendor/kreuzberg/tests/helpers/mod.rs +142 -142
- data/vendor/kreuzberg/tests/html_table_test.rs +551 -551
- data/vendor/kreuzberg/tests/image_integration.rs +255 -255
- data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -139
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -639
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -704
- data/vendor/kreuzberg/tests/keywords_integration.rs +479 -479
- data/vendor/kreuzberg/tests/keywords_quality.rs +509 -509
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -496
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -490
- data/vendor/kreuzberg/tests/mime_detection.rs +429 -429
- data/vendor/kreuzberg/tests/ocr_configuration.rs +514 -514
- data/vendor/kreuzberg/tests/ocr_errors.rs +698 -698
- data/vendor/kreuzberg/tests/ocr_quality.rs +629 -629
- data/vendor/kreuzberg/tests/ocr_stress.rs +469 -469
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +674 -674
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -616
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -822
- data/vendor/kreuzberg/tests/pdf_integration.rs +45 -45
- data/vendor/kreuzberg/tests/pdfium_linking.rs +374 -374
- data/vendor/kreuzberg/tests/pipeline_integration.rs +1436 -1436
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +776 -776
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +560 -560
- data/vendor/kreuzberg/tests/plugin_system.rs +927 -927
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -783
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +587 -587
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +694 -694
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +775 -775
- data/vendor/kreuzberg/tests/security_validation.rs +416 -416
- data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -888
- data/vendor/kreuzberg/tests/test_fastembed.rs +631 -631
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1260 -1260
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +648 -648
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -87
- data/vendor/kreuzberg-ffi/Cargo.toml +3 -3
- data/vendor/kreuzberg-ffi/README.md +851 -851
- data/vendor/kreuzberg-ffi/build.rs +176 -176
- data/vendor/kreuzberg-ffi/cbindgen.toml +27 -27
- data/vendor/kreuzberg-ffi/kreuzberg-ffi-install.pc +12 -12
- data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +12 -12
- data/vendor/kreuzberg-ffi/kreuzberg.h +1087 -1087
- data/vendor/kreuzberg-ffi/src/lib.rs +3616 -3616
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +247 -247
- data/vendor/kreuzberg-ffi/tests.disabled/README.md +48 -48
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +299 -299
- data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +346 -346
- data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +232 -232
- data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +470 -470
- data/vendor/kreuzberg-tesseract/.commitlintrc.json +13 -13
- data/vendor/kreuzberg-tesseract/.crate-ignore +2 -2
- data/vendor/kreuzberg-tesseract/Cargo.lock +2933 -2933
- data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
- data/vendor/kreuzberg-tesseract/LICENSE +22 -22
- data/vendor/kreuzberg-tesseract/README.md +399 -399
- data/vendor/kreuzberg-tesseract/build.rs +1354 -1354
- data/vendor/kreuzberg-tesseract/patches/README.md +71 -71
- data/vendor/kreuzberg-tesseract/patches/tesseract.diff +199 -199
- data/vendor/kreuzberg-tesseract/src/api.rs +1371 -1371
- data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +77 -77
- data/vendor/kreuzberg-tesseract/src/enums.rs +297 -297
- data/vendor/kreuzberg-tesseract/src/error.rs +81 -81
- data/vendor/kreuzberg-tesseract/src/lib.rs +145 -145
- data/vendor/kreuzberg-tesseract/src/monitor.rs +57 -57
- data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +197 -197
- data/vendor/kreuzberg-tesseract/src/page_iterator.rs +253 -253
- data/vendor/kreuzberg-tesseract/src/result_iterator.rs +286 -286
- data/vendor/kreuzberg-tesseract/src/result_renderer.rs +183 -183
- data/vendor/kreuzberg-tesseract/tests/integration_test.rs +211 -211
- data/vendor/rb-sys/.cargo_vcs_info.json +5 -5
- data/vendor/rb-sys/Cargo.lock +393 -393
- data/vendor/rb-sys/Cargo.toml +70 -70
- data/vendor/rb-sys/Cargo.toml.orig +57 -57
- data/vendor/rb-sys/LICENSE-APACHE +190 -190
- data/vendor/rb-sys/LICENSE-MIT +21 -21
- data/vendor/rb-sys/build/features.rs +111 -111
- data/vendor/rb-sys/build/main.rs +286 -286
- data/vendor/rb-sys/build/stable_api_config.rs +155 -155
- data/vendor/rb-sys/build/version.rs +50 -50
- data/vendor/rb-sys/readme.md +36 -36
- data/vendor/rb-sys/src/bindings.rs +21 -21
- data/vendor/rb-sys/src/hidden.rs +11 -11
- data/vendor/rb-sys/src/lib.rs +35 -35
- data/vendor/rb-sys/src/macros.rs +371 -371
- data/vendor/rb-sys/src/memory.rs +53 -53
- data/vendor/rb-sys/src/ruby_abi_version.rs +38 -38
- data/vendor/rb-sys/src/special_consts.rs +31 -31
- data/vendor/rb-sys/src/stable_api/compiled.c +179 -179
- data/vendor/rb-sys/src/stable_api/compiled.rs +257 -257
- data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +324 -324
- data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +332 -332
- data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +325 -325
- data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +323 -323
- data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +339 -339
- data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +339 -339
- data/vendor/rb-sys/src/stable_api.rs +260 -260
- data/vendor/rb-sys/src/symbol.rs +31 -31
- data/vendor/rb-sys/src/tracking_allocator.rs +330 -330
- data/vendor/rb-sys/src/utils.rs +89 -89
- data/vendor/rb-sys/src/value_type.rs +7 -7
- metadata +7 -80
|
@@ -1,154 +1,154 @@
|
|
|
1
|
-
use std::any::Any;
|
|
2
|
-
use std::time::{SystemTime, UNIX_EPOCH};
|
|
3
|
-
|
|
4
|
-
/// Context information captured when a panic occurs.
|
|
5
|
-
///
|
|
6
|
-
/// This struct stores detailed information about where and when a panic happened,
|
|
7
|
-
/// enabling better error reporting across FFI boundaries.
|
|
8
|
-
#[derive(Debug, Clone)]
|
|
9
|
-
pub struct PanicContext {
|
|
10
|
-
/// Source file where the panic occurred
|
|
11
|
-
pub file: &'static str,
|
|
12
|
-
/// Line number where the panic occurred
|
|
13
|
-
pub line: u32,
|
|
14
|
-
/// Function name where the panic occurred
|
|
15
|
-
pub function: &'static str,
|
|
16
|
-
/// Panic message extracted from the panic payload
|
|
17
|
-
pub message: String,
|
|
18
|
-
/// Timestamp when the panic was captured
|
|
19
|
-
pub timestamp: SystemTime,
|
|
20
|
-
}
|
|
21
|
-
|
|
22
|
-
impl PanicContext {
|
|
23
|
-
/// Creates a new PanicContext with the given parameters.
|
|
24
|
-
///
|
|
25
|
-
/// # Arguments
|
|
26
|
-
///
|
|
27
|
-
/// * `file` - Source file path
|
|
28
|
-
/// * `line` - Line number
|
|
29
|
-
/// * `function` - Function name
|
|
30
|
-
/// * `panic_info` - The panic payload to extract message from
|
|
31
|
-
pub fn new(file: &'static str, line: u32, function: &'static str, panic_info: &dyn Any) -> Self {
|
|
32
|
-
let timestamp = std::panic::catch_unwind(SystemTime::now).unwrap_or(UNIX_EPOCH);
|
|
33
|
-
|
|
34
|
-
Self {
|
|
35
|
-
file,
|
|
36
|
-
line,
|
|
37
|
-
function,
|
|
38
|
-
message: extract_panic_message(panic_info),
|
|
39
|
-
timestamp,
|
|
40
|
-
}
|
|
41
|
-
}
|
|
42
|
-
|
|
43
|
-
/// Formats the panic context as a human-readable string.
|
|
44
|
-
pub fn format(&self) -> String {
|
|
45
|
-
format!(
|
|
46
|
-
"Panic at {}:{}:{} - {}",
|
|
47
|
-
self.file, self.line, self.function, self.message
|
|
48
|
-
)
|
|
49
|
-
}
|
|
50
|
-
}
|
|
51
|
-
|
|
52
|
-
/// Maximum panic message length to prevent DoS attacks
|
|
53
|
-
const MAX_PANIC_MESSAGE_LEN: usize = 4096;
|
|
54
|
-
|
|
55
|
-
/// Extracts a human-readable message from a panic payload.
|
|
56
|
-
///
|
|
57
|
-
/// Attempts to downcast the panic payload to common types (String, &str)
|
|
58
|
-
/// to extract a meaningful error message.
|
|
59
|
-
///
|
|
60
|
-
/// Message is truncated to 4KB to prevent DoS attacks via extremely large panic messages.
|
|
61
|
-
///
|
|
62
|
-
/// # Arguments
|
|
63
|
-
///
|
|
64
|
-
/// * `panic_info` - The panic payload from catch_unwind
|
|
65
|
-
///
|
|
66
|
-
/// # Returns
|
|
67
|
-
///
|
|
68
|
-
/// A string representation of the panic message (truncated if necessary)
|
|
69
|
-
pub fn extract_panic_message(panic_info: &dyn Any) -> String {
|
|
70
|
-
let msg = if let Some(s) = panic_info.downcast_ref::<String>() {
|
|
71
|
-
s.clone()
|
|
72
|
-
} else if let Some(s) = panic_info.downcast_ref::<&str>() {
|
|
73
|
-
(*s).to_string()
|
|
74
|
-
} else {
|
|
75
|
-
"Unknown panic payload".to_string()
|
|
76
|
-
};
|
|
77
|
-
|
|
78
|
-
if msg.len() > MAX_PANIC_MESSAGE_LEN {
|
|
79
|
-
let truncate_at = msg.floor_char_boundary(MAX_PANIC_MESSAGE_LEN);
|
|
80
|
-
format!("{}... [truncated]", &msg[..truncate_at])
|
|
81
|
-
} else {
|
|
82
|
-
msg
|
|
83
|
-
}
|
|
84
|
-
}
|
|
85
|
-
|
|
86
|
-
#[cfg(test)]
|
|
87
|
-
mod tests {
|
|
88
|
-
use super::*;
|
|
89
|
-
|
|
90
|
-
#[test]
|
|
91
|
-
fn test_extract_panic_message_string() {
|
|
92
|
-
let panic_msg = "test panic".to_string();
|
|
93
|
-
let msg = extract_panic_message(&panic_msg);
|
|
94
|
-
assert_eq!(msg, "test panic");
|
|
95
|
-
}
|
|
96
|
-
|
|
97
|
-
#[test]
|
|
98
|
-
fn test_extract_panic_message_str() {
|
|
99
|
-
let panic_msg: &str = "test panic";
|
|
100
|
-
let msg = extract_panic_message(&panic_msg);
|
|
101
|
-
assert_eq!(msg, "test panic");
|
|
102
|
-
}
|
|
103
|
-
|
|
104
|
-
#[test]
|
|
105
|
-
fn test_extract_panic_message_unknown() {
|
|
106
|
-
let panic_msg = 42i32;
|
|
107
|
-
let msg = extract_panic_message(&panic_msg);
|
|
108
|
-
assert_eq!(msg, "Unknown panic payload");
|
|
109
|
-
}
|
|
110
|
-
|
|
111
|
-
#[test]
|
|
112
|
-
fn test_panic_context_format() {
|
|
113
|
-
let panic_msg = "test error".to_string();
|
|
114
|
-
let ctx = PanicContext::new("test.rs", 42, "test_function", &panic_msg);
|
|
115
|
-
|
|
116
|
-
let formatted = ctx.format();
|
|
117
|
-
assert!(formatted.contains("test.rs"));
|
|
118
|
-
assert!(formatted.contains("42"));
|
|
119
|
-
assert!(formatted.contains("test_function"));
|
|
120
|
-
assert!(formatted.contains("test error"));
|
|
121
|
-
}
|
|
122
|
-
|
|
123
|
-
#[test]
|
|
124
|
-
fn test_panic_message_truncation() {
|
|
125
|
-
let long_msg = "x".repeat(5000);
|
|
126
|
-
let msg = extract_panic_message(&long_msg);
|
|
127
|
-
assert!(msg.len() <= MAX_PANIC_MESSAGE_LEN + 20);
|
|
128
|
-
assert!(msg.ends_with("... [truncated]"));
|
|
129
|
-
}
|
|
130
|
-
|
|
131
|
-
#[test]
|
|
132
|
-
fn test_panic_message_truncation_utf8_boundary() {
|
|
133
|
-
let mut msg = "x".repeat(4093);
|
|
134
|
-
msg.push('🦀');
|
|
135
|
-
msg.push_str("yyy");
|
|
136
|
-
|
|
137
|
-
let truncated = extract_panic_message(&msg);
|
|
138
|
-
|
|
139
|
-
assert!(truncated.ends_with("... [truncated]"));
|
|
140
|
-
|
|
141
|
-
assert!(std::str::from_utf8(truncated.as_bytes()).is_ok());
|
|
142
|
-
|
|
143
|
-
assert!(!truncated.contains("🦀"));
|
|
144
|
-
assert!(!truncated.contains("yyy"));
|
|
145
|
-
}
|
|
146
|
-
|
|
147
|
-
#[test]
|
|
148
|
-
fn test_panic_message_no_truncation_needed() {
|
|
149
|
-
let short_msg = "short".to_string();
|
|
150
|
-
let msg = extract_panic_message(&short_msg);
|
|
151
|
-
assert_eq!(msg, "short");
|
|
152
|
-
assert!(!msg.contains("[truncated]"));
|
|
153
|
-
}
|
|
154
|
-
}
|
|
1
|
+
use std::any::Any;
|
|
2
|
+
use std::time::{SystemTime, UNIX_EPOCH};
|
|
3
|
+
|
|
4
|
+
/// Context information captured when a panic occurs.
|
|
5
|
+
///
|
|
6
|
+
/// This struct stores detailed information about where and when a panic happened,
|
|
7
|
+
/// enabling better error reporting across FFI boundaries.
|
|
8
|
+
#[derive(Debug, Clone)]
|
|
9
|
+
pub struct PanicContext {
|
|
10
|
+
/// Source file where the panic occurred
|
|
11
|
+
pub file: &'static str,
|
|
12
|
+
/// Line number where the panic occurred
|
|
13
|
+
pub line: u32,
|
|
14
|
+
/// Function name where the panic occurred
|
|
15
|
+
pub function: &'static str,
|
|
16
|
+
/// Panic message extracted from the panic payload
|
|
17
|
+
pub message: String,
|
|
18
|
+
/// Timestamp when the panic was captured
|
|
19
|
+
pub timestamp: SystemTime,
|
|
20
|
+
}
|
|
21
|
+
|
|
22
|
+
impl PanicContext {
|
|
23
|
+
/// Creates a new PanicContext with the given parameters.
|
|
24
|
+
///
|
|
25
|
+
/// # Arguments
|
|
26
|
+
///
|
|
27
|
+
/// * `file` - Source file path
|
|
28
|
+
/// * `line` - Line number
|
|
29
|
+
/// * `function` - Function name
|
|
30
|
+
/// * `panic_info` - The panic payload to extract message from
|
|
31
|
+
pub fn new(file: &'static str, line: u32, function: &'static str, panic_info: &dyn Any) -> Self {
|
|
32
|
+
let timestamp = std::panic::catch_unwind(SystemTime::now).unwrap_or(UNIX_EPOCH);
|
|
33
|
+
|
|
34
|
+
Self {
|
|
35
|
+
file,
|
|
36
|
+
line,
|
|
37
|
+
function,
|
|
38
|
+
message: extract_panic_message(panic_info),
|
|
39
|
+
timestamp,
|
|
40
|
+
}
|
|
41
|
+
}
|
|
42
|
+
|
|
43
|
+
/// Formats the panic context as a human-readable string.
|
|
44
|
+
pub fn format(&self) -> String {
|
|
45
|
+
format!(
|
|
46
|
+
"Panic at {}:{}:{} - {}",
|
|
47
|
+
self.file, self.line, self.function, self.message
|
|
48
|
+
)
|
|
49
|
+
}
|
|
50
|
+
}
|
|
51
|
+
|
|
52
|
+
/// Maximum panic message length to prevent DoS attacks
|
|
53
|
+
const MAX_PANIC_MESSAGE_LEN: usize = 4096;
|
|
54
|
+
|
|
55
|
+
/// Extracts a human-readable message from a panic payload.
|
|
56
|
+
///
|
|
57
|
+
/// Attempts to downcast the panic payload to common types (String, &str)
|
|
58
|
+
/// to extract a meaningful error message.
|
|
59
|
+
///
|
|
60
|
+
/// Message is truncated to 4KB to prevent DoS attacks via extremely large panic messages.
|
|
61
|
+
///
|
|
62
|
+
/// # Arguments
|
|
63
|
+
///
|
|
64
|
+
/// * `panic_info` - The panic payload from catch_unwind
|
|
65
|
+
///
|
|
66
|
+
/// # Returns
|
|
67
|
+
///
|
|
68
|
+
/// A string representation of the panic message (truncated if necessary)
|
|
69
|
+
pub fn extract_panic_message(panic_info: &dyn Any) -> String {
|
|
70
|
+
let msg = if let Some(s) = panic_info.downcast_ref::<String>() {
|
|
71
|
+
s.clone()
|
|
72
|
+
} else if let Some(s) = panic_info.downcast_ref::<&str>() {
|
|
73
|
+
(*s).to_string()
|
|
74
|
+
} else {
|
|
75
|
+
"Unknown panic payload".to_string()
|
|
76
|
+
};
|
|
77
|
+
|
|
78
|
+
if msg.len() > MAX_PANIC_MESSAGE_LEN {
|
|
79
|
+
let truncate_at = msg.floor_char_boundary(MAX_PANIC_MESSAGE_LEN);
|
|
80
|
+
format!("{}... [truncated]", &msg[..truncate_at])
|
|
81
|
+
} else {
|
|
82
|
+
msg
|
|
83
|
+
}
|
|
84
|
+
}
|
|
85
|
+
|
|
86
|
+
#[cfg(test)]
|
|
87
|
+
mod tests {
|
|
88
|
+
use super::*;
|
|
89
|
+
|
|
90
|
+
#[test]
|
|
91
|
+
fn test_extract_panic_message_string() {
|
|
92
|
+
let panic_msg = "test panic".to_string();
|
|
93
|
+
let msg = extract_panic_message(&panic_msg);
|
|
94
|
+
assert_eq!(msg, "test panic");
|
|
95
|
+
}
|
|
96
|
+
|
|
97
|
+
#[test]
|
|
98
|
+
fn test_extract_panic_message_str() {
|
|
99
|
+
let panic_msg: &str = "test panic";
|
|
100
|
+
let msg = extract_panic_message(&panic_msg);
|
|
101
|
+
assert_eq!(msg, "test panic");
|
|
102
|
+
}
|
|
103
|
+
|
|
104
|
+
#[test]
|
|
105
|
+
fn test_extract_panic_message_unknown() {
|
|
106
|
+
let panic_msg = 42i32;
|
|
107
|
+
let msg = extract_panic_message(&panic_msg);
|
|
108
|
+
assert_eq!(msg, "Unknown panic payload");
|
|
109
|
+
}
|
|
110
|
+
|
|
111
|
+
#[test]
|
|
112
|
+
fn test_panic_context_format() {
|
|
113
|
+
let panic_msg = "test error".to_string();
|
|
114
|
+
let ctx = PanicContext::new("test.rs", 42, "test_function", &panic_msg);
|
|
115
|
+
|
|
116
|
+
let formatted = ctx.format();
|
|
117
|
+
assert!(formatted.contains("test.rs"));
|
|
118
|
+
assert!(formatted.contains("42"));
|
|
119
|
+
assert!(formatted.contains("test_function"));
|
|
120
|
+
assert!(formatted.contains("test error"));
|
|
121
|
+
}
|
|
122
|
+
|
|
123
|
+
#[test]
|
|
124
|
+
fn test_panic_message_truncation() {
|
|
125
|
+
let long_msg = "x".repeat(5000);
|
|
126
|
+
let msg = extract_panic_message(&long_msg);
|
|
127
|
+
assert!(msg.len() <= MAX_PANIC_MESSAGE_LEN + 20);
|
|
128
|
+
assert!(msg.ends_with("... [truncated]"));
|
|
129
|
+
}
|
|
130
|
+
|
|
131
|
+
#[test]
|
|
132
|
+
fn test_panic_message_truncation_utf8_boundary() {
|
|
133
|
+
let mut msg = "x".repeat(4093);
|
|
134
|
+
msg.push('🦀');
|
|
135
|
+
msg.push_str("yyy");
|
|
136
|
+
|
|
137
|
+
let truncated = extract_panic_message(&msg);
|
|
138
|
+
|
|
139
|
+
assert!(truncated.ends_with("... [truncated]"));
|
|
140
|
+
|
|
141
|
+
assert!(std::str::from_utf8(truncated.as_bytes()).is_ok());
|
|
142
|
+
|
|
143
|
+
assert!(!truncated.contains("🦀"));
|
|
144
|
+
assert!(!truncated.contains("yyy"));
|
|
145
|
+
}
|
|
146
|
+
|
|
147
|
+
#[test]
|
|
148
|
+
fn test_panic_message_no_truncation_needed() {
|
|
149
|
+
let short_msg = "short".to_string();
|
|
150
|
+
let msg = extract_panic_message(&short_msg);
|
|
151
|
+
assert_eq!(msg, "short");
|
|
152
|
+
assert!(!msg.contains("[truncated]"));
|
|
153
|
+
}
|
|
154
|
+
}
|
|
@@ -1,44 +1,44 @@
|
|
|
1
|
-
use super::error::PdfError;
|
|
2
|
-
use pdfium_render::prelude::*;
|
|
3
|
-
|
|
4
|
-
pub(crate) fn bind_pdfium(
|
|
5
|
-
map_err: fn(String) -> PdfError,
|
|
6
|
-
context: &'static str,
|
|
7
|
-
) -> Result<Box<dyn PdfiumLibraryBindings>, PdfError> {
|
|
8
|
-
#[cfg(all(feature = "pdf", feature = "
|
|
9
|
-
{
|
|
10
|
-
// WASM target: use dynamic binding to WASM module
|
|
11
|
-
// SAFETY: pdfium-render handles WASM module lifecycle internally.
|
|
12
|
-
// For WASM builds, the PDFium library is linked at compile time
|
|
13
|
-
// and the WASM runtime manages initialization.
|
|
14
|
-
#[cfg(target_arch = "wasm32")]
|
|
15
|
-
{
|
|
16
|
-
Pdfium::bind_to_system_library()
|
|
17
|
-
.map_err(|e| map_err(format!("Failed to initialize Pdfium for WASM ({}): {}", context, e)))
|
|
18
|
-
}
|
|
19
|
-
|
|
20
|
-
// Non-WASM targets: extract and link dynamically
|
|
21
|
-
#[cfg(not(target_arch = "wasm32"))]
|
|
22
|
-
{
|
|
23
|
-
let lib_path = crate::pdf::extract_bundled_pdfium()
|
|
24
|
-
.map_err(|e| map_err(format!("Failed to extract bundled Pdfium ({}): {}", context, e)))?;
|
|
25
|
-
|
|
26
|
-
let lib_dir = lib_path.parent().ok_or_else(|| {
|
|
27
|
-
map_err(format!(
|
|
28
|
-
"Failed to determine Pdfium extraction directory for '{}' ({})",
|
|
29
|
-
lib_path.display(),
|
|
30
|
-
context
|
|
31
|
-
))
|
|
32
|
-
})?;
|
|
33
|
-
|
|
34
|
-
Pdfium::bind_to_library(Pdfium::pdfium_platform_library_name_at_path(lib_dir))
|
|
35
|
-
.map_err(|e| map_err(format!("Failed to initialize Pdfium ({}): {}", context, e)))
|
|
36
|
-
}
|
|
37
|
-
}
|
|
38
|
-
|
|
39
|
-
#[cfg(all(feature = "pdf", not(feature = "
|
|
40
|
-
{
|
|
41
|
-
Pdfium::bind_to_system_library()
|
|
42
|
-
.map_err(|e| map_err(format!("Failed to initialize Pdfium ({}): {}", context, e)))
|
|
43
|
-
}
|
|
44
|
-
}
|
|
1
|
+
use super::error::PdfError;
|
|
2
|
+
use pdfium_render::prelude::*;
|
|
3
|
+
|
|
4
|
+
pub(crate) fn bind_pdfium(
|
|
5
|
+
map_err: fn(String) -> PdfError,
|
|
6
|
+
context: &'static str,
|
|
7
|
+
) -> Result<Box<dyn PdfiumLibraryBindings>, PdfError> {
|
|
8
|
+
#[cfg(all(feature = "pdf", feature = "bundled-pdfium"))]
|
|
9
|
+
{
|
|
10
|
+
// WASM target: use dynamic binding to WASM module
|
|
11
|
+
// SAFETY: pdfium-render handles WASM module lifecycle internally.
|
|
12
|
+
// For WASM builds, the PDFium library is linked at compile time
|
|
13
|
+
// and the WASM runtime manages initialization.
|
|
14
|
+
#[cfg(target_arch = "wasm32")]
|
|
15
|
+
{
|
|
16
|
+
Pdfium::bind_to_system_library()
|
|
17
|
+
.map_err(|e| map_err(format!("Failed to initialize Pdfium for WASM ({}): {}", context, e)))
|
|
18
|
+
}
|
|
19
|
+
|
|
20
|
+
// Non-WASM targets: extract and link dynamically
|
|
21
|
+
#[cfg(not(target_arch = "wasm32"))]
|
|
22
|
+
{
|
|
23
|
+
let lib_path = crate::pdf::extract_bundled_pdfium()
|
|
24
|
+
.map_err(|e| map_err(format!("Failed to extract bundled Pdfium ({}): {}", context, e)))?;
|
|
25
|
+
|
|
26
|
+
let lib_dir = lib_path.parent().ok_or_else(|| {
|
|
27
|
+
map_err(format!(
|
|
28
|
+
"Failed to determine Pdfium extraction directory for '{}' ({})",
|
|
29
|
+
lib_path.display(),
|
|
30
|
+
context
|
|
31
|
+
))
|
|
32
|
+
})?;
|
|
33
|
+
|
|
34
|
+
Pdfium::bind_to_library(Pdfium::pdfium_platform_library_name_at_path(lib_dir))
|
|
35
|
+
.map_err(|e| map_err(format!("Failed to initialize Pdfium ({}): {}", context, e)))
|
|
36
|
+
}
|
|
37
|
+
}
|
|
38
|
+
|
|
39
|
+
#[cfg(all(feature = "pdf", not(feature = "bundled-pdfium")))]
|
|
40
|
+
{
|
|
41
|
+
Pdfium::bind_to_system_library()
|
|
42
|
+
.map_err(|e| map_err(format!("Failed to initialize Pdfium ({}): {}", context, e)))
|
|
43
|
+
}
|
|
44
|
+
}
|