kreuzberg 4.0.0.pre.rc.8 → 4.0.0.pre.rc.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +14 -14
- data/.rspec +3 -3
- data/.rubocop.yaml +1 -1
- data/.rubocop.yml +538 -538
- data/Gemfile +8 -8
- data/Gemfile.lock +4 -104
- data/README.md +454 -432
- data/Rakefile +25 -25
- data/Steepfile +47 -47
- data/examples/async_patterns.rb +341 -341
- data/ext/kreuzberg_rb/extconf.rb +45 -45
- data/ext/kreuzberg_rb/native/.cargo/config.toml +2 -2
- data/ext/kreuzberg_rb/native/Cargo.lock +6941 -6721
- data/ext/kreuzberg_rb/native/Cargo.toml +54 -54
- data/ext/kreuzberg_rb/native/README.md +425 -425
- data/ext/kreuzberg_rb/native/build.rs +15 -15
- data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -11
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -14
- data/ext/kreuzberg_rb/native/include/strings.h +20 -20
- data/ext/kreuzberg_rb/native/include/unistd.h +47 -47
- data/ext/kreuzberg_rb/native/src/lib.rs +3158 -3135
- data/extconf.rb +28 -28
- data/kreuzberg.gemspec +214 -182
- data/lib/kreuzberg/api_proxy.rb +142 -142
- data/lib/kreuzberg/cache_api.rb +81 -46
- data/lib/kreuzberg/cli.rb +55 -55
- data/lib/kreuzberg/cli_proxy.rb +127 -127
- data/lib/kreuzberg/config.rb +724 -724
- data/lib/kreuzberg/error_context.rb +80 -32
- data/lib/kreuzberg/errors.rb +118 -118
- data/lib/kreuzberg/extraction_api.rb +340 -85
- data/lib/kreuzberg/mcp_proxy.rb +186 -186
- data/lib/kreuzberg/ocr_backend_protocol.rb +113 -113
- data/lib/kreuzberg/post_processor_protocol.rb +86 -86
- data/lib/kreuzberg/result.rb +279 -279
- data/lib/kreuzberg/setup_lib_path.rb +80 -80
- data/lib/kreuzberg/validator_protocol.rb +89 -89
- data/lib/kreuzberg/version.rb +5 -5
- data/lib/kreuzberg.rb +109 -103
- data/lib/pdfium.dll +0 -0
- data/sig/kreuzberg/internal.rbs +184 -184
- data/sig/kreuzberg.rbs +546 -537
- data/spec/binding/cache_spec.rb +227 -227
- data/spec/binding/cli_proxy_spec.rb +85 -85
- data/spec/binding/cli_spec.rb +55 -55
- data/spec/binding/config_spec.rb +345 -345
- data/spec/binding/config_validation_spec.rb +283 -283
- data/spec/binding/error_handling_spec.rb +213 -213
- data/spec/binding/errors_spec.rb +66 -66
- data/spec/binding/plugins/ocr_backend_spec.rb +307 -307
- data/spec/binding/plugins/postprocessor_spec.rb +269 -269
- data/spec/binding/plugins/validator_spec.rb +274 -274
- data/spec/fixtures/config.toml +39 -39
- data/spec/fixtures/config.yaml +41 -41
- data/spec/fixtures/invalid_config.toml +4 -4
- data/spec/smoke/package_spec.rb +178 -178
- data/spec/spec_helper.rb +42 -42
- data/vendor/Cargo.toml +45 -0
- data/vendor/kreuzberg/Cargo.toml +61 -38
- data/vendor/kreuzberg/README.md +230 -221
- data/vendor/kreuzberg/benches/otel_overhead.rs +48 -48
- data/vendor/kreuzberg/build.rs +843 -891
- data/vendor/kreuzberg/src/api/error.rs +81 -81
- data/vendor/kreuzberg/src/api/handlers.rs +199 -199
- data/vendor/kreuzberg/src/api/mod.rs +79 -79
- data/vendor/kreuzberg/src/api/server.rs +353 -353
- data/vendor/kreuzberg/src/api/types.rs +170 -170
- data/vendor/kreuzberg/src/cache/mod.rs +1167 -1167
- data/vendor/kreuzberg/src/chunking/mod.rs +1877 -1877
- data/vendor/kreuzberg/src/chunking/processor.rs +220 -220
- data/vendor/kreuzberg/src/core/batch_mode.rs +95 -95
- data/vendor/kreuzberg/src/core/config.rs +1080 -1080
- data/vendor/kreuzberg/src/core/extractor.rs +1156 -1156
- data/vendor/kreuzberg/src/core/io.rs +329 -329
- data/vendor/kreuzberg/src/core/mime.rs +605 -605
- data/vendor/kreuzberg/src/core/mod.rs +47 -47
- data/vendor/kreuzberg/src/core/pipeline.rs +1184 -1171
- data/vendor/kreuzberg/src/embeddings.rs +500 -432
- data/vendor/kreuzberg/src/error.rs +431 -431
- data/vendor/kreuzberg/src/extraction/archive.rs +954 -954
- data/vendor/kreuzberg/src/extraction/docx.rs +398 -398
- data/vendor/kreuzberg/src/extraction/email.rs +854 -854
- data/vendor/kreuzberg/src/extraction/excel.rs +688 -688
- data/vendor/kreuzberg/src/extraction/html.rs +601 -569
- data/vendor/kreuzberg/src/extraction/image.rs +491 -491
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +574 -562
- data/vendor/kreuzberg/src/extraction/markdown.rs +213 -213
- data/vendor/kreuzberg/src/extraction/mod.rs +81 -81
- data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -398
- data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -247
- data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -240
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -130
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +284 -284
- data/vendor/kreuzberg/src/extraction/pptx.rs +3100 -3100
- data/vendor/kreuzberg/src/extraction/structured.rs +490 -490
- data/vendor/kreuzberg/src/extraction/table.rs +328 -328
- data/vendor/kreuzberg/src/extraction/text.rs +269 -269
- data/vendor/kreuzberg/src/extraction/xml.rs +333 -333
- data/vendor/kreuzberg/src/extractors/archive.rs +447 -447
- data/vendor/kreuzberg/src/extractors/bibtex.rs +470 -470
- data/vendor/kreuzberg/src/extractors/docbook.rs +504 -504
- data/vendor/kreuzberg/src/extractors/docx.rs +400 -400
- data/vendor/kreuzberg/src/extractors/email.rs +157 -157
- data/vendor/kreuzberg/src/extractors/epub.rs +708 -708
- data/vendor/kreuzberg/src/extractors/excel.rs +345 -345
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +492 -492
- data/vendor/kreuzberg/src/extractors/html.rs +407 -407
- data/vendor/kreuzberg/src/extractors/image.rs +219 -219
- data/vendor/kreuzberg/src/extractors/jats.rs +1054 -1054
- data/vendor/kreuzberg/src/extractors/jupyter.rs +368 -368
- data/vendor/kreuzberg/src/extractors/latex.rs +653 -653
- data/vendor/kreuzberg/src/extractors/markdown.rs +701 -701
- data/vendor/kreuzberg/src/extractors/mod.rs +429 -429
- data/vendor/kreuzberg/src/extractors/odt.rs +628 -628
- data/vendor/kreuzberg/src/extractors/opml.rs +635 -635
- data/vendor/kreuzberg/src/extractors/orgmode.rs +529 -529
- data/vendor/kreuzberg/src/extractors/pdf.rs +749 -673
- data/vendor/kreuzberg/src/extractors/pptx.rs +267 -267
- data/vendor/kreuzberg/src/extractors/rst.rs +577 -577
- data/vendor/kreuzberg/src/extractors/rtf.rs +809 -809
- data/vendor/kreuzberg/src/extractors/security.rs +484 -484
- data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -367
- data/vendor/kreuzberg/src/extractors/structured.rs +142 -142
- data/vendor/kreuzberg/src/extractors/text.rs +265 -265
- data/vendor/kreuzberg/src/extractors/typst.rs +651 -651
- data/vendor/kreuzberg/src/extractors/xml.rs +147 -147
- data/vendor/kreuzberg/src/image/dpi.rs +164 -164
- data/vendor/kreuzberg/src/image/mod.rs +6 -6
- data/vendor/kreuzberg/src/image/preprocessing.rs +417 -417
- data/vendor/kreuzberg/src/image/resize.rs +89 -89
- data/vendor/kreuzberg/src/keywords/config.rs +154 -154
- data/vendor/kreuzberg/src/keywords/mod.rs +237 -237
- data/vendor/kreuzberg/src/keywords/processor.rs +275 -275
- data/vendor/kreuzberg/src/keywords/rake.rs +293 -293
- data/vendor/kreuzberg/src/keywords/types.rs +68 -68
- data/vendor/kreuzberg/src/keywords/yake.rs +163 -163
- data/vendor/kreuzberg/src/language_detection/mod.rs +985 -985
- data/vendor/kreuzberg/src/language_detection/processor.rs +219 -219
- data/vendor/kreuzberg/src/lib.rs +113 -113
- data/vendor/kreuzberg/src/mcp/mod.rs +35 -35
- data/vendor/kreuzberg/src/mcp/server.rs +2076 -2076
- data/vendor/kreuzberg/src/ocr/cache.rs +469 -469
- data/vendor/kreuzberg/src/ocr/error.rs +37 -37
- data/vendor/kreuzberg/src/ocr/hocr.rs +216 -216
- data/vendor/kreuzberg/src/ocr/mod.rs +58 -58
- data/vendor/kreuzberg/src/ocr/processor.rs +863 -863
- data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -4
- data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -144
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +452 -452
- data/vendor/kreuzberg/src/ocr/types.rs +393 -393
- data/vendor/kreuzberg/src/ocr/utils.rs +47 -47
- data/vendor/kreuzberg/src/ocr/validation.rs +206 -206
- data/vendor/kreuzberg/src/panic_context.rs +154 -154
- data/vendor/kreuzberg/src/pdf/bindings.rs +44 -0
- data/vendor/kreuzberg/src/pdf/bundled.rs +346 -328
- data/vendor/kreuzberg/src/pdf/error.rs +130 -130
- data/vendor/kreuzberg/src/pdf/images.rs +139 -139
- data/vendor/kreuzberg/src/pdf/metadata.rs +489 -489
- data/vendor/kreuzberg/src/pdf/mod.rs +68 -66
- data/vendor/kreuzberg/src/pdf/rendering.rs +368 -368
- data/vendor/kreuzberg/src/pdf/table.rs +420 -417
- data/vendor/kreuzberg/src/pdf/text.rs +240 -240
- data/vendor/kreuzberg/src/plugins/extractor.rs +1044 -1044
- data/vendor/kreuzberg/src/plugins/mod.rs +212 -212
- data/vendor/kreuzberg/src/plugins/ocr.rs +639 -639
- data/vendor/kreuzberg/src/plugins/processor.rs +650 -650
- data/vendor/kreuzberg/src/plugins/registry.rs +1339 -1339
- data/vendor/kreuzberg/src/plugins/traits.rs +258 -258
- data/vendor/kreuzberg/src/plugins/validator.rs +967 -967
- data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -1470
- data/vendor/kreuzberg/src/text/mod.rs +25 -25
- data/vendor/kreuzberg/src/text/quality.rs +697 -697
- data/vendor/kreuzberg/src/text/quality_processor.rs +219 -219
- data/vendor/kreuzberg/src/text/string_utils.rs +217 -217
- data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -164
- data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -100
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -796
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -902
- data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -160
- data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -619
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -147
- data/vendor/kreuzberg/src/types.rs +1055 -1055
- data/vendor/kreuzberg/src/utils/mod.rs +17 -17
- data/vendor/kreuzberg/src/utils/quality.rs +959 -959
- data/vendor/kreuzberg/src/utils/string_utils.rs +381 -381
- data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -53
- data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -482
- data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -261
- data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -400
- data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -1205
- data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -280
- data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -425
- data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -172
- data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -622
- data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -1300
- data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -175
- data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -734
- data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -37
- data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -100
- data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -801
- data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -693
- data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -111
- data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -162
- data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -226
- data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -41
- data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -196
- data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -227
- data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -181
- data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -791
- data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -47
- data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -760
- data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -634
- data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -136
- data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -84
- data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -681
- data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -64
- data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -51
- data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -476
- data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -163
- data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -1
- data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -101
- data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -477
- data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -490
- data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -415
- data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -223
- data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -331
- data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -562
- data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -436
- data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -561
- data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -193
- data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -448
- data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -32
- data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -33
- data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -76
- data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -129
- data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -54
- data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -118
- data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -149
- data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -506
- data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -75
- data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -519
- data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -647
- data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -62
- data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -796
- data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -31
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -52
- data/vendor/kreuzberg/tests/api_tests.rs +966 -966
- data/vendor/kreuzberg/tests/archive_integration.rs +545 -545
- data/vendor/kreuzberg/tests/batch_orchestration.rs +556 -556
- data/vendor/kreuzberg/tests/batch_processing.rs +318 -318
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -421
- data/vendor/kreuzberg/tests/concurrency_stress.rs +533 -533
- data/vendor/kreuzberg/tests/config_features.rs +612 -612
- data/vendor/kreuzberg/tests/config_loading_tests.rs +416 -416
- data/vendor/kreuzberg/tests/core_integration.rs +510 -510
- data/vendor/kreuzberg/tests/csv_integration.rs +414 -414
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +500 -500
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -122
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -370
- data/vendor/kreuzberg/tests/email_integration.rs +327 -327
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -275
- data/vendor/kreuzberg/tests/error_handling.rs +402 -402
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -228
- data/vendor/kreuzberg/tests/format_integration.rs +164 -161
- data/vendor/kreuzberg/tests/helpers/mod.rs +142 -142
- data/vendor/kreuzberg/tests/html_table_test.rs +551 -551
- data/vendor/kreuzberg/tests/image_integration.rs +255 -255
- data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -139
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -639
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -704
- data/vendor/kreuzberg/tests/keywords_integration.rs +479 -479
- data/vendor/kreuzberg/tests/keywords_quality.rs +509 -509
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -496
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -490
- data/vendor/kreuzberg/tests/mime_detection.rs +429 -429
- data/vendor/kreuzberg/tests/ocr_configuration.rs +514 -514
- data/vendor/kreuzberg/tests/ocr_errors.rs +698 -698
- data/vendor/kreuzberg/tests/ocr_quality.rs +629 -629
- data/vendor/kreuzberg/tests/ocr_stress.rs +469 -469
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +674 -674
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -616
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -822
- data/vendor/kreuzberg/tests/pdf_integration.rs +45 -45
- data/vendor/kreuzberg/tests/pdfium_linking.rs +374 -374
- data/vendor/kreuzberg/tests/pipeline_integration.rs +1436 -1436
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +776 -776
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +560 -560
- data/vendor/kreuzberg/tests/plugin_system.rs +927 -927
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -783
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +587 -587
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +694 -694
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +775 -775
- data/vendor/kreuzberg/tests/security_validation.rs +416 -416
- data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -888
- data/vendor/kreuzberg/tests/test_fastembed.rs +631 -631
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1260 -1260
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +648 -648
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -87
- data/vendor/kreuzberg-ffi/Cargo.toml +63 -0
- data/vendor/kreuzberg-ffi/README.md +851 -0
- data/vendor/kreuzberg-ffi/build.rs +176 -0
- data/vendor/kreuzberg-ffi/cbindgen.toml +27 -0
- data/vendor/kreuzberg-ffi/kreuzberg-ffi-install.pc +12 -0
- data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +12 -0
- data/vendor/kreuzberg-ffi/kreuzberg.h +1087 -0
- data/vendor/kreuzberg-ffi/src/lib.rs +3616 -0
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +247 -0
- data/vendor/kreuzberg-ffi/tests.disabled/README.md +48 -0
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +299 -0
- data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +346 -0
- data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +232 -0
- data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +470 -0
- data/vendor/kreuzberg-tesseract/.commitlintrc.json +13 -0
- data/vendor/kreuzberg-tesseract/.crate-ignore +2 -0
- data/vendor/kreuzberg-tesseract/Cargo.lock +2933 -0
- data/vendor/kreuzberg-tesseract/Cargo.toml +48 -0
- data/vendor/kreuzberg-tesseract/LICENSE +22 -0
- data/vendor/kreuzberg-tesseract/README.md +399 -0
- data/vendor/kreuzberg-tesseract/build.rs +1354 -0
- data/vendor/kreuzberg-tesseract/patches/README.md +71 -0
- data/vendor/kreuzberg-tesseract/patches/tesseract.diff +199 -0
- data/vendor/kreuzberg-tesseract/src/api.rs +1371 -0
- data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +77 -0
- data/vendor/kreuzberg-tesseract/src/enums.rs +297 -0
- data/vendor/kreuzberg-tesseract/src/error.rs +81 -0
- data/vendor/kreuzberg-tesseract/src/lib.rs +145 -0
- data/vendor/kreuzberg-tesseract/src/monitor.rs +57 -0
- data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +197 -0
- data/vendor/kreuzberg-tesseract/src/page_iterator.rs +253 -0
- data/vendor/kreuzberg-tesseract/src/result_iterator.rs +286 -0
- data/vendor/kreuzberg-tesseract/src/result_renderer.rs +183 -0
- data/vendor/kreuzberg-tesseract/tests/integration_test.rs +211 -0
- data/vendor/rb-sys/.cargo_vcs_info.json +5 -5
- data/vendor/rb-sys/Cargo.lock +393 -393
- data/vendor/rb-sys/Cargo.toml +70 -70
- data/vendor/rb-sys/Cargo.toml.orig +57 -57
- data/vendor/rb-sys/LICENSE-APACHE +190 -190
- data/vendor/rb-sys/LICENSE-MIT +21 -21
- data/vendor/rb-sys/build/features.rs +111 -111
- data/vendor/rb-sys/build/main.rs +286 -286
- data/vendor/rb-sys/build/stable_api_config.rs +155 -155
- data/vendor/rb-sys/build/version.rs +50 -50
- data/vendor/rb-sys/readme.md +36 -36
- data/vendor/rb-sys/src/bindings.rs +21 -21
- data/vendor/rb-sys/src/hidden.rs +11 -11
- data/vendor/rb-sys/src/lib.rs +35 -35
- data/vendor/rb-sys/src/macros.rs +371 -371
- data/vendor/rb-sys/src/memory.rs +53 -53
- data/vendor/rb-sys/src/ruby_abi_version.rs +38 -38
- data/vendor/rb-sys/src/special_consts.rs +31 -31
- data/vendor/rb-sys/src/stable_api/compiled.c +179 -179
- data/vendor/rb-sys/src/stable_api/compiled.rs +257 -257
- data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +324 -324
- data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +332 -332
- data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +325 -325
- data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +323 -323
- data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +339 -339
- data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +339 -339
- data/vendor/rb-sys/src/stable_api.rs +260 -260
- data/vendor/rb-sys/src/symbol.rs +31 -31
- data/vendor/rb-sys/src/tracking_allocator.rs +330 -330
- data/vendor/rb-sys/src/utils.rs +89 -89
- data/vendor/rb-sys/src/value_type.rs +7 -7
- metadata +44 -81
- data/vendor/rb-sys/bin/release.sh +0 -21
|
@@ -1,381 +1,381 @@
|
|
|
1
|
-
use ahash::AHashMap;
|
|
2
|
-
use chardetng::EncodingDetector;
|
|
3
|
-
use encoding_rs::Encoding;
|
|
4
|
-
use once_cell::sync::Lazy;
|
|
5
|
-
use regex::Regex;
|
|
6
|
-
use std::borrow::Cow;
|
|
7
|
-
use std::collections::VecDeque;
|
|
8
|
-
use std::env;
|
|
9
|
-
use std::sync::RwLock;
|
|
10
|
-
|
|
11
|
-
static CONTROL_CHARS: Lazy<Regex> = Lazy::new(|| {
|
|
12
|
-
Regex::new(r"[\x00-\x08\x0B-\x0C\x0E-\x1F\x7F-\x9F]")
|
|
13
|
-
.expect("Control chars regex pattern is valid and should compile")
|
|
14
|
-
});
|
|
15
|
-
static REPLACEMENT_CHARS: Lazy<Regex> =
|
|
16
|
-
Lazy::new(|| Regex::new(r"\u{FFFD}+").expect("Replacement chars regex pattern is valid and should compile"));
|
|
17
|
-
static ISOLATED_COMBINING: Lazy<Regex> = Lazy::new(|| {
|
|
18
|
-
Regex::new(r"[\u{0300}-\u{036F}]+")
|
|
19
|
-
.expect("Isolated combining diacritics regex pattern is valid and should compile")
|
|
20
|
-
});
|
|
21
|
-
static HEBREW_AS_CYRILLIC: Lazy<Regex> = Lazy::new(|| {
|
|
22
|
-
Regex::new(r"[\u{0400}-\u{04FF}]{3,}")
|
|
23
|
-
.expect("Hebrew misencoded as Cyrillic regex pattern is valid and should compile")
|
|
24
|
-
});
|
|
25
|
-
|
|
26
|
-
struct EncodingCache {
|
|
27
|
-
entries: AHashMap<String, &'static Encoding>,
|
|
28
|
-
order: VecDeque<String>,
|
|
29
|
-
max_entries: usize,
|
|
30
|
-
max_bytes: usize,
|
|
31
|
-
current_bytes: usize,
|
|
32
|
-
}
|
|
33
|
-
|
|
34
|
-
impl EncodingCache {
|
|
35
|
-
fn new(max_entries: usize, max_bytes: usize) -> Self {
|
|
36
|
-
Self {
|
|
37
|
-
entries: AHashMap::new(),
|
|
38
|
-
order: VecDeque::with_capacity(max_entries),
|
|
39
|
-
max_entries,
|
|
40
|
-
max_bytes,
|
|
41
|
-
current_bytes: 0,
|
|
42
|
-
}
|
|
43
|
-
}
|
|
44
|
-
|
|
45
|
-
fn get(&mut self, key: &str) -> Option<&'static Encoding> {
|
|
46
|
-
if let Some(&encoding) = self.entries.get(key) {
|
|
47
|
-
if let Some(pos) = self.order.iter().position(|existing| existing == key)
|
|
48
|
-
&& pos + 1 != self.order.len()
|
|
49
|
-
&& let Some(entry) = self.order.remove(pos)
|
|
50
|
-
{
|
|
51
|
-
self.order.push_back(entry);
|
|
52
|
-
}
|
|
53
|
-
return Some(encoding);
|
|
54
|
-
}
|
|
55
|
-
|
|
56
|
-
None
|
|
57
|
-
}
|
|
58
|
-
|
|
59
|
-
fn insert(&mut self, key: String, encoding: &'static Encoding) {
|
|
60
|
-
let key_len = key.len();
|
|
61
|
-
|
|
62
|
-
if let Some(pos) = self.order.iter().position(|existing| existing == &key) {
|
|
63
|
-
self.order.remove(pos);
|
|
64
|
-
self.current_bytes = self.current_bytes.saturating_sub(key_len);
|
|
65
|
-
}
|
|
66
|
-
|
|
67
|
-
if self.entries.contains_key(&key) {
|
|
68
|
-
self.current_bytes = self.current_bytes.saturating_sub(key_len);
|
|
69
|
-
}
|
|
70
|
-
|
|
71
|
-
self.entries.insert(key.clone(), encoding);
|
|
72
|
-
self.current_bytes = self.current_bytes.saturating_add(key_len);
|
|
73
|
-
self.order.push_back(key);
|
|
74
|
-
|
|
75
|
-
self.enforce_bounds();
|
|
76
|
-
}
|
|
77
|
-
|
|
78
|
-
fn enforce_bounds(&mut self) {
|
|
79
|
-
while self.order.len() > self.max_entries || self.current_bytes > self.max_bytes {
|
|
80
|
-
if let Some(oldest) = self.order.pop_front() {
|
|
81
|
-
if self.entries.remove(&oldest).is_some() {
|
|
82
|
-
self.current_bytes = self.current_bytes.saturating_sub(oldest.len());
|
|
83
|
-
}
|
|
84
|
-
} else {
|
|
85
|
-
break;
|
|
86
|
-
}
|
|
87
|
-
}
|
|
88
|
-
}
|
|
89
|
-
|
|
90
|
-
#[cfg(test)]
|
|
91
|
-
fn clear(&mut self) {
|
|
92
|
-
self.entries.clear();
|
|
93
|
-
self.order.clear();
|
|
94
|
-
self.current_bytes = 0;
|
|
95
|
-
}
|
|
96
|
-
|
|
97
|
-
#[cfg(test)]
|
|
98
|
-
fn set_limits(&mut self, max_entries: usize, max_bytes: usize) {
|
|
99
|
-
self.max_entries = max_entries.max(1);
|
|
100
|
-
self.max_bytes = max_bytes.max(1);
|
|
101
|
-
self.enforce_bounds();
|
|
102
|
-
}
|
|
103
|
-
}
|
|
104
|
-
|
|
105
|
-
/// Entry limit used when the environment does not supply a valid override.
const DEFAULT_CACHE_MAX_ENTRIES: usize = 256;
/// Key-byte limit used when the environment does not supply a valid override.
const DEFAULT_CACHE_MAX_BYTES: usize = 256 * 1024;
/// Environment variable overriding the entry limit.
const CACHE_ENV_MAX_ENTRIES: &str = "KREUZBERG_ENCODING_CACHE_MAX_ENTRIES";
/// Environment variable overriding the key-byte limit.
const CACHE_ENV_MAX_BYTES: &str = "KREUZBERG_ENCODING_CACHE_MAX_BYTES";

/// Resolve the `(max_entries, max_bytes)` limits for the encoding cache.
///
/// Each limit is read from its environment variable and falls back to the built-in
/// default when the variable is unset, not valid Unicode, not a `usize`, or zero.
/// Surrounding whitespace in the value is ignored.
fn cache_limits() -> (usize, usize) {
    (
        positive_env_limit(CACHE_ENV_MAX_ENTRIES, DEFAULT_CACHE_MAX_ENTRIES),
        positive_env_limit(CACHE_ENV_MAX_BYTES, DEFAULT_CACHE_MAX_BYTES),
    )
}

/// Read a strictly positive `usize` from the environment variable `var`, or return `default`.
fn positive_env_limit(var: &str, default: usize) -> usize {
    env::var(var)
        .ok()
        .and_then(|raw| raw.trim().parse::<usize>().ok())
        .filter(|&limit| limit > 0)
        .unwrap_or(default)
}
|
|
125
|
-
|
|
126
|
-
static ENCODING_CACHE: Lazy<RwLock<EncodingCache>> = Lazy::new(|| {
|
|
127
|
-
let (entries, bytes) = cache_limits();
|
|
128
|
-
RwLock::new(EncodingCache::new(entries, bytes))
|
|
129
|
-
});
|
|
130
|
-
|
|
131
|
-
#[inline]
|
|
132
|
-
fn chain_replacements<'a>(mut text: Cow<'a, str>, replacements: &[(&Regex, &str)]) -> Cow<'a, str> {
|
|
133
|
-
for (pattern, replacement) in replacements {
|
|
134
|
-
if pattern.is_match(&text) {
|
|
135
|
-
text = Cow::Owned(pattern.replace_all(&text, *replacement).into_owned());
|
|
136
|
-
}
|
|
137
|
-
}
|
|
138
|
-
text
|
|
139
|
-
}
|
|
140
|
-
|
|
141
|
-
fn calculate_cache_key(data: &[u8]) -> String {
|
|
142
|
-
use ahash::AHasher;
|
|
143
|
-
use std::hash::{Hash, Hasher};
|
|
144
|
-
|
|
145
|
-
let mut hasher = AHasher::default();
|
|
146
|
-
let sample = if data.len() > 1024 { &data[..1024] } else { data };
|
|
147
|
-
sample.hash(&mut hasher);
|
|
148
|
-
data.len().hash(&mut hasher);
|
|
149
|
-
format!("{:x}", hasher.finish())
|
|
150
|
-
}
|
|
151
|
-
|
|
152
|
-
/// Decode raw bytes into UTF-8, using heuristics and fallback encodings when necessary.
|
|
153
|
-
///
|
|
154
|
-
/// The function prefers an explicit `encoding`, falls back to the cached guess, probes
|
|
155
|
-
/// an encoding detector, and finally tries a small curated list before returning a
|
|
156
|
-
/// mojibake-cleaned string.
|
|
157
|
-
pub fn safe_decode(byte_data: &[u8], encoding: Option<&str>) -> String {
|
|
158
|
-
if byte_data.is_empty() {
|
|
159
|
-
return String::new();
|
|
160
|
-
}
|
|
161
|
-
|
|
162
|
-
if let Some(enc_name) = encoding
|
|
163
|
-
&& let Some(enc) = Encoding::for_label(enc_name.as_bytes())
|
|
164
|
-
{
|
|
165
|
-
let (decoded, _, _) = enc.decode(byte_data);
|
|
166
|
-
return fix_mojibake_internal(&decoded);
|
|
167
|
-
}
|
|
168
|
-
|
|
169
|
-
let cache_key = calculate_cache_key(byte_data);
|
|
170
|
-
|
|
171
|
-
// OSError/RuntimeError must bubble up - system errors need user reports ~keep
|
|
172
|
-
match ENCODING_CACHE.write() {
|
|
173
|
-
Ok(mut cache) => {
|
|
174
|
-
if let Some(cached_encoding) = cache.get(&cache_key) {
|
|
175
|
-
let (decoded, _, _) = cached_encoding.decode(byte_data);
|
|
176
|
-
return fix_mojibake_internal(&decoded);
|
|
177
|
-
}
|
|
178
|
-
}
|
|
179
|
-
Err(e) => {
|
|
180
|
-
// Lock poisoning should never happen in normal operation ~keep
|
|
181
|
-
tracing::debug!(error = %e, "encoding cache read lock poisoned; continuing without cache");
|
|
182
|
-
}
|
|
183
|
-
}
|
|
184
|
-
|
|
185
|
-
let mut detector = EncodingDetector::new();
|
|
186
|
-
detector.feed(byte_data, true);
|
|
187
|
-
let encoding = detector.guess(None, true);
|
|
188
|
-
|
|
189
|
-
// OSError/RuntimeError must bubble up - system errors need user reports ~keep
|
|
190
|
-
match ENCODING_CACHE.write() {
|
|
191
|
-
Ok(mut cache) => {
|
|
192
|
-
cache.insert(cache_key, encoding);
|
|
193
|
-
}
|
|
194
|
-
Err(e) => {
|
|
195
|
-
// Lock poisoning should never happen in normal operation ~keep
|
|
196
|
-
tracing::debug!(error = %e, "encoding cache write lock poisoned; continuing without cache");
|
|
197
|
-
}
|
|
198
|
-
}
|
|
199
|
-
|
|
200
|
-
let (decoded, _, had_errors) = encoding.decode(byte_data);
|
|
201
|
-
|
|
202
|
-
if had_errors {
|
|
203
|
-
for enc_name in &[
|
|
204
|
-
"windows-1255",
|
|
205
|
-
"iso-8859-8",
|
|
206
|
-
"windows-1256",
|
|
207
|
-
"iso-8859-6",
|
|
208
|
-
"windows-1252",
|
|
209
|
-
"cp1251",
|
|
210
|
-
] {
|
|
211
|
-
if let Some(enc) = Encoding::for_label(enc_name.as_bytes()) {
|
|
212
|
-
let (test_decoded, _, test_errors) = enc.decode(byte_data);
|
|
213
|
-
if !test_errors && calculate_text_confidence_internal(&test_decoded) > 0.5 {
|
|
214
|
-
return fix_mojibake_internal(&test_decoded);
|
|
215
|
-
}
|
|
216
|
-
}
|
|
217
|
-
}
|
|
218
|
-
}
|
|
219
|
-
|
|
220
|
-
let final_text = fix_mojibake_internal(&decoded);
|
|
221
|
-
|
|
222
|
-
if had_errors {
|
|
223
|
-
let confidence = calculate_text_confidence_internal(&final_text);
|
|
224
|
-
if confidence < 0.6 {
|
|
225
|
-
let preview: String = final_text.chars().filter(|c| !c.is_control()).take(80).collect();
|
|
226
|
-
|
|
227
|
-
tracing::debug!(
|
|
228
|
-
target: "kreuzberg::encoding",
|
|
229
|
-
"safe_decode produced low-confidence output after fallback attempts; encoding={}, confidence={:.3}, len={}, preview=\"{}\"",
|
|
230
|
-
encoding.name(),
|
|
231
|
-
confidence,
|
|
232
|
-
final_text.len(),
|
|
233
|
-
preview
|
|
234
|
-
);
|
|
235
|
-
}
|
|
236
|
-
}
|
|
237
|
-
|
|
238
|
-
final_text
|
|
239
|
-
}
|
|
240
|
-
|
|
241
|
-
/// Estimate how trustworthy a decoded string is on a 0.0–1.0 scale.
|
|
242
|
-
///
|
|
243
|
-
/// Scores close to 1.0 indicate mostly printable characters, whereas lower scores
|
|
244
|
-
/// point to mojibake, control characters, or suspicious character mixes.
|
|
245
|
-
pub fn calculate_text_confidence(text: &str) -> f64 {
|
|
246
|
-
calculate_text_confidence_internal(text)
|
|
247
|
-
}
|
|
248
|
-
|
|
249
|
-
fn calculate_text_confidence_internal(text: &str) -> f64 {
|
|
250
|
-
if text.is_empty() {
|
|
251
|
-
return 0.0;
|
|
252
|
-
}
|
|
253
|
-
|
|
254
|
-
let total_chars = text.len() as f64;
|
|
255
|
-
|
|
256
|
-
let replacement_count = REPLACEMENT_CHARS.find_iter(text).count() as f64;
|
|
257
|
-
let control_count = CONTROL_CHARS.find_iter(text).count() as f64;
|
|
258
|
-
|
|
259
|
-
let penalty = (replacement_count + control_count * 2.0) / total_chars;
|
|
260
|
-
|
|
261
|
-
let readable_chars = text
|
|
262
|
-
.chars()
|
|
263
|
-
.filter(|c| c.is_ascii_graphic() || c.is_whitespace())
|
|
264
|
-
.count() as f64;
|
|
265
|
-
|
|
266
|
-
let readability_score = readable_chars / total_chars;
|
|
267
|
-
|
|
268
|
-
let cyrillic_matches = HEBREW_AS_CYRILLIC.find_iter(text);
|
|
269
|
-
let cyrillic_length: usize = cyrillic_matches.map(|m| m.len()).sum();
|
|
270
|
-
|
|
271
|
-
let mut final_penalty = penalty;
|
|
272
|
-
if cyrillic_length as f64 > total_chars * 0.1 {
|
|
273
|
-
final_penalty += 0.3;
|
|
274
|
-
}
|
|
275
|
-
|
|
276
|
-
(readability_score - final_penalty).clamp(0.0, 1.0)
|
|
277
|
-
}
|
|
278
|
-
|
|
279
|
-
/// Strip control characters and replacement glyphs that typically arise from mojibake.
|
|
280
|
-
pub fn fix_mojibake(text: &str) -> String {
|
|
281
|
-
fix_mojibake_internal(text)
|
|
282
|
-
}
|
|
283
|
-
|
|
284
|
-
fn fix_mojibake_internal(text: &str) -> String {
|
|
285
|
-
if text.is_empty() {
|
|
286
|
-
return text.to_string();
|
|
287
|
-
}
|
|
288
|
-
|
|
289
|
-
let replacements = [
|
|
290
|
-
(&*CONTROL_CHARS, ""),
|
|
291
|
-
(&*REPLACEMENT_CHARS, ""),
|
|
292
|
-
(&*ISOLATED_COMBINING, ""),
|
|
293
|
-
];
|
|
294
|
-
|
|
295
|
-
chain_replacements(Cow::Borrowed(text), &replacements).into_owned()
|
|
296
|
-
}
|
|
297
|
-
|
|
298
|
-
#[cfg(test)]
mod tests {
    use super::*;
    use encoding_rs::Encoding;

    /// Build a private cache with the given limits.
    ///
    /// The cache tests must not touch the process-wide `ENCODING_CACHE`: tests run
    /// in parallel, and shrinking or clearing the shared cache (or panicking while
    /// holding its lock) would leak into every other test that calls `safe_decode`.
    fn isolated_cache(max_entries: usize, max_bytes: usize) -> EncodingCache {
        let mut cache = EncodingCache::new(DEFAULT_CACHE_MAX_ENTRIES, DEFAULT_CACHE_MAX_BYTES);
        cache.clear();
        cache.set_limits(max_entries, max_bytes);
        cache
    }

    #[test]
    fn test_safe_decode_empty() {
        assert_eq!(safe_decode(b"", None), "");
    }

    #[test]
    fn test_safe_decode_ascii() {
        let text = b"Hello, World!";
        assert_eq!(safe_decode(text, None), "Hello, World!");
    }

    #[test]
    fn test_safe_decode_utf8() {
        let text = "Hello, 世界! مرحبا".as_bytes();
        assert_eq!(safe_decode(text, None), "Hello, 世界! مرحبا");
    }

    #[test]
    fn test_encoding_cache_eviction() {
        let mut cache = isolated_cache(4, 64);

        let encoding = Encoding::for_label(b"utf-8").expect("utf-8 encoding should exist");

        for i in 0..8 {
            cache.insert(format!("key{}", i), encoding);
        }

        // Only the four most recent keys survive the entry limit.
        assert!(cache.entries.len() <= 4);
        assert!(!cache.entries.contains_key("key0"));
        assert!(cache.entries.contains_key("key7"));
    }

    #[test]
    fn test_encoding_cache_byte_limit_eviction() {
        let mut cache = isolated_cache(16, 16);

        let encoding = Encoding::for_label(b"utf-8").expect("utf-8 encoding should exist");

        // 5 + 15 key bytes exceed the 16-byte budget, so the older key is dropped.
        cache.insert("short".to_string(), encoding);
        cache.insert("much-longer-key".to_string(), encoding);

        assert!(cache.entries.contains_key("much-longer-key"));
        assert!(!cache.entries.contains_key("short"));
    }

    #[test]
    fn test_calculate_text_confidence_empty() {
        assert_eq!(calculate_text_confidence(""), 0.0);
    }

    #[test]
    fn test_calculate_text_confidence_clean_text() {
        let text = "This is clean, readable text without any issues.";
        let confidence = calculate_text_confidence(text);
        assert!(confidence > 0.9);
    }

    #[test]
    fn test_fix_mojibake_empty() {
        assert_eq!(fix_mojibake(""), "");
    }

    #[test]
    fn test_fix_mojibake_clean_text() {
        let text = "Clean text without mojibake";
        assert_eq!(fix_mojibake(text), text);
    }

    #[test]
    fn test_fix_mojibake_control_chars() {
        let text = "Text\x00with\x01control\x1Fchars";
        let fixed = fix_mojibake(text);
        assert_eq!(fixed, "Textwithcontrolchars");
    }
}
|
|
1
|
+
use ahash::AHashMap;
|
|
2
|
+
use chardetng::EncodingDetector;
|
|
3
|
+
use encoding_rs::Encoding;
|
|
4
|
+
use once_cell::sync::Lazy;
|
|
5
|
+
use regex::Regex;
|
|
6
|
+
use std::borrow::Cow;
|
|
7
|
+
use std::collections::VecDeque;
|
|
8
|
+
use std::env;
|
|
9
|
+
use std::sync::RwLock;
|
|
10
|
+
|
|
11
|
+
/// Matches one control character: U+0000–U+0008, U+000B–U+000C, U+000E–U+001F
/// and U+007F–U+009F. Tab, line feed and carriage return are not in the class.
static CONTROL_CHARS: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r"[\x00-\x08\x0B-\x0C\x0E-\x1F\x7F-\x9F]")
        .expect("Control chars regex pattern is valid and should compile")
});
/// Matches a run of one or more U+FFFD replacement characters as a single match.
static REPLACEMENT_CHARS: Lazy<Regex> =
    Lazy::new(|| Regex::new(r"\u{FFFD}+").expect("Replacement chars regex pattern is valid and should compile"));
/// Matches a run of one or more code points in U+0300–U+036F.
///
/// NOTE(review): the pattern has no context check, so it matches these code points
/// wherever they occur, not only when they lack a base character — confirm that
/// stripping them from decomposed (NFD) text is intended.
static ISOLATED_COMBINING: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r"[\u{0300}-\u{036F}]+")
        .expect("Isolated combining diacritics regex pattern is valid and should compile")
});
/// Matches a run of three or more code points in U+0400–U+04FF, used as a
/// signal that Hebrew text may have been decoded as Cyrillic.
static HEBREW_AS_CYRILLIC: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r"[\u{0400}-\u{04FF}]{3,}")
        .expect("Hebrew misencoded as Cyrillic regex pattern is valid and should compile")
});
|
|
25
|
+
|
|
26
|
+
struct EncodingCache {
|
|
27
|
+
entries: AHashMap<String, &'static Encoding>,
|
|
28
|
+
order: VecDeque<String>,
|
|
29
|
+
max_entries: usize,
|
|
30
|
+
max_bytes: usize,
|
|
31
|
+
current_bytes: usize,
|
|
32
|
+
}
|
|
33
|
+
|
|
34
|
+
impl EncodingCache {
|
|
35
|
+
fn new(max_entries: usize, max_bytes: usize) -> Self {
|
|
36
|
+
Self {
|
|
37
|
+
entries: AHashMap::new(),
|
|
38
|
+
order: VecDeque::with_capacity(max_entries),
|
|
39
|
+
max_entries,
|
|
40
|
+
max_bytes,
|
|
41
|
+
current_bytes: 0,
|
|
42
|
+
}
|
|
43
|
+
}
|
|
44
|
+
|
|
45
|
+
fn get(&mut self, key: &str) -> Option<&'static Encoding> {
|
|
46
|
+
if let Some(&encoding) = self.entries.get(key) {
|
|
47
|
+
if let Some(pos) = self.order.iter().position(|existing| existing == key)
|
|
48
|
+
&& pos + 1 != self.order.len()
|
|
49
|
+
&& let Some(entry) = self.order.remove(pos)
|
|
50
|
+
{
|
|
51
|
+
self.order.push_back(entry);
|
|
52
|
+
}
|
|
53
|
+
return Some(encoding);
|
|
54
|
+
}
|
|
55
|
+
|
|
56
|
+
None
|
|
57
|
+
}
|
|
58
|
+
|
|
59
|
+
fn insert(&mut self, key: String, encoding: &'static Encoding) {
|
|
60
|
+
let key_len = key.len();
|
|
61
|
+
|
|
62
|
+
if let Some(pos) = self.order.iter().position(|existing| existing == &key) {
|
|
63
|
+
self.order.remove(pos);
|
|
64
|
+
self.current_bytes = self.current_bytes.saturating_sub(key_len);
|
|
65
|
+
}
|
|
66
|
+
|
|
67
|
+
if self.entries.contains_key(&key) {
|
|
68
|
+
self.current_bytes = self.current_bytes.saturating_sub(key_len);
|
|
69
|
+
}
|
|
70
|
+
|
|
71
|
+
self.entries.insert(key.clone(), encoding);
|
|
72
|
+
self.current_bytes = self.current_bytes.saturating_add(key_len);
|
|
73
|
+
self.order.push_back(key);
|
|
74
|
+
|
|
75
|
+
self.enforce_bounds();
|
|
76
|
+
}
|
|
77
|
+
|
|
78
|
+
fn enforce_bounds(&mut self) {
|
|
79
|
+
while self.order.len() > self.max_entries || self.current_bytes > self.max_bytes {
|
|
80
|
+
if let Some(oldest) = self.order.pop_front() {
|
|
81
|
+
if self.entries.remove(&oldest).is_some() {
|
|
82
|
+
self.current_bytes = self.current_bytes.saturating_sub(oldest.len());
|
|
83
|
+
}
|
|
84
|
+
} else {
|
|
85
|
+
break;
|
|
86
|
+
}
|
|
87
|
+
}
|
|
88
|
+
}
|
|
89
|
+
|
|
90
|
+
#[cfg(test)]
|
|
91
|
+
fn clear(&mut self) {
|
|
92
|
+
self.entries.clear();
|
|
93
|
+
self.order.clear();
|
|
94
|
+
self.current_bytes = 0;
|
|
95
|
+
}
|
|
96
|
+
|
|
97
|
+
#[cfg(test)]
|
|
98
|
+
fn set_limits(&mut self, max_entries: usize, max_bytes: usize) {
|
|
99
|
+
self.max_entries = max_entries.max(1);
|
|
100
|
+
self.max_bytes = max_bytes.max(1);
|
|
101
|
+
self.enforce_bounds();
|
|
102
|
+
}
|
|
103
|
+
}
|
|
104
|
+
|
|
105
|
+
/// Entry limit used when the environment does not provide a valid override.
const DEFAULT_CACHE_MAX_ENTRIES: usize = 256;
/// Key-byte limit used when the environment does not provide a valid override.
const DEFAULT_CACHE_MAX_BYTES: usize = 256 * 1024;
/// Environment variable overriding the entry limit.
const CACHE_ENV_MAX_ENTRIES: &str = "KREUZBERG_ENCODING_CACHE_MAX_ENTRIES";
/// Environment variable overriding the key-byte limit.
const CACHE_ENV_MAX_BYTES: &str = "KREUZBERG_ENCODING_CACHE_MAX_BYTES";

/// Resolve the `(max_entries, max_bytes)` limits for the encoding cache.
///
/// Each limit comes from its environment variable when that variable is set
/// and parses as a positive `usize`; otherwise the built-in default is used.
fn cache_limits() -> (usize, usize) {
    // Unset, unparsable and zero values all fall back to the default.
    let positive_or = |var: &str, fallback: usize| -> usize {
        match env::var(var) {
            Ok(raw) => match raw.parse::<usize>() {
                Ok(limit) if limit > 0 => limit,
                _ => fallback,
            },
            Err(_) => fallback,
        }
    };

    let entry_limit = positive_or(CACHE_ENV_MAX_ENTRIES, DEFAULT_CACHE_MAX_ENTRIES);
    let byte_limit = positive_or(CACHE_ENV_MAX_BYTES, DEFAULT_CACHE_MAX_BYTES);

    (entry_limit, byte_limit)
}
|
|
125
|
+
|
|
126
|
+
/// Process-wide encoding cache guarded by an `RwLock`.
///
/// Built lazily on first access; the limits come from `cache_limits()` at that
/// moment and are not re-read afterwards.
static ENCODING_CACHE: Lazy<RwLock<EncodingCache>> = Lazy::new(|| {
    let (entries, bytes) = cache_limits();
    RwLock::new(EncodingCache::new(entries, bytes))
});
|
|
130
|
+
|
|
131
|
+
#[inline]
|
|
132
|
+
fn chain_replacements<'a>(mut text: Cow<'a, str>, replacements: &[(&Regex, &str)]) -> Cow<'a, str> {
|
|
133
|
+
for (pattern, replacement) in replacements {
|
|
134
|
+
if pattern.is_match(&text) {
|
|
135
|
+
text = Cow::Owned(pattern.replace_all(&text, *replacement).into_owned());
|
|
136
|
+
}
|
|
137
|
+
}
|
|
138
|
+
text
|
|
139
|
+
}
|
|
140
|
+
|
|
141
|
+
fn calculate_cache_key(data: &[u8]) -> String {
|
|
142
|
+
use ahash::AHasher;
|
|
143
|
+
use std::hash::{Hash, Hasher};
|
|
144
|
+
|
|
145
|
+
let mut hasher = AHasher::default();
|
|
146
|
+
let sample = if data.len() > 1024 { &data[..1024] } else { data };
|
|
147
|
+
sample.hash(&mut hasher);
|
|
148
|
+
data.len().hash(&mut hasher);
|
|
149
|
+
format!("{:x}", hasher.finish())
|
|
150
|
+
}
|
|
151
|
+
|
|
152
|
+
/// Decode raw bytes into UTF-8, using heuristics and fallback encodings when necessary.
|
|
153
|
+
///
|
|
154
|
+
/// The function prefers an explicit `encoding`, falls back to the cached guess, probes
|
|
155
|
+
/// an encoding detector, and finally tries a small curated list before returning a
|
|
156
|
+
/// mojibake-cleaned string.
|
|
157
|
+
pub fn safe_decode(byte_data: &[u8], encoding: Option<&str>) -> String {
|
|
158
|
+
if byte_data.is_empty() {
|
|
159
|
+
return String::new();
|
|
160
|
+
}
|
|
161
|
+
|
|
162
|
+
if let Some(enc_name) = encoding
|
|
163
|
+
&& let Some(enc) = Encoding::for_label(enc_name.as_bytes())
|
|
164
|
+
{
|
|
165
|
+
let (decoded, _, _) = enc.decode(byte_data);
|
|
166
|
+
return fix_mojibake_internal(&decoded);
|
|
167
|
+
}
|
|
168
|
+
|
|
169
|
+
let cache_key = calculate_cache_key(byte_data);
|
|
170
|
+
|
|
171
|
+
// OSError/RuntimeError must bubble up - system errors need user reports ~keep
|
|
172
|
+
match ENCODING_CACHE.write() {
|
|
173
|
+
Ok(mut cache) => {
|
|
174
|
+
if let Some(cached_encoding) = cache.get(&cache_key) {
|
|
175
|
+
let (decoded, _, _) = cached_encoding.decode(byte_data);
|
|
176
|
+
return fix_mojibake_internal(&decoded);
|
|
177
|
+
}
|
|
178
|
+
}
|
|
179
|
+
Err(e) => {
|
|
180
|
+
// Lock poisoning should never happen in normal operation ~keep
|
|
181
|
+
tracing::debug!(error = %e, "encoding cache read lock poisoned; continuing without cache");
|
|
182
|
+
}
|
|
183
|
+
}
|
|
184
|
+
|
|
185
|
+
let mut detector = EncodingDetector::new();
|
|
186
|
+
detector.feed(byte_data, true);
|
|
187
|
+
let encoding = detector.guess(None, true);
|
|
188
|
+
|
|
189
|
+
// OSError/RuntimeError must bubble up - system errors need user reports ~keep
|
|
190
|
+
match ENCODING_CACHE.write() {
|
|
191
|
+
Ok(mut cache) => {
|
|
192
|
+
cache.insert(cache_key, encoding);
|
|
193
|
+
}
|
|
194
|
+
Err(e) => {
|
|
195
|
+
// Lock poisoning should never happen in normal operation ~keep
|
|
196
|
+
tracing::debug!(error = %e, "encoding cache write lock poisoned; continuing without cache");
|
|
197
|
+
}
|
|
198
|
+
}
|
|
199
|
+
|
|
200
|
+
let (decoded, _, had_errors) = encoding.decode(byte_data);
|
|
201
|
+
|
|
202
|
+
if had_errors {
|
|
203
|
+
for enc_name in &[
|
|
204
|
+
"windows-1255",
|
|
205
|
+
"iso-8859-8",
|
|
206
|
+
"windows-1256",
|
|
207
|
+
"iso-8859-6",
|
|
208
|
+
"windows-1252",
|
|
209
|
+
"cp1251",
|
|
210
|
+
] {
|
|
211
|
+
if let Some(enc) = Encoding::for_label(enc_name.as_bytes()) {
|
|
212
|
+
let (test_decoded, _, test_errors) = enc.decode(byte_data);
|
|
213
|
+
if !test_errors && calculate_text_confidence_internal(&test_decoded) > 0.5 {
|
|
214
|
+
return fix_mojibake_internal(&test_decoded);
|
|
215
|
+
}
|
|
216
|
+
}
|
|
217
|
+
}
|
|
218
|
+
}
|
|
219
|
+
|
|
220
|
+
let final_text = fix_mojibake_internal(&decoded);
|
|
221
|
+
|
|
222
|
+
if had_errors {
|
|
223
|
+
let confidence = calculate_text_confidence_internal(&final_text);
|
|
224
|
+
if confidence < 0.6 {
|
|
225
|
+
let preview: String = final_text.chars().filter(|c| !c.is_control()).take(80).collect();
|
|
226
|
+
|
|
227
|
+
tracing::debug!(
|
|
228
|
+
target: "kreuzberg::encoding",
|
|
229
|
+
"safe_decode produced low-confidence output after fallback attempts; encoding={}, confidence={:.3}, len={}, preview=\"{}\"",
|
|
230
|
+
encoding.name(),
|
|
231
|
+
confidence,
|
|
232
|
+
final_text.len(),
|
|
233
|
+
preview
|
|
234
|
+
);
|
|
235
|
+
}
|
|
236
|
+
}
|
|
237
|
+
|
|
238
|
+
final_text
|
|
239
|
+
}
|
|
240
|
+
|
|
241
|
+
/// Estimate how trustworthy a decoded string is on a 0.0–1.0 scale.
|
|
242
|
+
///
|
|
243
|
+
/// Scores close to 1.0 indicate mostly printable characters, whereas lower scores
|
|
244
|
+
/// point to mojibake, control characters, or suspicious character mixes.
|
|
245
|
+
pub fn calculate_text_confidence(text: &str) -> f64 {
|
|
246
|
+
calculate_text_confidence_internal(text)
|
|
247
|
+
}
|
|
248
|
+
|
|
249
|
+
fn calculate_text_confidence_internal(text: &str) -> f64 {
|
|
250
|
+
if text.is_empty() {
|
|
251
|
+
return 0.0;
|
|
252
|
+
}
|
|
253
|
+
|
|
254
|
+
let total_chars = text.len() as f64;
|
|
255
|
+
|
|
256
|
+
let replacement_count = REPLACEMENT_CHARS.find_iter(text).count() as f64;
|
|
257
|
+
let control_count = CONTROL_CHARS.find_iter(text).count() as f64;
|
|
258
|
+
|
|
259
|
+
let penalty = (replacement_count + control_count * 2.0) / total_chars;
|
|
260
|
+
|
|
261
|
+
let readable_chars = text
|
|
262
|
+
.chars()
|
|
263
|
+
.filter(|c| c.is_ascii_graphic() || c.is_whitespace())
|
|
264
|
+
.count() as f64;
|
|
265
|
+
|
|
266
|
+
let readability_score = readable_chars / total_chars;
|
|
267
|
+
|
|
268
|
+
let cyrillic_matches = HEBREW_AS_CYRILLIC.find_iter(text);
|
|
269
|
+
let cyrillic_length: usize = cyrillic_matches.map(|m| m.len()).sum();
|
|
270
|
+
|
|
271
|
+
let mut final_penalty = penalty;
|
|
272
|
+
if cyrillic_length as f64 > total_chars * 0.1 {
|
|
273
|
+
final_penalty += 0.3;
|
|
274
|
+
}
|
|
275
|
+
|
|
276
|
+
(readability_score - final_penalty).clamp(0.0, 1.0)
|
|
277
|
+
}
|
|
278
|
+
|
|
279
|
+
/// Strip control characters and replacement glyphs that typically arise from mojibake.
|
|
280
|
+
pub fn fix_mojibake(text: &str) -> String {
|
|
281
|
+
fix_mojibake_internal(text)
|
|
282
|
+
}
|
|
283
|
+
|
|
284
|
+
fn fix_mojibake_internal(text: &str) -> String {
|
|
285
|
+
if text.is_empty() {
|
|
286
|
+
return text.to_string();
|
|
287
|
+
}
|
|
288
|
+
|
|
289
|
+
let replacements = [
|
|
290
|
+
(&*CONTROL_CHARS, ""),
|
|
291
|
+
(&*REPLACEMENT_CHARS, ""),
|
|
292
|
+
(&*ISOLATED_COMBINING, ""),
|
|
293
|
+
];
|
|
294
|
+
|
|
295
|
+
chain_replacements(Cow::Borrowed(text), &replacements).into_owned()
|
|
296
|
+
}
|
|
297
|
+
|
|
298
|
+
#[cfg(test)]
mod tests {
    use super::*;
    use encoding_rs::Encoding;

    /// Build a private cache with the given limits.
    ///
    /// The cache tests must not touch the process-wide `ENCODING_CACHE`: tests run
    /// in parallel, and shrinking or clearing the shared cache (or panicking while
    /// holding its lock) would leak into every other test that calls `safe_decode`.
    fn isolated_cache(max_entries: usize, max_bytes: usize) -> EncodingCache {
        let mut cache = EncodingCache::new(DEFAULT_CACHE_MAX_ENTRIES, DEFAULT_CACHE_MAX_BYTES);
        cache.clear();
        cache.set_limits(max_entries, max_bytes);
        cache
    }

    #[test]
    fn test_safe_decode_empty() {
        assert_eq!(safe_decode(b"", None), "");
    }

    #[test]
    fn test_safe_decode_ascii() {
        let text = b"Hello, World!";
        assert_eq!(safe_decode(text, None), "Hello, World!");
    }

    #[test]
    fn test_safe_decode_utf8() {
        let text = "Hello, 世界! مرحبا".as_bytes();
        assert_eq!(safe_decode(text, None), "Hello, 世界! مرحبا");
    }

    #[test]
    fn test_encoding_cache_eviction() {
        let mut cache = isolated_cache(4, 64);

        let encoding = Encoding::for_label(b"utf-8").expect("utf-8 encoding should exist");

        for i in 0..8 {
            cache.insert(format!("key{}", i), encoding);
        }

        // Only the four most recent keys survive the entry limit.
        assert!(cache.entries.len() <= 4);
        assert!(!cache.entries.contains_key("key0"));
        assert!(cache.entries.contains_key("key7"));
    }

    #[test]
    fn test_encoding_cache_byte_limit_eviction() {
        let mut cache = isolated_cache(16, 16);

        let encoding = Encoding::for_label(b"utf-8").expect("utf-8 encoding should exist");

        // 5 + 15 key bytes exceed the 16-byte budget, so the older key is dropped.
        cache.insert("short".to_string(), encoding);
        cache.insert("much-longer-key".to_string(), encoding);

        assert!(cache.entries.contains_key("much-longer-key"));
        assert!(!cache.entries.contains_key("short"));
    }

    #[test]
    fn test_calculate_text_confidence_empty() {
        assert_eq!(calculate_text_confidence(""), 0.0);
    }

    #[test]
    fn test_calculate_text_confidence_clean_text() {
        let text = "This is clean, readable text without any issues.";
        let confidence = calculate_text_confidence(text);
        assert!(confidence > 0.9);
    }

    #[test]
    fn test_fix_mojibake_empty() {
        assert_eq!(fix_mojibake(""), "");
    }

    #[test]
    fn test_fix_mojibake_clean_text() {
        let text = "Clean text without mojibake";
        assert_eq!(fix_mojibake(text), text);
    }

    #[test]
    fn test_fix_mojibake_control_chars() {
        let text = "Text\x00with\x01control\x1Fchars";
        let fixed = fix_mojibake(text);
        assert_eq!(fixed, "Textwithcontrolchars");
    }
}
|