kreuzberg 4.0.0.pre.rc.13 → 4.0.0.pre.rc.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +14 -14
- data/.rspec +3 -3
- data/.rubocop.yaml +1 -1
- data/.rubocop.yml +538 -538
- data/Gemfile +8 -8
- data/Gemfile.lock +105 -2
- data/README.md +454 -454
- data/Rakefile +33 -25
- data/Steepfile +47 -47
- data/examples/async_patterns.rb +341 -341
- data/ext/kreuzberg_rb/extconf.rb +45 -45
- data/ext/kreuzberg_rb/native/.cargo/config.toml +2 -2
- data/ext/kreuzberg_rb/native/Cargo.lock +6940 -6941
- data/ext/kreuzberg_rb/native/Cargo.toml +54 -54
- data/ext/kreuzberg_rb/native/README.md +425 -425
- data/ext/kreuzberg_rb/native/build.rs +15 -15
- data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -11
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -14
- data/ext/kreuzberg_rb/native/include/strings.h +20 -20
- data/ext/kreuzberg_rb/native/include/unistd.h +47 -47
- data/ext/kreuzberg_rb/native/src/lib.rs +3158 -3158
- data/extconf.rb +28 -28
- data/kreuzberg.gemspec +214 -214
- data/lib/kreuzberg/api_proxy.rb +142 -142
- data/lib/kreuzberg/cache_api.rb +81 -81
- data/lib/kreuzberg/cli.rb +55 -55
- data/lib/kreuzberg/cli_proxy.rb +127 -127
- data/lib/kreuzberg/config.rb +724 -724
- data/lib/kreuzberg/error_context.rb +80 -80
- data/lib/kreuzberg/errors.rb +118 -118
- data/lib/kreuzberg/extraction_api.rb +340 -340
- data/lib/kreuzberg/mcp_proxy.rb +186 -186
- data/lib/kreuzberg/ocr_backend_protocol.rb +113 -113
- data/lib/kreuzberg/post_processor_protocol.rb +86 -86
- data/lib/kreuzberg/result.rb +279 -279
- data/lib/kreuzberg/setup_lib_path.rb +80 -80
- data/lib/kreuzberg/validator_protocol.rb +89 -89
- data/lib/kreuzberg/version.rb +5 -5
- data/lib/kreuzberg.rb +109 -109
- data/lib/{pdfium.dll → libpdfium.dylib} +0 -0
- data/sig/kreuzberg/internal.rbs +184 -184
- data/sig/kreuzberg.rbs +546 -546
- data/spec/binding/cache_spec.rb +227 -227
- data/spec/binding/cli_proxy_spec.rb +85 -85
- data/spec/binding/cli_spec.rb +55 -55
- data/spec/binding/config_spec.rb +345 -345
- data/spec/binding/config_validation_spec.rb +283 -283
- data/spec/binding/error_handling_spec.rb +213 -213
- data/spec/binding/errors_spec.rb +66 -66
- data/spec/binding/plugins/ocr_backend_spec.rb +307 -307
- data/spec/binding/plugins/postprocessor_spec.rb +269 -269
- data/spec/binding/plugins/validator_spec.rb +274 -274
- data/spec/fixtures/config.toml +39 -39
- data/spec/fixtures/config.yaml +41 -41
- data/spec/fixtures/invalid_config.toml +4 -4
- data/spec/smoke/package_spec.rb +178 -178
- data/spec/spec_helper.rb +42 -42
- data/vendor/Cargo.toml +1 -1
- data/vendor/kreuzberg/Cargo.toml +5 -5
- data/vendor/kreuzberg/README.md +230 -230
- data/vendor/kreuzberg/benches/otel_overhead.rs +48 -48
- data/vendor/kreuzberg/build.rs +843 -843
- data/vendor/kreuzberg/src/api/error.rs +81 -81
- data/vendor/kreuzberg/src/api/handlers.rs +199 -199
- data/vendor/kreuzberg/src/api/mod.rs +79 -79
- data/vendor/kreuzberg/src/api/server.rs +353 -353
- data/vendor/kreuzberg/src/api/types.rs +170 -170
- data/vendor/kreuzberg/src/cache/mod.rs +1167 -1167
- data/vendor/kreuzberg/src/chunking/mod.rs +1877 -1877
- data/vendor/kreuzberg/src/chunking/processor.rs +220 -220
- data/vendor/kreuzberg/src/core/batch_mode.rs +95 -95
- data/vendor/kreuzberg/src/core/config.rs +1080 -1080
- data/vendor/kreuzberg/src/core/extractor.rs +1156 -1156
- data/vendor/kreuzberg/src/core/io.rs +329 -329
- data/vendor/kreuzberg/src/core/mime.rs +605 -605
- data/vendor/kreuzberg/src/core/mod.rs +47 -47
- data/vendor/kreuzberg/src/core/pipeline.rs +1184 -1184
- data/vendor/kreuzberg/src/embeddings.rs +500 -500
- data/vendor/kreuzberg/src/error.rs +431 -431
- data/vendor/kreuzberg/src/extraction/archive.rs +954 -954
- data/vendor/kreuzberg/src/extraction/docx.rs +398 -398
- data/vendor/kreuzberg/src/extraction/email.rs +854 -854
- data/vendor/kreuzberg/src/extraction/excel.rs +688 -688
- data/vendor/kreuzberg/src/extraction/html.rs +601 -601
- data/vendor/kreuzberg/src/extraction/image.rs +491 -491
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +574 -574
- data/vendor/kreuzberg/src/extraction/markdown.rs +213 -213
- data/vendor/kreuzberg/src/extraction/mod.rs +81 -81
- data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -398
- data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -247
- data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -240
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -130
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +284 -284
- data/vendor/kreuzberg/src/extraction/pptx.rs +3100 -3100
- data/vendor/kreuzberg/src/extraction/structured.rs +490 -490
- data/vendor/kreuzberg/src/extraction/table.rs +328 -328
- data/vendor/kreuzberg/src/extraction/text.rs +269 -269
- data/vendor/kreuzberg/src/extraction/xml.rs +333 -333
- data/vendor/kreuzberg/src/extractors/archive.rs +447 -447
- data/vendor/kreuzberg/src/extractors/bibtex.rs +470 -470
- data/vendor/kreuzberg/src/extractors/docbook.rs +504 -504
- data/vendor/kreuzberg/src/extractors/docx.rs +400 -400
- data/vendor/kreuzberg/src/extractors/email.rs +157 -157
- data/vendor/kreuzberg/src/extractors/epub.rs +708 -708
- data/vendor/kreuzberg/src/extractors/excel.rs +345 -345
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +492 -492
- data/vendor/kreuzberg/src/extractors/html.rs +407 -407
- data/vendor/kreuzberg/src/extractors/image.rs +219 -219
- data/vendor/kreuzberg/src/extractors/jats.rs +1054 -1054
- data/vendor/kreuzberg/src/extractors/jupyter.rs +368 -368
- data/vendor/kreuzberg/src/extractors/latex.rs +653 -653
- data/vendor/kreuzberg/src/extractors/markdown.rs +701 -701
- data/vendor/kreuzberg/src/extractors/mod.rs +429 -429
- data/vendor/kreuzberg/src/extractors/odt.rs +628 -628
- data/vendor/kreuzberg/src/extractors/opml.rs +635 -635
- data/vendor/kreuzberg/src/extractors/orgmode.rs +529 -529
- data/vendor/kreuzberg/src/extractors/pdf.rs +749 -749
- data/vendor/kreuzberg/src/extractors/pptx.rs +267 -267
- data/vendor/kreuzberg/src/extractors/rst.rs +577 -577
- data/vendor/kreuzberg/src/extractors/rtf.rs +809 -809
- data/vendor/kreuzberg/src/extractors/security.rs +484 -484
- data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -367
- data/vendor/kreuzberg/src/extractors/structured.rs +142 -142
- data/vendor/kreuzberg/src/extractors/text.rs +265 -265
- data/vendor/kreuzberg/src/extractors/typst.rs +651 -651
- data/vendor/kreuzberg/src/extractors/xml.rs +147 -147
- data/vendor/kreuzberg/src/image/dpi.rs +164 -164
- data/vendor/kreuzberg/src/image/mod.rs +6 -6
- data/vendor/kreuzberg/src/image/preprocessing.rs +417 -417
- data/vendor/kreuzberg/src/image/resize.rs +89 -89
- data/vendor/kreuzberg/src/keywords/config.rs +154 -154
- data/vendor/kreuzberg/src/keywords/mod.rs +237 -237
- data/vendor/kreuzberg/src/keywords/processor.rs +275 -275
- data/vendor/kreuzberg/src/keywords/rake.rs +293 -293
- data/vendor/kreuzberg/src/keywords/types.rs +68 -68
- data/vendor/kreuzberg/src/keywords/yake.rs +163 -163
- data/vendor/kreuzberg/src/language_detection/mod.rs +985 -985
- data/vendor/kreuzberg/src/language_detection/processor.rs +219 -219
- data/vendor/kreuzberg/src/lib.rs +113 -113
- data/vendor/kreuzberg/src/mcp/mod.rs +35 -35
- data/vendor/kreuzberg/src/mcp/server.rs +2076 -2076
- data/vendor/kreuzberg/src/ocr/cache.rs +469 -469
- data/vendor/kreuzberg/src/ocr/error.rs +37 -37
- data/vendor/kreuzberg/src/ocr/hocr.rs +216 -216
- data/vendor/kreuzberg/src/ocr/mod.rs +58 -58
- data/vendor/kreuzberg/src/ocr/processor.rs +863 -863
- data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -4
- data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -144
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +452 -452
- data/vendor/kreuzberg/src/ocr/types.rs +393 -393
- data/vendor/kreuzberg/src/ocr/utils.rs +47 -47
- data/vendor/kreuzberg/src/ocr/validation.rs +206 -206
- data/vendor/kreuzberg/src/panic_context.rs +154 -154
- data/vendor/kreuzberg/src/pdf/bindings.rs +44 -44
- data/vendor/kreuzberg/src/pdf/bundled.rs +346 -346
- data/vendor/kreuzberg/src/pdf/error.rs +130 -130
- data/vendor/kreuzberg/src/pdf/images.rs +139 -139
- data/vendor/kreuzberg/src/pdf/metadata.rs +489 -489
- data/vendor/kreuzberg/src/pdf/mod.rs +68 -68
- data/vendor/kreuzberg/src/pdf/rendering.rs +368 -368
- data/vendor/kreuzberg/src/pdf/table.rs +420 -420
- data/vendor/kreuzberg/src/pdf/text.rs +240 -240
- data/vendor/kreuzberg/src/plugins/extractor.rs +1044 -1044
- data/vendor/kreuzberg/src/plugins/mod.rs +212 -212
- data/vendor/kreuzberg/src/plugins/ocr.rs +639 -639
- data/vendor/kreuzberg/src/plugins/processor.rs +650 -650
- data/vendor/kreuzberg/src/plugins/registry.rs +1339 -1339
- data/vendor/kreuzberg/src/plugins/traits.rs +258 -258
- data/vendor/kreuzberg/src/plugins/validator.rs +967 -967
- data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -1470
- data/vendor/kreuzberg/src/text/mod.rs +25 -25
- data/vendor/kreuzberg/src/text/quality.rs +697 -697
- data/vendor/kreuzberg/src/text/quality_processor.rs +219 -219
- data/vendor/kreuzberg/src/text/string_utils.rs +217 -217
- data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -164
- data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -100
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -796
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -902
- data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -160
- data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -619
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -147
- data/vendor/kreuzberg/src/types.rs +1055 -1055
- data/vendor/kreuzberg/src/utils/mod.rs +17 -17
- data/vendor/kreuzberg/src/utils/quality.rs +959 -959
- data/vendor/kreuzberg/src/utils/string_utils.rs +381 -381
- data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -53
- data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -482
- data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -261
- data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -400
- data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -1205
- data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -280
- data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -425
- data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -172
- data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -622
- data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -1300
- data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -175
- data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -734
- data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -37
- data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -100
- data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -801
- data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -693
- data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -111
- data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -162
- data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -226
- data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -41
- data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -196
- data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -227
- data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -181
- data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -791
- data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -47
- data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -760
- data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -634
- data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -136
- data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -84
- data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -681
- data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -64
- data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -51
- data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -476
- data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -163
- data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -1
- data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -101
- data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -477
- data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -490
- data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -415
- data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -223
- data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -331
- data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -562
- data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -436
- data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -561
- data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -193
- data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -448
- data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -32
- data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -33
- data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -76
- data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -129
- data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -54
- data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -118
- data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -149
- data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -506
- data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -75
- data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -519
- data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -647
- data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -62
- data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -796
- data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -31
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -52
- data/vendor/kreuzberg/tests/api_tests.rs +966 -966
- data/vendor/kreuzberg/tests/archive_integration.rs +545 -545
- data/vendor/kreuzberg/tests/batch_orchestration.rs +556 -556
- data/vendor/kreuzberg/tests/batch_processing.rs +318 -318
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -421
- data/vendor/kreuzberg/tests/concurrency_stress.rs +533 -533
- data/vendor/kreuzberg/tests/config_features.rs +612 -612
- data/vendor/kreuzberg/tests/config_loading_tests.rs +416 -416
- data/vendor/kreuzberg/tests/core_integration.rs +510 -510
- data/vendor/kreuzberg/tests/csv_integration.rs +414 -414
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +500 -500
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -122
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -370
- data/vendor/kreuzberg/tests/email_integration.rs +327 -327
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -275
- data/vendor/kreuzberg/tests/error_handling.rs +402 -402
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -228
- data/vendor/kreuzberg/tests/format_integration.rs +164 -164
- data/vendor/kreuzberg/tests/helpers/mod.rs +142 -142
- data/vendor/kreuzberg/tests/html_table_test.rs +551 -551
- data/vendor/kreuzberg/tests/image_integration.rs +255 -255
- data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -139
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -639
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -704
- data/vendor/kreuzberg/tests/keywords_integration.rs +479 -479
- data/vendor/kreuzberg/tests/keywords_quality.rs +509 -509
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -496
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -490
- data/vendor/kreuzberg/tests/mime_detection.rs +429 -429
- data/vendor/kreuzberg/tests/ocr_configuration.rs +514 -514
- data/vendor/kreuzberg/tests/ocr_errors.rs +698 -698
- data/vendor/kreuzberg/tests/ocr_quality.rs +629 -629
- data/vendor/kreuzberg/tests/ocr_stress.rs +469 -469
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +674 -674
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -616
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -822
- data/vendor/kreuzberg/tests/pdf_integration.rs +45 -45
- data/vendor/kreuzberg/tests/pdfium_linking.rs +374 -374
- data/vendor/kreuzberg/tests/pipeline_integration.rs +1436 -1436
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +776 -776
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +560 -560
- data/vendor/kreuzberg/tests/plugin_system.rs +927 -927
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -783
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +587 -587
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +694 -694
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +775 -775
- data/vendor/kreuzberg/tests/security_validation.rs +416 -416
- data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -888
- data/vendor/kreuzberg/tests/test_fastembed.rs +631 -631
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1260 -1260
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +648 -648
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -87
- data/vendor/kreuzberg-ffi/Cargo.toml +1 -1
- data/vendor/kreuzberg-ffi/README.md +851 -851
- data/vendor/kreuzberg-ffi/build.rs +176 -176
- data/vendor/kreuzberg-ffi/cbindgen.toml +27 -27
- data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +12 -12
- data/vendor/kreuzberg-ffi/kreuzberg.h +1087 -1087
- data/vendor/kreuzberg-ffi/src/lib.rs +3616 -3616
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +247 -247
- data/vendor/kreuzberg-ffi/tests.disabled/README.md +48 -48
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +299 -299
- data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +346 -346
- data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +232 -232
- data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +470 -470
- data/vendor/kreuzberg-tesseract/.commitlintrc.json +13 -13
- data/vendor/kreuzberg-tesseract/.crate-ignore +2 -2
- data/vendor/kreuzberg-tesseract/Cargo.lock +2933 -2933
- data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
- data/vendor/kreuzberg-tesseract/LICENSE +22 -22
- data/vendor/kreuzberg-tesseract/README.md +399 -399
- data/vendor/kreuzberg-tesseract/build.rs +1354 -1354
- data/vendor/kreuzberg-tesseract/patches/README.md +71 -71
- data/vendor/kreuzberg-tesseract/patches/tesseract.diff +199 -199
- data/vendor/kreuzberg-tesseract/src/api.rs +1371 -1371
- data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +77 -77
- data/vendor/kreuzberg-tesseract/src/enums.rs +297 -297
- data/vendor/kreuzberg-tesseract/src/error.rs +81 -81
- data/vendor/kreuzberg-tesseract/src/lib.rs +145 -145
- data/vendor/kreuzberg-tesseract/src/monitor.rs +57 -57
- data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +197 -197
- data/vendor/kreuzberg-tesseract/src/page_iterator.rs +253 -253
- data/vendor/kreuzberg-tesseract/src/result_iterator.rs +286 -286
- data/vendor/kreuzberg-tesseract/src/result_renderer.rs +183 -183
- data/vendor/kreuzberg-tesseract/tests/integration_test.rs +211 -211
- data/vendor/rb-sys/.cargo_vcs_info.json +5 -5
- data/vendor/rb-sys/Cargo.lock +393 -393
- data/vendor/rb-sys/Cargo.toml +70 -70
- data/vendor/rb-sys/Cargo.toml.orig +57 -57
- data/vendor/rb-sys/LICENSE-APACHE +190 -190
- data/vendor/rb-sys/LICENSE-MIT +21 -21
- data/vendor/rb-sys/build/features.rs +111 -111
- data/vendor/rb-sys/build/main.rs +286 -286
- data/vendor/rb-sys/build/stable_api_config.rs +155 -155
- data/vendor/rb-sys/build/version.rs +50 -50
- data/vendor/rb-sys/readme.md +36 -36
- data/vendor/rb-sys/src/bindings.rs +21 -21
- data/vendor/rb-sys/src/hidden.rs +11 -11
- data/vendor/rb-sys/src/lib.rs +35 -35
- data/vendor/rb-sys/src/macros.rs +371 -371
- data/vendor/rb-sys/src/memory.rs +53 -53
- data/vendor/rb-sys/src/ruby_abi_version.rs +38 -38
- data/vendor/rb-sys/src/special_consts.rs +31 -31
- data/vendor/rb-sys/src/stable_api/compiled.c +179 -179
- data/vendor/rb-sys/src/stable_api/compiled.rs +257 -257
- data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +324 -324
- data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +332 -332
- data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +325 -325
- data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +323 -323
- data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +339 -339
- data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +339 -339
- data/vendor/rb-sys/src/stable_api.rs +260 -260
- data/vendor/rb-sys/src/symbol.rs +31 -31
- data/vendor/rb-sys/src/tracking_allocator.rs +330 -330
- data/vendor/rb-sys/src/utils.rs +89 -89
- data/vendor/rb-sys/src/value_type.rs +7 -7
- metadata +73 -4
- data/vendor/kreuzberg-ffi/kreuzberg-ffi-install.pc +0 -12
|
@@ -1,183 +1,183 @@
|
|
|
1
|
-
use crate::TesseractAPI;
|
|
2
|
-
use crate::error::{Result, TesseractError};
|
|
3
|
-
use std::ffi::{CStr, CString};
|
|
4
|
-
use std::os::raw::{c_char, c_int, c_void};
|
|
5
|
-
use std::sync::Arc;
|
|
6
|
-
use std::sync::Mutex;
|
|
7
|
-
|
|
8
|
-
pub struct TessResultRenderer {
|
|
9
|
-
handle: Arc<Mutex<*mut c_void>>,
|
|
10
|
-
}
|
|
11
|
-
|
|
12
|
-
unsafe impl Send for TessResultRenderer {}
|
|
13
|
-
unsafe impl Sync for TessResultRenderer {}
|
|
14
|
-
|
|
15
|
-
impl TessResultRenderer {
|
|
16
|
-
/// Creates a new instance of the TessResultRenderer.
|
|
17
|
-
///
|
|
18
|
-
/// # Arguments
|
|
19
|
-
///
|
|
20
|
-
/// * `outputbase` - Output base path.
|
|
21
|
-
///
|
|
22
|
-
/// # Returns
|
|
23
|
-
///
|
|
24
|
-
/// Returns the new instance of the TessResultRenderer.
|
|
25
|
-
pub fn new_text_renderer(outputbase: &str) -> Result<Self> {
|
|
26
|
-
let outputbase = CString::new(outputbase).unwrap();
|
|
27
|
-
let handle = unsafe { TessTextRendererCreate(outputbase.as_ptr()) };
|
|
28
|
-
if handle.is_null() {
|
|
29
|
-
Err(TesseractError::NullPointerError)
|
|
30
|
-
} else {
|
|
31
|
-
Ok(TessResultRenderer {
|
|
32
|
-
handle: Arc::new(Mutex::new(handle)),
|
|
33
|
-
})
|
|
34
|
-
}
|
|
35
|
-
}
|
|
36
|
-
|
|
37
|
-
/// Creates a new instance of the TessResultRenderer for HOCR.
|
|
38
|
-
///
|
|
39
|
-
/// # Arguments
|
|
40
|
-
///
|
|
41
|
-
/// * `outputbase` - Output base path.
|
|
42
|
-
///
|
|
43
|
-
/// # Returns
|
|
44
|
-
///
|
|
45
|
-
/// Returns the new instance of the TessResultRenderer.
|
|
46
|
-
pub fn new_hocr_renderer(outputbase: &str) -> Result<Self> {
|
|
47
|
-
let outputbase = CString::new(outputbase).unwrap();
|
|
48
|
-
let handle = unsafe { TessHOcrRendererCreate(outputbase.as_ptr()) };
|
|
49
|
-
if handle.is_null() {
|
|
50
|
-
Err(TesseractError::NullPointerError)
|
|
51
|
-
} else {
|
|
52
|
-
Ok(TessResultRenderer {
|
|
53
|
-
handle: Arc::new(Mutex::new(handle)),
|
|
54
|
-
})
|
|
55
|
-
}
|
|
56
|
-
}
|
|
57
|
-
|
|
58
|
-
/// Creates a new instance of the TessResultRenderer for PDF.
|
|
59
|
-
///
|
|
60
|
-
/// # Arguments
|
|
61
|
-
///
|
|
62
|
-
/// * `outputbase` - Output base path.
|
|
63
|
-
/// * `datadir` - Data directory path.
|
|
64
|
-
/// * `textonly` - Whether to include text only.
|
|
65
|
-
///
|
|
66
|
-
/// # Returns
|
|
67
|
-
///
|
|
68
|
-
/// Returns the new instance of the TessResultRenderer.
|
|
69
|
-
pub fn new_pdf_renderer(outputbase: &str, datadir: &str, textonly: bool) -> Result<Self> {
|
|
70
|
-
let outputbase = CString::new(outputbase).unwrap();
|
|
71
|
-
let datadir = CString::new(datadir).unwrap();
|
|
72
|
-
let handle = unsafe { TessPDFRendererCreate(outputbase.as_ptr(), datadir.as_ptr(), textonly as c_int) };
|
|
73
|
-
if handle.is_null() {
|
|
74
|
-
Err(TesseractError::NullPointerError)
|
|
75
|
-
} else {
|
|
76
|
-
Ok(TessResultRenderer {
|
|
77
|
-
handle: Arc::new(Mutex::new(handle)),
|
|
78
|
-
})
|
|
79
|
-
}
|
|
80
|
-
}
|
|
81
|
-
|
|
82
|
-
/// Begins a new document.
|
|
83
|
-
///
|
|
84
|
-
/// # Arguments
|
|
85
|
-
///
|
|
86
|
-
/// * `title` - Title of the document.
|
|
87
|
-
///
|
|
88
|
-
/// # Returns
|
|
89
|
-
///
|
|
90
|
-
/// Returns `true` if the document was created successfully, otherwise returns `false`.
|
|
91
|
-
pub fn begin_document(&self, title: &str) -> bool {
|
|
92
|
-
let title = CString::new(title).unwrap();
|
|
93
|
-
let handle = self.handle.lock().unwrap();
|
|
94
|
-
unsafe { TessResultRendererBeginDocument(*handle, title.as_ptr()) != 0 }
|
|
95
|
-
}
|
|
96
|
-
|
|
97
|
-
/// Adds an image to the document.
|
|
98
|
-
///
|
|
99
|
-
/// # Arguments
|
|
100
|
-
///
|
|
101
|
-
/// * `api` - The TesseractAPI instance.
|
|
102
|
-
///
|
|
103
|
-
/// # Returns
|
|
104
|
-
///
|
|
105
|
-
/// Returns `true` if the image was added successfully, otherwise returns `false`.
|
|
106
|
-
pub fn add_image(&self, api: &TesseractAPI) -> bool {
|
|
107
|
-
let api_handle = api.handle.lock().unwrap();
|
|
108
|
-
let handle = self.handle.lock().unwrap();
|
|
109
|
-
unsafe { TessResultRendererAddImage(*handle, *api_handle) != 0 }
|
|
110
|
-
}
|
|
111
|
-
|
|
112
|
-
/// Ends the document.
|
|
113
|
-
///
|
|
114
|
-
/// # Returns
|
|
115
|
-
///
|
|
116
|
-
/// Returns `true` if the document was ended successfully, otherwise returns `false`.
|
|
117
|
-
pub fn end_document(&self) -> bool {
|
|
118
|
-
let handle = self.handle.lock().unwrap();
|
|
119
|
-
unsafe { TessResultRendererEndDocument(*handle) != 0 }
|
|
120
|
-
}
|
|
121
|
-
|
|
122
|
-
/// Gets the extension of the document.
|
|
123
|
-
///
|
|
124
|
-
/// # Returns
|
|
125
|
-
///
|
|
126
|
-
/// Returns the extension as a `String` if successful, otherwise returns an error.
|
|
127
|
-
pub fn get_extension(&self) -> Result<String> {
|
|
128
|
-
let handle = self.handle.lock().unwrap();
|
|
129
|
-
let ext_ptr = unsafe { TessResultRendererExtention(*handle) };
|
|
130
|
-
if ext_ptr.is_null() {
|
|
131
|
-
Err(TesseractError::NullPointerError)
|
|
132
|
-
} else {
|
|
133
|
-
let c_str = unsafe { CStr::from_ptr(ext_ptr) };
|
|
134
|
-
Ok(c_str.to_str()?.to_owned())
|
|
135
|
-
}
|
|
136
|
-
}
|
|
137
|
-
|
|
138
|
-
/// Gets the title of the document.
|
|
139
|
-
///
|
|
140
|
-
/// # Returns
|
|
141
|
-
///
|
|
142
|
-
/// Returns the title as a `String` if successful, otherwise returns an error.
|
|
143
|
-
pub fn get_title(&self) -> Result<String> {
|
|
144
|
-
let handle = self.handle.lock().unwrap();
|
|
145
|
-
let title_ptr = unsafe { TessResultRendererTitle(*handle) };
|
|
146
|
-
if title_ptr.is_null() {
|
|
147
|
-
Err(TesseractError::NullPointerError)
|
|
148
|
-
} else {
|
|
149
|
-
let c_str = unsafe { CStr::from_ptr(title_ptr) };
|
|
150
|
-
Ok(c_str.to_str()?.to_owned())
|
|
151
|
-
}
|
|
152
|
-
}
|
|
153
|
-
|
|
154
|
-
/// Gets the number of images in the document.
|
|
155
|
-
///
|
|
156
|
-
/// # Returns
|
|
157
|
-
///
|
|
158
|
-
/// Returns the number of images as an `i32`.
|
|
159
|
-
pub fn get_image_num(&self) -> i32 {
|
|
160
|
-
let handle = self.handle.lock().unwrap();
|
|
161
|
-
unsafe { TessResultRendererImageNum(*handle) }
|
|
162
|
-
}
|
|
163
|
-
}
|
|
164
|
-
|
|
165
|
-
impl Drop for TessResultRenderer {
|
|
166
|
-
fn drop(&mut self) {
|
|
167
|
-
let handle = self.handle.lock().unwrap();
|
|
168
|
-
unsafe { TessDeleteResultRenderer(*handle) };
|
|
169
|
-
}
|
|
170
|
-
}
|
|
171
|
-
|
|
172
|
-
unsafe extern "C" {
|
|
173
|
-
pub fn TessTextRendererCreate(outputbase: *const c_char) -> *mut c_void;
|
|
174
|
-
pub fn TessHOcrRendererCreate(outputbase: *const c_char) -> *mut c_void;
|
|
175
|
-
pub fn TessPDFRendererCreate(outputbase: *const c_char, datadir: *const c_char, textonly: c_int) -> *mut c_void;
|
|
176
|
-
pub fn TessDeleteResultRenderer(renderer: *mut c_void);
|
|
177
|
-
pub fn TessResultRendererBeginDocument(renderer: *mut c_void, title: *const c_char) -> c_int;
|
|
178
|
-
pub fn TessResultRendererAddImage(renderer: *mut c_void, api: *mut c_void) -> c_int;
|
|
179
|
-
pub fn TessResultRendererEndDocument(renderer: *mut c_void) -> c_int;
|
|
180
|
-
pub fn TessResultRendererExtention(renderer: *mut c_void) -> *const c_char;
|
|
181
|
-
pub fn TessResultRendererTitle(renderer: *mut c_void) -> *const c_char;
|
|
182
|
-
pub fn TessResultRendererImageNum(renderer: *mut c_void) -> c_int;
|
|
183
|
-
}
|
|
1
|
+
use crate::TesseractAPI;
|
|
2
|
+
use crate::error::{Result, TesseractError};
|
|
3
|
+
use std::ffi::{CStr, CString};
|
|
4
|
+
use std::os::raw::{c_char, c_int, c_void};
|
|
5
|
+
use std::sync::Arc;
|
|
6
|
+
use std::sync::Mutex;
|
|
7
|
+
|
|
8
|
+
pub struct TessResultRenderer {
|
|
9
|
+
handle: Arc<Mutex<*mut c_void>>,
|
|
10
|
+
}
|
|
11
|
+
|
|
12
|
+
unsafe impl Send for TessResultRenderer {}
|
|
13
|
+
unsafe impl Sync for TessResultRenderer {}
|
|
14
|
+
|
|
15
|
+
impl TessResultRenderer {
|
|
16
|
+
/// Creates a new instance of the TessResultRenderer.
|
|
17
|
+
///
|
|
18
|
+
/// # Arguments
|
|
19
|
+
///
|
|
20
|
+
/// * `outputbase` - Output base path.
|
|
21
|
+
///
|
|
22
|
+
/// # Returns
|
|
23
|
+
///
|
|
24
|
+
/// Returns the new instance of the TessResultRenderer.
|
|
25
|
+
pub fn new_text_renderer(outputbase: &str) -> Result<Self> {
|
|
26
|
+
let outputbase = CString::new(outputbase).unwrap();
|
|
27
|
+
let handle = unsafe { TessTextRendererCreate(outputbase.as_ptr()) };
|
|
28
|
+
if handle.is_null() {
|
|
29
|
+
Err(TesseractError::NullPointerError)
|
|
30
|
+
} else {
|
|
31
|
+
Ok(TessResultRenderer {
|
|
32
|
+
handle: Arc::new(Mutex::new(handle)),
|
|
33
|
+
})
|
|
34
|
+
}
|
|
35
|
+
}
|
|
36
|
+
|
|
37
|
+
/// Creates a new instance of the TessResultRenderer for HOCR.
|
|
38
|
+
///
|
|
39
|
+
/// # Arguments
|
|
40
|
+
///
|
|
41
|
+
/// * `outputbase` - Output base path.
|
|
42
|
+
///
|
|
43
|
+
/// # Returns
|
|
44
|
+
///
|
|
45
|
+
/// Returns the new instance of the TessResultRenderer.
|
|
46
|
+
pub fn new_hocr_renderer(outputbase: &str) -> Result<Self> {
|
|
47
|
+
let outputbase = CString::new(outputbase).unwrap();
|
|
48
|
+
let handle = unsafe { TessHOcrRendererCreate(outputbase.as_ptr()) };
|
|
49
|
+
if handle.is_null() {
|
|
50
|
+
Err(TesseractError::NullPointerError)
|
|
51
|
+
} else {
|
|
52
|
+
Ok(TessResultRenderer {
|
|
53
|
+
handle: Arc::new(Mutex::new(handle)),
|
|
54
|
+
})
|
|
55
|
+
}
|
|
56
|
+
}
|
|
57
|
+
|
|
58
|
+
/// Creates a new instance of the TessResultRenderer for PDF.
|
|
59
|
+
///
|
|
60
|
+
/// # Arguments
|
|
61
|
+
///
|
|
62
|
+
/// * `outputbase` - Output base path.
|
|
63
|
+
/// * `datadir` - Data directory path.
|
|
64
|
+
/// * `textonly` - Whether to include text only.
|
|
65
|
+
///
|
|
66
|
+
/// # Returns
|
|
67
|
+
///
|
|
68
|
+
/// Returns the new instance of the TessResultRenderer.
|
|
69
|
+
pub fn new_pdf_renderer(outputbase: &str, datadir: &str, textonly: bool) -> Result<Self> {
|
|
70
|
+
let outputbase = CString::new(outputbase).unwrap();
|
|
71
|
+
let datadir = CString::new(datadir).unwrap();
|
|
72
|
+
let handle = unsafe { TessPDFRendererCreate(outputbase.as_ptr(), datadir.as_ptr(), textonly as c_int) };
|
|
73
|
+
if handle.is_null() {
|
|
74
|
+
Err(TesseractError::NullPointerError)
|
|
75
|
+
} else {
|
|
76
|
+
Ok(TessResultRenderer {
|
|
77
|
+
handle: Arc::new(Mutex::new(handle)),
|
|
78
|
+
})
|
|
79
|
+
}
|
|
80
|
+
}
|
|
81
|
+
|
|
82
|
+
/// Begins a new document.
|
|
83
|
+
///
|
|
84
|
+
/// # Arguments
|
|
85
|
+
///
|
|
86
|
+
/// * `title` - Title of the document.
|
|
87
|
+
///
|
|
88
|
+
/// # Returns
|
|
89
|
+
///
|
|
90
|
+
/// Returns `true` if the document was created successfully, otherwise returns `false`.
|
|
91
|
+
pub fn begin_document(&self, title: &str) -> bool {
|
|
92
|
+
let title = CString::new(title).unwrap();
|
|
93
|
+
let handle = self.handle.lock().unwrap();
|
|
94
|
+
unsafe { TessResultRendererBeginDocument(*handle, title.as_ptr()) != 0 }
|
|
95
|
+
}
|
|
96
|
+
|
|
97
|
+
/// Adds an image to the document.
|
|
98
|
+
///
|
|
99
|
+
/// # Arguments
|
|
100
|
+
///
|
|
101
|
+
/// * `api` - The TesseractAPI instance.
|
|
102
|
+
///
|
|
103
|
+
/// # Returns
|
|
104
|
+
///
|
|
105
|
+
/// Returns `true` if the image was added successfully, otherwise returns `false`.
|
|
106
|
+
pub fn add_image(&self, api: &TesseractAPI) -> bool {
|
|
107
|
+
let api_handle = api.handle.lock().unwrap();
|
|
108
|
+
let handle = self.handle.lock().unwrap();
|
|
109
|
+
unsafe { TessResultRendererAddImage(*handle, *api_handle) != 0 }
|
|
110
|
+
}
|
|
111
|
+
|
|
112
|
+
/// Ends the document.
|
|
113
|
+
///
|
|
114
|
+
/// # Returns
|
|
115
|
+
///
|
|
116
|
+
/// Returns `true` if the document was ended successfully, otherwise returns `false`.
|
|
117
|
+
pub fn end_document(&self) -> bool {
|
|
118
|
+
let handle = self.handle.lock().unwrap();
|
|
119
|
+
unsafe { TessResultRendererEndDocument(*handle) != 0 }
|
|
120
|
+
}
|
|
121
|
+
|
|
122
|
+
/// Gets the extension of the document.
|
|
123
|
+
///
|
|
124
|
+
/// # Returns
|
|
125
|
+
///
|
|
126
|
+
/// Returns the extension as a `String` if successful, otherwise returns an error.
|
|
127
|
+
pub fn get_extension(&self) -> Result<String> {
|
|
128
|
+
let handle = self.handle.lock().unwrap();
|
|
129
|
+
let ext_ptr = unsafe { TessResultRendererExtention(*handle) };
|
|
130
|
+
if ext_ptr.is_null() {
|
|
131
|
+
Err(TesseractError::NullPointerError)
|
|
132
|
+
} else {
|
|
133
|
+
let c_str = unsafe { CStr::from_ptr(ext_ptr) };
|
|
134
|
+
Ok(c_str.to_str()?.to_owned())
|
|
135
|
+
}
|
|
136
|
+
}
|
|
137
|
+
|
|
138
|
+
/// Gets the title of the document.
|
|
139
|
+
///
|
|
140
|
+
/// # Returns
|
|
141
|
+
///
|
|
142
|
+
/// Returns the title as a `String` if successful, otherwise returns an error.
|
|
143
|
+
pub fn get_title(&self) -> Result<String> {
|
|
144
|
+
let handle = self.handle.lock().unwrap();
|
|
145
|
+
let title_ptr = unsafe { TessResultRendererTitle(*handle) };
|
|
146
|
+
if title_ptr.is_null() {
|
|
147
|
+
Err(TesseractError::NullPointerError)
|
|
148
|
+
} else {
|
|
149
|
+
let c_str = unsafe { CStr::from_ptr(title_ptr) };
|
|
150
|
+
Ok(c_str.to_str()?.to_owned())
|
|
151
|
+
}
|
|
152
|
+
}
|
|
153
|
+
|
|
154
|
+
/// Gets the number of images in the document.
|
|
155
|
+
///
|
|
156
|
+
/// # Returns
|
|
157
|
+
///
|
|
158
|
+
/// Returns the number of images as an `i32`.
|
|
159
|
+
pub fn get_image_num(&self) -> i32 {
|
|
160
|
+
let handle = self.handle.lock().unwrap();
|
|
161
|
+
unsafe { TessResultRendererImageNum(*handle) }
|
|
162
|
+
}
|
|
163
|
+
}
|
|
164
|
+
|
|
165
|
+
impl Drop for TessResultRenderer {
|
|
166
|
+
fn drop(&mut self) {
|
|
167
|
+
let handle = self.handle.lock().unwrap();
|
|
168
|
+
unsafe { TessDeleteResultRenderer(*handle) };
|
|
169
|
+
}
|
|
170
|
+
}
|
|
171
|
+
|
|
172
|
+
unsafe extern "C" {
|
|
173
|
+
pub fn TessTextRendererCreate(outputbase: *const c_char) -> *mut c_void;
|
|
174
|
+
pub fn TessHOcrRendererCreate(outputbase: *const c_char) -> *mut c_void;
|
|
175
|
+
pub fn TessPDFRendererCreate(outputbase: *const c_char, datadir: *const c_char, textonly: c_int) -> *mut c_void;
|
|
176
|
+
pub fn TessDeleteResultRenderer(renderer: *mut c_void);
|
|
177
|
+
pub fn TessResultRendererBeginDocument(renderer: *mut c_void, title: *const c_char) -> c_int;
|
|
178
|
+
pub fn TessResultRendererAddImage(renderer: *mut c_void, api: *mut c_void) -> c_int;
|
|
179
|
+
pub fn TessResultRendererEndDocument(renderer: *mut c_void) -> c_int;
|
|
180
|
+
pub fn TessResultRendererExtention(renderer: *mut c_void) -> *const c_char;
|
|
181
|
+
pub fn TessResultRendererTitle(renderer: *mut c_void) -> *const c_char;
|
|
182
|
+
pub fn TessResultRendererImageNum(renderer: *mut c_void) -> c_int;
|
|
183
|
+
}
|