kreuzberg 4.0.0.pre.rc.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +14 -0
- data/.rspec +3 -0
- data/.rubocop.yaml +1 -0
- data/.rubocop.yml +538 -0
- data/Gemfile +8 -0
- data/Gemfile.lock +157 -0
- data/README.md +426 -0
- data/Rakefile +25 -0
- data/Steepfile +47 -0
- data/examples/async_patterns.rb +341 -0
- data/ext/kreuzberg_rb/extconf.rb +45 -0
- data/ext/kreuzberg_rb/native/Cargo.lock +6535 -0
- data/ext/kreuzberg_rb/native/Cargo.toml +44 -0
- data/ext/kreuzberg_rb/native/README.md +425 -0
- data/ext/kreuzberg_rb/native/build.rs +15 -0
- data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -0
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -0
- data/ext/kreuzberg_rb/native/include/strings.h +20 -0
- data/ext/kreuzberg_rb/native/include/unistd.h +47 -0
- data/ext/kreuzberg_rb/native/src/lib.rs +2998 -0
- data/extconf.rb +28 -0
- data/kreuzberg.gemspec +148 -0
- data/lib/kreuzberg/api_proxy.rb +142 -0
- data/lib/kreuzberg/cache_api.rb +46 -0
- data/lib/kreuzberg/cli.rb +55 -0
- data/lib/kreuzberg/cli_proxy.rb +127 -0
- data/lib/kreuzberg/config.rb +691 -0
- data/lib/kreuzberg/error_context.rb +32 -0
- data/lib/kreuzberg/errors.rb +118 -0
- data/lib/kreuzberg/extraction_api.rb +85 -0
- data/lib/kreuzberg/mcp_proxy.rb +186 -0
- data/lib/kreuzberg/ocr_backend_protocol.rb +113 -0
- data/lib/kreuzberg/post_processor_protocol.rb +86 -0
- data/lib/kreuzberg/result.rb +216 -0
- data/lib/kreuzberg/setup_lib_path.rb +80 -0
- data/lib/kreuzberg/validator_protocol.rb +89 -0
- data/lib/kreuzberg/version.rb +5 -0
- data/lib/kreuzberg.rb +103 -0
- data/sig/kreuzberg/internal.rbs +184 -0
- data/sig/kreuzberg.rbs +520 -0
- data/spec/binding/cache_spec.rb +227 -0
- data/spec/binding/cli_proxy_spec.rb +85 -0
- data/spec/binding/cli_spec.rb +55 -0
- data/spec/binding/config_spec.rb +345 -0
- data/spec/binding/config_validation_spec.rb +283 -0
- data/spec/binding/error_handling_spec.rb +213 -0
- data/spec/binding/errors_spec.rb +66 -0
- data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
- data/spec/binding/plugins/postprocessor_spec.rb +269 -0
- data/spec/binding/plugins/validator_spec.rb +274 -0
- data/spec/fixtures/config.toml +39 -0
- data/spec/fixtures/config.yaml +41 -0
- data/spec/fixtures/invalid_config.toml +4 -0
- data/spec/smoke/package_spec.rb +178 -0
- data/spec/spec_helper.rb +42 -0
- data/vendor/kreuzberg/Cargo.toml +204 -0
- data/vendor/kreuzberg/README.md +175 -0
- data/vendor/kreuzberg/benches/otel_overhead.rs +48 -0
- data/vendor/kreuzberg/build.rs +474 -0
- data/vendor/kreuzberg/src/api/error.rs +81 -0
- data/vendor/kreuzberg/src/api/handlers.rs +199 -0
- data/vendor/kreuzberg/src/api/mod.rs +79 -0
- data/vendor/kreuzberg/src/api/server.rs +353 -0
- data/vendor/kreuzberg/src/api/types.rs +170 -0
- data/vendor/kreuzberg/src/cache/mod.rs +1167 -0
- data/vendor/kreuzberg/src/chunking/mod.rs +677 -0
- data/vendor/kreuzberg/src/core/batch_mode.rs +95 -0
- data/vendor/kreuzberg/src/core/config.rs +1032 -0
- data/vendor/kreuzberg/src/core/extractor.rs +1024 -0
- data/vendor/kreuzberg/src/core/io.rs +329 -0
- data/vendor/kreuzberg/src/core/mime.rs +605 -0
- data/vendor/kreuzberg/src/core/mod.rs +45 -0
- data/vendor/kreuzberg/src/core/pipeline.rs +984 -0
- data/vendor/kreuzberg/src/embeddings.rs +432 -0
- data/vendor/kreuzberg/src/error.rs +431 -0
- data/vendor/kreuzberg/src/extraction/archive.rs +954 -0
- data/vendor/kreuzberg/src/extraction/docx.rs +40 -0
- data/vendor/kreuzberg/src/extraction/email.rs +854 -0
- data/vendor/kreuzberg/src/extraction/excel.rs +688 -0
- data/vendor/kreuzberg/src/extraction/html.rs +553 -0
- data/vendor/kreuzberg/src/extraction/image.rs +368 -0
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +563 -0
- data/vendor/kreuzberg/src/extraction/markdown.rs +213 -0
- data/vendor/kreuzberg/src/extraction/mod.rs +81 -0
- data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -0
- data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -0
- data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -0
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -0
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +287 -0
- data/vendor/kreuzberg/src/extraction/pptx.rs +3000 -0
- data/vendor/kreuzberg/src/extraction/structured.rs +490 -0
- data/vendor/kreuzberg/src/extraction/table.rs +328 -0
- data/vendor/kreuzberg/src/extraction/text.rs +269 -0
- data/vendor/kreuzberg/src/extraction/xml.rs +333 -0
- data/vendor/kreuzberg/src/extractors/archive.rs +446 -0
- data/vendor/kreuzberg/src/extractors/bibtex.rs +469 -0
- data/vendor/kreuzberg/src/extractors/docbook.rs +502 -0
- data/vendor/kreuzberg/src/extractors/docx.rs +367 -0
- data/vendor/kreuzberg/src/extractors/email.rs +143 -0
- data/vendor/kreuzberg/src/extractors/epub.rs +707 -0
- data/vendor/kreuzberg/src/extractors/excel.rs +343 -0
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +491 -0
- data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +738 -0
- data/vendor/kreuzberg/src/extractors/html.rs +393 -0
- data/vendor/kreuzberg/src/extractors/image.rs +198 -0
- data/vendor/kreuzberg/src/extractors/jats.rs +1051 -0
- data/vendor/kreuzberg/src/extractors/jupyter.rs +367 -0
- data/vendor/kreuzberg/src/extractors/latex.rs +652 -0
- data/vendor/kreuzberg/src/extractors/markdown.rs +700 -0
- data/vendor/kreuzberg/src/extractors/mod.rs +365 -0
- data/vendor/kreuzberg/src/extractors/odt.rs +628 -0
- data/vendor/kreuzberg/src/extractors/opml.rs +634 -0
- data/vendor/kreuzberg/src/extractors/orgmode.rs +528 -0
- data/vendor/kreuzberg/src/extractors/pdf.rs +493 -0
- data/vendor/kreuzberg/src/extractors/pptx.rs +248 -0
- data/vendor/kreuzberg/src/extractors/rst.rs +576 -0
- data/vendor/kreuzberg/src/extractors/rtf.rs +810 -0
- data/vendor/kreuzberg/src/extractors/security.rs +484 -0
- data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -0
- data/vendor/kreuzberg/src/extractors/structured.rs +140 -0
- data/vendor/kreuzberg/src/extractors/text.rs +260 -0
- data/vendor/kreuzberg/src/extractors/typst.rs +650 -0
- data/vendor/kreuzberg/src/extractors/xml.rs +135 -0
- data/vendor/kreuzberg/src/image/dpi.rs +164 -0
- data/vendor/kreuzberg/src/image/mod.rs +6 -0
- data/vendor/kreuzberg/src/image/preprocessing.rs +417 -0
- data/vendor/kreuzberg/src/image/resize.rs +89 -0
- data/vendor/kreuzberg/src/keywords/config.rs +154 -0
- data/vendor/kreuzberg/src/keywords/mod.rs +237 -0
- data/vendor/kreuzberg/src/keywords/processor.rs +267 -0
- data/vendor/kreuzberg/src/keywords/rake.rs +293 -0
- data/vendor/kreuzberg/src/keywords/types.rs +68 -0
- data/vendor/kreuzberg/src/keywords/yake.rs +163 -0
- data/vendor/kreuzberg/src/language_detection/mod.rs +942 -0
- data/vendor/kreuzberg/src/lib.rs +105 -0
- data/vendor/kreuzberg/src/mcp/mod.rs +32 -0
- data/vendor/kreuzberg/src/mcp/server.rs +1968 -0
- data/vendor/kreuzberg/src/ocr/cache.rs +469 -0
- data/vendor/kreuzberg/src/ocr/error.rs +37 -0
- data/vendor/kreuzberg/src/ocr/hocr.rs +216 -0
- data/vendor/kreuzberg/src/ocr/mod.rs +58 -0
- data/vendor/kreuzberg/src/ocr/processor.rs +863 -0
- data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -0
- data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -0
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +450 -0
- data/vendor/kreuzberg/src/ocr/types.rs +393 -0
- data/vendor/kreuzberg/src/ocr/utils.rs +47 -0
- data/vendor/kreuzberg/src/ocr/validation.rs +206 -0
- data/vendor/kreuzberg/src/panic_context.rs +154 -0
- data/vendor/kreuzberg/src/pdf/error.rs +122 -0
- data/vendor/kreuzberg/src/pdf/images.rs +139 -0
- data/vendor/kreuzberg/src/pdf/metadata.rs +346 -0
- data/vendor/kreuzberg/src/pdf/mod.rs +50 -0
- data/vendor/kreuzberg/src/pdf/rendering.rs +369 -0
- data/vendor/kreuzberg/src/pdf/table.rs +393 -0
- data/vendor/kreuzberg/src/pdf/text.rs +158 -0
- data/vendor/kreuzberg/src/plugins/extractor.rs +1013 -0
- data/vendor/kreuzberg/src/plugins/mod.rs +209 -0
- data/vendor/kreuzberg/src/plugins/ocr.rs +620 -0
- data/vendor/kreuzberg/src/plugins/processor.rs +642 -0
- data/vendor/kreuzberg/src/plugins/registry.rs +1337 -0
- data/vendor/kreuzberg/src/plugins/traits.rs +258 -0
- data/vendor/kreuzberg/src/plugins/validator.rs +956 -0
- data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -0
- data/vendor/kreuzberg/src/text/mod.rs +19 -0
- data/vendor/kreuzberg/src/text/quality.rs +697 -0
- data/vendor/kreuzberg/src/text/string_utils.rs +217 -0
- data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -0
- data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -0
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -0
- data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -0
- data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -0
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -0
- data/vendor/kreuzberg/src/types.rs +903 -0
- data/vendor/kreuzberg/src/utils/mod.rs +17 -0
- data/vendor/kreuzberg/src/utils/quality.rs +959 -0
- data/vendor/kreuzberg/src/utils/string_utils.rs +381 -0
- data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -0
- data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -0
- data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -0
- data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -0
- data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -0
- data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -0
- data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -0
- data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -0
- data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -0
- data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -0
- data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -0
- data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -0
- data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -0
- data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -0
- data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -0
- data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -0
- data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -0
- data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -0
- data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -0
- data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -0
- data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -0
- data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -0
- data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -0
- data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -0
- data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -0
- data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -0
- data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -0
- data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -0
- data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -0
- data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -0
- data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -0
- data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -0
- data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -0
- data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -0
- data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -0
- data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -0
- data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -0
- data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -0
- data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -0
- data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -0
- data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -0
- data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -0
- data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -0
- data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -0
- data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -0
- data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -0
- data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -0
- data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -0
- data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -0
- data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -0
- data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -0
- data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -0
- data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -0
- data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -0
- data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -0
- data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -0
- data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -0
- data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -0
- data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -0
- data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -0
- data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -0
- data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -0
- data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -0
- data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -0
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -0
- data/vendor/kreuzberg/tests/api_tests.rs +966 -0
- data/vendor/kreuzberg/tests/archive_integration.rs +543 -0
- data/vendor/kreuzberg/tests/batch_orchestration.rs +556 -0
- data/vendor/kreuzberg/tests/batch_processing.rs +316 -0
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +525 -0
- data/vendor/kreuzberg/tests/config_features.rs +598 -0
- data/vendor/kreuzberg/tests/config_loading_tests.rs +415 -0
- data/vendor/kreuzberg/tests/core_integration.rs +510 -0
- data/vendor/kreuzberg/tests/csv_integration.rs +414 -0
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +498 -0
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -0
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -0
- data/vendor/kreuzberg/tests/email_integration.rs +325 -0
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -0
- data/vendor/kreuzberg/tests/error_handling.rs +393 -0
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -0
- data/vendor/kreuzberg/tests/format_integration.rs +159 -0
- data/vendor/kreuzberg/tests/helpers/mod.rs +142 -0
- data/vendor/kreuzberg/tests/html_table_test.rs +551 -0
- data/vendor/kreuzberg/tests/image_integration.rs +253 -0
- data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -0
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -0
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -0
- data/vendor/kreuzberg/tests/keywords_integration.rs +479 -0
- data/vendor/kreuzberg/tests/keywords_quality.rs +509 -0
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -0
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -0
- data/vendor/kreuzberg/tests/mime_detection.rs +428 -0
- data/vendor/kreuzberg/tests/ocr_configuration.rs +510 -0
- data/vendor/kreuzberg/tests/ocr_errors.rs +676 -0
- data/vendor/kreuzberg/tests/ocr_quality.rs +627 -0
- data/vendor/kreuzberg/tests/ocr_stress.rs +469 -0
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +695 -0
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -0
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -0
- data/vendor/kreuzberg/tests/pdf_integration.rs +43 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +1411 -0
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +771 -0
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +560 -0
- data/vendor/kreuzberg/tests/plugin_system.rs +921 -0
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -0
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +586 -0
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +692 -0
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +776 -0
- data/vendor/kreuzberg/tests/security_validation.rs +415 -0
- data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -0
- data/vendor/kreuzberg/tests/test_fastembed.rs +609 -0
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1259 -0
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +647 -0
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -0
- data/vendor/rb-sys/.cargo-ok +1 -0
- data/vendor/rb-sys/.cargo_vcs_info.json +6 -0
- data/vendor/rb-sys/Cargo.lock +393 -0
- data/vendor/rb-sys/Cargo.toml +70 -0
- data/vendor/rb-sys/Cargo.toml.orig +57 -0
- data/vendor/rb-sys/LICENSE-APACHE +190 -0
- data/vendor/rb-sys/LICENSE-MIT +21 -0
- data/vendor/rb-sys/bin/release.sh +21 -0
- data/vendor/rb-sys/build/features.rs +108 -0
- data/vendor/rb-sys/build/main.rs +246 -0
- data/vendor/rb-sys/build/stable_api_config.rs +153 -0
- data/vendor/rb-sys/build/version.rs +48 -0
- data/vendor/rb-sys/readme.md +36 -0
- data/vendor/rb-sys/src/bindings.rs +21 -0
- data/vendor/rb-sys/src/hidden.rs +11 -0
- data/vendor/rb-sys/src/lib.rs +34 -0
- data/vendor/rb-sys/src/macros.rs +371 -0
- data/vendor/rb-sys/src/memory.rs +53 -0
- data/vendor/rb-sys/src/ruby_abi_version.rs +38 -0
- data/vendor/rb-sys/src/special_consts.rs +31 -0
- data/vendor/rb-sys/src/stable_api/compiled.c +179 -0
- data/vendor/rb-sys/src/stable_api/compiled.rs +257 -0
- data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +316 -0
- data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +316 -0
- data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +324 -0
- data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +317 -0
- data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +315 -0
- data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +326 -0
- data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +327 -0
- data/vendor/rb-sys/src/stable_api.rs +261 -0
- data/vendor/rb-sys/src/symbol.rs +31 -0
- data/vendor/rb-sys/src/tracking_allocator.rs +332 -0
- data/vendor/rb-sys/src/utils.rs +89 -0
- data/vendor/rb-sys/src/value_type.rs +7 -0
- metadata +536 -0
|
@@ -0,0 +1,253 @@
|
|
|
1
|
+
//! Image and OCR integration tests using real image files.
|
|
2
|
+
//!
|
|
3
|
+
//! This module tests image extraction and OCR processing end-to-end with real
|
|
4
|
+
//! image files from the test_documents/ directory. Tests verify that both
|
|
5
|
+
//! image metadata extraction and OCR text extraction work correctly.
|
|
6
|
+
//!
|
|
7
|
+
//! Test philosophy:
|
|
8
|
+
//! - Use real images from test_documents/
|
|
9
|
+
//! - Assert on behavior, not implementation
|
|
10
|
+
//! - Test different image formats (PNG, JPG, BMP, etc.)
|
|
11
|
+
//! - Test OCR with various languages and layouts
|
|
12
|
+
//! - Verify graceful handling of images without text
|
|
13
|
+
|
|
14
|
+
mod helpers;
|
|
15
|
+
|
|
16
|
+
use helpers::*;
|
|
17
|
+
use kreuzberg::core::config::ExtractionConfig;
|
|
18
|
+
use kreuzberg::extract_file_sync;
|
|
19
|
+
|
|
20
|
+
/// Extracts `images/example.jpg` with the default `ExtractionConfig`.
///
/// Checks that the reported MIME type is `image/jpeg` and that both `chunks`
/// and `detected_languages` are `None`. Returns early when `skip_if_missing`
/// reports `true` for the fixture.
#[test]
fn test_jpg_image_metadata() {
    const FIXTURE: &str = "images/example.jpg";
    if skip_if_missing(FIXTURE) {
        return;
    }

    let default_config = ExtractionConfig::default();
    let image_path = get_test_file_path(FIXTURE);
    let extraction =
        extract_file_sync(&image_path, None, &default_config).expect("Should extract JPG image successfully");

    assert_mime_type(&extraction, "image/jpeg");

    // Neither optional output is requested by the default config.
    assert!(extraction.chunks.is_none(), "Chunks should be None without chunking config");
    assert!(extraction.detected_languages.is_none(), "Language detection not enabled");
}
|
|
35
|
+
|
|
36
|
+
/// Extracts `images/test_hello_world.png` with the default `ExtractionConfig`.
///
/// Checks that the reported MIME type is `image/png` and that both `chunks`
/// and `detected_languages` are `None`. Returns early when `skip_if_missing`
/// reports `true` for the fixture.
#[test]
fn test_png_image_metadata() {
    const FIXTURE: &str = "images/test_hello_world.png";
    if skip_if_missing(FIXTURE) {
        return;
    }

    let default_config = ExtractionConfig::default();
    let image_path = get_test_file_path(FIXTURE);
    let extraction =
        extract_file_sync(&image_path, None, &default_config).expect("Should extract PNG image successfully");

    assert_mime_type(&extraction, "image/png");

    // Neither optional output is requested by the default config.
    assert!(extraction.chunks.is_none(), "Chunks should be None without chunking config");
    assert!(extraction.detected_languages.is_none(), "Language detection not enabled");
}
|
|
51
|
+
|
|
52
|
+
/// Extracts `images/bmp_24.bmp` with the default `ExtractionConfig`.
///
/// Checks that the reported MIME type is `image/bmp` and that both `chunks`
/// and `detected_languages` are `None`. Returns early when `skip_if_missing`
/// reports `true` for the fixture.
#[test]
fn test_bmp_image_format() {
    const FIXTURE: &str = "images/bmp_24.bmp";
    if skip_if_missing(FIXTURE) {
        return;
    }

    let default_config = ExtractionConfig::default();
    let image_path = get_test_file_path(FIXTURE);
    let extraction =
        extract_file_sync(&image_path, None, &default_config).expect("Should extract BMP image successfully");

    assert_mime_type(&extraction, "image/bmp");

    // Neither optional output is requested by the default config.
    assert!(extraction.chunks.is_none(), "Chunks should be None without chunking config");
    assert!(extraction.detected_languages.is_none(), "Language detection not enabled");
}
|
|
67
|
+
|
|
68
|
+
/// Runs extraction on `images/test_hello_world.png` using the config returned
/// by `test_config_with_ocr()`.
///
/// Always checks the MIME type (`image/png`) and that `chunks` and
/// `detected_languages` are `None`. The minimum-length check (5) is applied
/// only when the trimmed content is non-empty, so an empty result does not
/// fail the test.
#[test]
fn test_ocr_simple_text() {
    const FIXTURE: &str = "images/test_hello_world.png";
    if skip_if_missing(FIXTURE) {
        return;
    }

    let image_path = get_test_file_path(FIXTURE);
    let ocr_config = test_config_with_ocr();
    let extraction =
        extract_file_sync(&image_path, None, &ocr_config).expect("Should extract text from image with OCR");

    assert_mime_type(&extraction, "image/png");
    assert!(extraction.chunks.is_none(), "Chunks should be None without chunking config");
    assert!(extraction.detected_languages.is_none(), "Language detection not enabled");

    // Only enforce a length floor when some text was actually produced.
    let has_text = !extraction.content.trim().is_empty();
    if has_text {
        assert_min_content_length(&extraction, 5);
    }
}
|
|
88
|
+
|
|
89
|
+
/// Runs extraction on `images/ocr_image.jpg` using the config returned by
/// `test_config_with_ocr()`.
///
/// Always checks the MIME type (`image/jpeg`) and that `chunks` and
/// `detected_languages` are `None`. The minimum-length check (10) is applied
/// only when the trimmed content is non-empty.
#[test]
fn test_ocr_document_image() {
    const FIXTURE: &str = "images/ocr_image.jpg";
    if skip_if_missing(FIXTURE) {
        return;
    }

    let image_path = get_test_file_path(FIXTURE);
    let ocr_config = test_config_with_ocr();
    let extraction =
        extract_file_sync(&image_path, None, &ocr_config).expect("Should extract text from document image");

    assert_mime_type(&extraction, "image/jpeg");
    assert!(extraction.chunks.is_none(), "Chunks should be None without chunking config");
    assert!(extraction.detected_languages.is_none(), "Language detection not enabled");

    // Only enforce a length floor when some text was actually produced.
    let has_text = !extraction.content.trim().is_empty();
    if has_text {
        assert_min_content_length(&extraction, 10);
    }
}
|
|
109
|
+
|
|
110
|
+
/// Runs extraction on `images/layout_parser_ocr.jpg` using the config returned
/// by `test_config_with_ocr()`.
///
/// Always checks the MIME type (`image/jpeg`) and that `chunks` and
/// `detected_languages` are `None`. The minimum-length check (20) is applied
/// only when the trimmed content is non-empty.
#[test]
fn test_ocr_layout_parser() {
    const FIXTURE: &str = "images/layout_parser_ocr.jpg";
    if skip_if_missing(FIXTURE) {
        return;
    }

    let image_path = get_test_file_path(FIXTURE);
    let ocr_config = test_config_with_ocr();
    let extraction =
        extract_file_sync(&image_path, None, &ocr_config).expect("Should extract text from layout parser image");

    assert_mime_type(&extraction, "image/jpeg");
    assert!(extraction.chunks.is_none(), "Chunks should be None without chunking config");
    assert!(extraction.detected_languages.is_none(), "Language detection not enabled");

    // Only enforce a length floor when some text was actually produced.
    let has_text = !extraction.content.trim().is_empty();
    if has_text {
        assert_min_content_length(&extraction, 20);
    }
}
|
|
130
|
+
|
|
131
|
+
/// Runs extraction on `images/invoice_image.png` using the config returned by
/// `test_config_with_ocr()`.
///
/// Always checks the MIME type (`image/png`) and that `chunks` and
/// `detected_languages` are `None`. The minimum-length check (10) is applied
/// only when the trimmed content is non-empty.
#[test]
fn test_ocr_invoice_image() {
    const FIXTURE: &str = "images/invoice_image.png";
    if skip_if_missing(FIXTURE) {
        return;
    }

    let image_path = get_test_file_path(FIXTURE);
    let ocr_config = test_config_with_ocr();
    let extraction =
        extract_file_sync(&image_path, None, &ocr_config).expect("Should extract text from invoice image");

    assert_mime_type(&extraction, "image/png");
    assert!(extraction.chunks.is_none(), "Chunks should be None without chunking config");
    assert!(extraction.detected_languages.is_none(), "Language detection not enabled");

    // Only enforce a length floor when some text was actually produced.
    let has_text = !extraction.content.trim().is_empty();
    if has_text {
        assert_min_content_length(&extraction, 10);
    }
}
|
|
151
|
+
|
|
152
|
+
/// Runs extraction on `tables/simple_table.png` using the config returned by
/// `test_config_with_ocr()`.
///
/// Checks the MIME type (`image/png`) and that `chunks` and
/// `detected_languages` are `None`; the extracted content itself is not
/// inspected.
#[test]
fn test_table_image_simple() {
    const FIXTURE: &str = "tables/simple_table.png";
    if skip_if_missing(FIXTURE) {
        return;
    }

    let image_path = get_test_file_path(FIXTURE);
    let ocr_config = test_config_with_ocr();
    let extraction =
        extract_file_sync(&image_path, None, &ocr_config).expect("Should extract table image successfully");

    assert_mime_type(&extraction, "image/png");
    assert!(extraction.chunks.is_none(), "Chunks should be None without chunking config");
    assert!(extraction.detected_languages.is_none(), "Language detection not enabled");
}
|
|
168
|
+
|
|
169
|
+
/// Runs extraction on `tables/complex_document.png` using the config returned
/// by `test_config_with_ocr()`.
///
/// Checks the MIME type (`image/png`) and that `chunks` and
/// `detected_languages` are `None`; the extracted content itself is not
/// inspected.
#[test]
fn test_table_image_complex() {
    const FIXTURE: &str = "tables/complex_document.png";
    if skip_if_missing(FIXTURE) {
        return;
    }

    let image_path = get_test_file_path(FIXTURE);
    let ocr_config = test_config_with_ocr();
    let extraction = extract_file_sync(&image_path, None, &ocr_config)
        .expect("Should extract complex document image successfully");

    assert_mime_type(&extraction, "image/png");
    assert!(extraction.chunks.is_none(), "Chunks should be None without chunking config");
    assert!(extraction.detected_languages.is_none(), "Language detection not enabled");
}
|
|
186
|
+
|
|
187
|
+
/// Runs extraction on `images/english_and_korean.png` using the config
/// returned by `test_config_with_ocr()`.
///
/// Checks the MIME type (`image/png`) and that `chunks` and
/// `detected_languages` are `None`; the recognised text is not inspected.
#[test]
fn test_ocr_multilang_english_korean() {
    const FIXTURE: &str = "images/english_and_korean.png";
    if skip_if_missing(FIXTURE) {
        return;
    }

    let image_path = get_test_file_path(FIXTURE);
    let ocr_config = test_config_with_ocr();
    let extraction =
        extract_file_sync(&image_path, None, &ocr_config).expect("Should extract mixed language image");

    assert_mime_type(&extraction, "image/png");
    assert!(extraction.chunks.is_none(), "Chunks should be None without chunking config");
    assert!(extraction.detected_languages.is_none(), "Language detection not enabled");
}
|
|
203
|
+
|
|
204
|
+
/// Runs extraction on `images/chi_sim_image.jpeg` using the config returned by
/// `test_config_with_ocr()`.
///
/// Checks the MIME type (`image/jpeg`) and that `chunks` and
/// `detected_languages` are `None`; the recognised text is not inspected.
#[test]
fn test_ocr_chinese_simplified() {
    const FIXTURE: &str = "images/chi_sim_image.jpeg";
    if skip_if_missing(FIXTURE) {
        return;
    }

    let image_path = get_test_file_path(FIXTURE);
    let ocr_config = test_config_with_ocr();
    let extraction = extract_file_sync(&image_path, None, &ocr_config).expect("Should process Chinese image");

    assert_mime_type(&extraction, "image/jpeg");
    assert!(extraction.chunks.is_none(), "Chunks should be None without chunking config");
    assert!(extraction.detected_languages.is_none(), "Language detection not enabled");
}
|
|
220
|
+
|
|
221
|
+
/// Runs extraction on `images/jpn_vert.jpeg` using the config returned by
/// `test_config_with_ocr()`.
///
/// Checks the MIME type (`image/jpeg`) and that `chunks` and
/// `detected_languages` are `None`; the recognised text is not inspected.
#[test]
fn test_ocr_japanese_vertical() {
    const FIXTURE: &str = "images/jpn_vert.jpeg";
    if skip_if_missing(FIXTURE) {
        return;
    }

    let image_path = get_test_file_path(FIXTURE);
    let ocr_config = test_config_with_ocr();
    let extraction =
        extract_file_sync(&image_path, None, &ocr_config).expect("Should process Japanese vertical text image");

    assert_mime_type(&extraction, "image/jpeg");
    assert!(extraction.chunks.is_none(), "Chunks should be None without chunking config");
    assert!(extraction.detected_languages.is_none(), "Language detection not enabled");
}
|
|
237
|
+
|
|
238
|
+
/// Runs extraction on `images/flower_no_text.jpg` using the config returned by
/// `test_config_with_ocr()`.
///
/// The extraction call must succeed. The test then checks the MIME type
/// (`image/jpeg`) and that `chunks` and `detected_languages` are `None`.
/// No assertion is made about the extracted content.
#[test]
fn test_image_no_text() {
    const FIXTURE: &str = "images/flower_no_text.jpg";
    if skip_if_missing(FIXTURE) {
        return;
    }

    let image_path = get_test_file_path(FIXTURE);
    let ocr_config = test_config_with_ocr();
    let extraction =
        extract_file_sync(&image_path, None, &ocr_config).expect("Should process image without text");

    assert_mime_type(&extraction, "image/jpeg");
    assert!(extraction.chunks.is_none(), "Chunks should be None without chunking config");
    assert!(extraction.detected_languages.is_none(), "Language detection not enabled");
}
|
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
#![cfg(feature = "otel")]
|
|
2
|
+
|
|
3
|
+
use std::sync::{Arc, Mutex};
|
|
4
|
+
use tracing::Subscriber;
|
|
5
|
+
use tracing::span::{Attributes, Id};
|
|
6
|
+
use tracing_subscriber::Layer;
|
|
7
|
+
use tracing_subscriber::layer::{Context, SubscriberExt};
|
|
8
|
+
use tracing_subscriber::registry::LookupSpan;
|
|
9
|
+
|
|
10
|
+
/// Simple span name collector for testing.
|
|
11
|
+
///
|
|
12
|
+
/// This layer collects span names as they are created to verify
|
|
13
|
+
/// that instrumentation is working correctly.
|
|
14
|
+
struct SpanCollector {
|
|
15
|
+
spans: Arc<Mutex<Vec<String>>>,
|
|
16
|
+
}
|
|
17
|
+
|
|
18
|
+
impl<S: Subscriber + for<'a> LookupSpan<'a>> Layer<S> for SpanCollector {
|
|
19
|
+
fn on_new_span(&self, attrs: &Attributes<'_>, _id: &Id, _ctx: Context<'_, S>) {
|
|
20
|
+
self.spans.lock().unwrap().push(attrs.metadata().name().to_string());
|
|
21
|
+
}
|
|
22
|
+
}
|
|
23
|
+
|
|
24
|
+
#[tokio::test]
|
|
25
|
+
async fn test_cache_instrumentation() {
|
|
26
|
+
use kreuzberg::cache::GenericCache;
|
|
27
|
+
use tempfile::tempdir;
|
|
28
|
+
|
|
29
|
+
let spans = Arc::new(Mutex::new(Vec::new()));
|
|
30
|
+
let collector = SpanCollector { spans: spans.clone() };
|
|
31
|
+
|
|
32
|
+
let subscriber = tracing_subscriber::registry().with(collector);
|
|
33
|
+
let _guard = tracing::subscriber::set_default(subscriber);
|
|
34
|
+
|
|
35
|
+
let temp_dir = tempdir().unwrap();
|
|
36
|
+
let cache = GenericCache::new(
|
|
37
|
+
"test".to_string(),
|
|
38
|
+
Some(temp_dir.path().to_str().unwrap().to_string()),
|
|
39
|
+
30.0,
|
|
40
|
+
500.0,
|
|
41
|
+
1000.0,
|
|
42
|
+
)
|
|
43
|
+
.unwrap();
|
|
44
|
+
|
|
45
|
+
cache.set("test_key", b"test data".to_vec(), None).unwrap();
|
|
46
|
+
|
|
47
|
+
let _ = cache.get("test_key", None).unwrap();
|
|
48
|
+
|
|
49
|
+
let span_names = spans.lock().unwrap();
|
|
50
|
+
assert!(span_names.contains(&"set".to_string()), "Expected 'set' span");
|
|
51
|
+
assert!(span_names.contains(&"get".to_string()), "Expected 'get' span");
|
|
52
|
+
}
|
|
53
|
+
|
|
54
|
+
/// Checks that `OcrProcessor::process_image` produces a `process_image` span.
///
/// The input is a 1x1 white PNG encoded in memory. The OCR result itself is
/// discarded; only the recorded span name is asserted. Compiled only when the
/// `ocr` feature is enabled.
#[cfg(feature = "ocr")]
#[tokio::test]
async fn test_ocr_instrumentation() {
    use kreuzberg::ocr::processor::OcrProcessor;
    use kreuzberg::ocr::types::TesseractConfig;
    use tempfile::tempdir;

    let recorded = Arc::new(Mutex::new(Vec::new()));
    let layer = SpanCollector {
        spans: Arc::clone(&recorded),
    };
    // The guard stays bound until the end of the test so the subscriber
    // remains the default while OCR runs.
    let _guard = tracing::subscriber::set_default(tracing_subscriber::registry().with(layer));

    let scratch_dir = tempdir().unwrap();
    let cache_root = scratch_dir.path().to_path_buf();
    let processor = OcrProcessor::new(Some(cache_root)).unwrap();

    // Encode a single white pixel as PNG into an in-memory buffer.
    let mut png_bytes = Vec::new();
    let white_pixel = image::ImageBuffer::from_fn(1, 1, |_, _| image::Rgb([255u8, 255u8, 255u8]));
    let mut png_sink = std::io::Cursor::new(&mut png_bytes);
    white_pixel.write_to(&mut png_sink, image::ImageFormat::Png).unwrap();

    let ocr_config = TesseractConfig {
        output_format: "text".to_string(),
        use_cache: false,
        ..Default::default()
    };

    let _ = processor.process_image(&png_bytes, &ocr_config);

    let names = recorded.lock().unwrap();
    assert!(
        names.iter().any(|name| name == "process_image"),
        "Expected 'process_image' span"
    );
}
|
|
89
|
+
|
|
90
|
+
#[tokio::test]
|
|
91
|
+
async fn test_registry_instrumentation() {
|
|
92
|
+
use kreuzberg::plugins::registry::DocumentExtractorRegistry;
|
|
93
|
+
|
|
94
|
+
let spans = Arc::new(Mutex::new(Vec::new()));
|
|
95
|
+
let collector = SpanCollector { spans: spans.clone() };
|
|
96
|
+
|
|
97
|
+
let subscriber = tracing_subscriber::registry().with(collector);
|
|
98
|
+
let _guard = tracing::subscriber::set_default(subscriber);
|
|
99
|
+
|
|
100
|
+
let registry = DocumentExtractorRegistry::new();
|
|
101
|
+
|
|
102
|
+
let _ = registry.get("application/pdf");
|
|
103
|
+
|
|
104
|
+
let span_names = spans.lock().unwrap();
|
|
105
|
+
assert!(
|
|
106
|
+
span_names.contains(&"get".to_string()),
|
|
107
|
+
"Expected 'get' span from registry"
|
|
108
|
+
);
|
|
109
|
+
}
|
|
110
|
+
|
|
111
|
+
/// Checks that calling `extract_bytes` on a small `text/plain` payload
/// produces an `extract_bytes` span.
///
/// The extraction result is discarded; only the recorded span name is
/// asserted. Compiled only when both the `pdf` and `office` features are
/// enabled.
#[cfg(all(feature = "pdf", feature = "office"))]
#[tokio::test]
async fn test_span_hierarchy() {
    use kreuzberg::core::config::ExtractionConfig;
    use kreuzberg::core::extractor::extract_bytes;

    let recorded = Arc::new(Mutex::new(Vec::new()));
    let layer = SpanCollector {
        spans: Arc::clone(&recorded),
    };
    // The guard stays bound until the end of the test so the subscriber
    // remains the default during extraction.
    let _guard = tracing::subscriber::set_default(tracing_subscriber::registry().with(layer));

    let payload = b"Hello, World!";
    let default_config = ExtractionConfig::default();
    let _ = extract_bytes(payload, "text/plain", &default_config).await;

    // The lock is taken only after the await has completed.
    let names = recorded.lock().unwrap();
    assert!(
        names.iter().any(|name| name == "extract_bytes"),
        "Expected 'extract_bytes' span"
    );
}
|
|
134
|
+
|
|
135
|
+
/// Smoke test: a `SpanCollector` can be built from a fresh, empty shared
/// vector. Nothing is asserted beyond successful construction.
#[test]
fn test_span_collector_creation() {
    let _collector = SpanCollector {
        spans: Arc::new(Mutex::new(Vec::new())),
    };
}
|