kreuzberg 4.0.0.pre.rc.11 → 4.0.0.pre.rc.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +14 -14
- data/.rspec +3 -3
- data/.rubocop.yaml +1 -1
- data/.rubocop.yml +538 -538
- data/Gemfile +8 -8
- data/Gemfile.lock +2 -105
- data/README.md +454 -454
- data/Rakefile +25 -25
- data/Steepfile +47 -47
- data/examples/async_patterns.rb +341 -341
- data/ext/kreuzberg_rb/extconf.rb +45 -45
- data/ext/kreuzberg_rb/native/.cargo/config.toml +2 -2
- data/ext/kreuzberg_rb/native/Cargo.lock +6941 -6941
- data/ext/kreuzberg_rb/native/Cargo.toml +54 -54
- data/ext/kreuzberg_rb/native/README.md +425 -425
- data/ext/kreuzberg_rb/native/build.rs +15 -15
- data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -11
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -14
- data/ext/kreuzberg_rb/native/include/strings.h +20 -20
- data/ext/kreuzberg_rb/native/include/unistd.h +47 -47
- data/ext/kreuzberg_rb/native/src/lib.rs +3158 -3158
- data/extconf.rb +28 -28
- data/kreuzberg.gemspec +214 -214
- data/lib/kreuzberg/api_proxy.rb +142 -142
- data/lib/kreuzberg/cache_api.rb +81 -81
- data/lib/kreuzberg/cli.rb +55 -55
- data/lib/kreuzberg/cli_proxy.rb +127 -127
- data/lib/kreuzberg/config.rb +724 -724
- data/lib/kreuzberg/error_context.rb +80 -80
- data/lib/kreuzberg/errors.rb +118 -118
- data/lib/kreuzberg/extraction_api.rb +340 -340
- data/lib/kreuzberg/mcp_proxy.rb +186 -186
- data/lib/kreuzberg/ocr_backend_protocol.rb +113 -113
- data/lib/kreuzberg/post_processor_protocol.rb +86 -86
- data/lib/kreuzberg/result.rb +279 -279
- data/lib/kreuzberg/setup_lib_path.rb +80 -80
- data/lib/kreuzberg/validator_protocol.rb +89 -89
- data/lib/kreuzberg/version.rb +5 -5
- data/lib/kreuzberg.rb +109 -109
- data/lib/{libpdfium.dylib → pdfium.dll} +0 -0
- data/sig/kreuzberg/internal.rbs +184 -184
- data/sig/kreuzberg.rbs +546 -546
- data/spec/binding/cache_spec.rb +227 -227
- data/spec/binding/cli_proxy_spec.rb +85 -85
- data/spec/binding/cli_spec.rb +55 -55
- data/spec/binding/config_spec.rb +345 -345
- data/spec/binding/config_validation_spec.rb +283 -283
- data/spec/binding/error_handling_spec.rb +213 -213
- data/spec/binding/errors_spec.rb +66 -66
- data/spec/binding/plugins/ocr_backend_spec.rb +307 -307
- data/spec/binding/plugins/postprocessor_spec.rb +269 -269
- data/spec/binding/plugins/validator_spec.rb +274 -274
- data/spec/fixtures/config.toml +39 -39
- data/spec/fixtures/config.yaml +41 -41
- data/spec/fixtures/invalid_config.toml +4 -4
- data/spec/smoke/package_spec.rb +178 -178
- data/spec/spec_helper.rb +42 -42
- data/vendor/Cargo.toml +2 -1
- data/vendor/kreuzberg/Cargo.toml +2 -2
- data/vendor/kreuzberg/README.md +230 -230
- data/vendor/kreuzberg/benches/otel_overhead.rs +48 -48
- data/vendor/kreuzberg/build.rs +843 -843
- data/vendor/kreuzberg/src/api/error.rs +81 -81
- data/vendor/kreuzberg/src/api/handlers.rs +199 -199
- data/vendor/kreuzberg/src/api/mod.rs +79 -79
- data/vendor/kreuzberg/src/api/server.rs +353 -353
- data/vendor/kreuzberg/src/api/types.rs +170 -170
- data/vendor/kreuzberg/src/cache/mod.rs +1167 -1167
- data/vendor/kreuzberg/src/chunking/mod.rs +1877 -1877
- data/vendor/kreuzberg/src/chunking/processor.rs +220 -220
- data/vendor/kreuzberg/src/core/batch_mode.rs +95 -95
- data/vendor/kreuzberg/src/core/config.rs +1080 -1080
- data/vendor/kreuzberg/src/core/extractor.rs +1156 -1156
- data/vendor/kreuzberg/src/core/io.rs +329 -329
- data/vendor/kreuzberg/src/core/mime.rs +605 -605
- data/vendor/kreuzberg/src/core/mod.rs +47 -47
- data/vendor/kreuzberg/src/core/pipeline.rs +1184 -1184
- data/vendor/kreuzberg/src/embeddings.rs +500 -500
- data/vendor/kreuzberg/src/error.rs +431 -431
- data/vendor/kreuzberg/src/extraction/archive.rs +954 -954
- data/vendor/kreuzberg/src/extraction/docx.rs +398 -398
- data/vendor/kreuzberg/src/extraction/email.rs +854 -854
- data/vendor/kreuzberg/src/extraction/excel.rs +688 -688
- data/vendor/kreuzberg/src/extraction/html.rs +601 -601
- data/vendor/kreuzberg/src/extraction/image.rs +491 -491
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +574 -562
- data/vendor/kreuzberg/src/extraction/markdown.rs +213 -213
- data/vendor/kreuzberg/src/extraction/mod.rs +81 -81
- data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -398
- data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -247
- data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -240
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -130
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +284 -284
- data/vendor/kreuzberg/src/extraction/pptx.rs +3100 -3100
- data/vendor/kreuzberg/src/extraction/structured.rs +490 -490
- data/vendor/kreuzberg/src/extraction/table.rs +328 -328
- data/vendor/kreuzberg/src/extraction/text.rs +269 -269
- data/vendor/kreuzberg/src/extraction/xml.rs +333 -333
- data/vendor/kreuzberg/src/extractors/archive.rs +447 -447
- data/vendor/kreuzberg/src/extractors/bibtex.rs +470 -470
- data/vendor/kreuzberg/src/extractors/docbook.rs +504 -504
- data/vendor/kreuzberg/src/extractors/docx.rs +400 -400
- data/vendor/kreuzberg/src/extractors/email.rs +157 -157
- data/vendor/kreuzberg/src/extractors/epub.rs +708 -708
- data/vendor/kreuzberg/src/extractors/excel.rs +345 -345
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +492 -492
- data/vendor/kreuzberg/src/extractors/html.rs +407 -407
- data/vendor/kreuzberg/src/extractors/image.rs +219 -219
- data/vendor/kreuzberg/src/extractors/jats.rs +1054 -1054
- data/vendor/kreuzberg/src/extractors/jupyter.rs +368 -368
- data/vendor/kreuzberg/src/extractors/latex.rs +653 -653
- data/vendor/kreuzberg/src/extractors/markdown.rs +701 -701
- data/vendor/kreuzberg/src/extractors/mod.rs +429 -429
- data/vendor/kreuzberg/src/extractors/odt.rs +628 -628
- data/vendor/kreuzberg/src/extractors/opml.rs +635 -635
- data/vendor/kreuzberg/src/extractors/orgmode.rs +529 -529
- data/vendor/kreuzberg/src/extractors/pdf.rs +749 -722
- data/vendor/kreuzberg/src/extractors/pptx.rs +267 -267
- data/vendor/kreuzberg/src/extractors/rst.rs +577 -577
- data/vendor/kreuzberg/src/extractors/rtf.rs +809 -809
- data/vendor/kreuzberg/src/extractors/security.rs +484 -484
- data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -367
- data/vendor/kreuzberg/src/extractors/structured.rs +142 -142
- data/vendor/kreuzberg/src/extractors/text.rs +265 -265
- data/vendor/kreuzberg/src/extractors/typst.rs +651 -651
- data/vendor/kreuzberg/src/extractors/xml.rs +147 -147
- data/vendor/kreuzberg/src/image/dpi.rs +164 -164
- data/vendor/kreuzberg/src/image/mod.rs +6 -6
- data/vendor/kreuzberg/src/image/preprocessing.rs +417 -417
- data/vendor/kreuzberg/src/image/resize.rs +89 -89
- data/vendor/kreuzberg/src/keywords/config.rs +154 -154
- data/vendor/kreuzberg/src/keywords/mod.rs +237 -237
- data/vendor/kreuzberg/src/keywords/processor.rs +275 -275
- data/vendor/kreuzberg/src/keywords/rake.rs +293 -293
- data/vendor/kreuzberg/src/keywords/types.rs +68 -68
- data/vendor/kreuzberg/src/keywords/yake.rs +163 -163
- data/vendor/kreuzberg/src/language_detection/mod.rs +985 -985
- data/vendor/kreuzberg/src/language_detection/processor.rs +219 -219
- data/vendor/kreuzberg/src/lib.rs +113 -113
- data/vendor/kreuzberg/src/mcp/mod.rs +35 -35
- data/vendor/kreuzberg/src/mcp/server.rs +2076 -2076
- data/vendor/kreuzberg/src/ocr/cache.rs +469 -469
- data/vendor/kreuzberg/src/ocr/error.rs +37 -37
- data/vendor/kreuzberg/src/ocr/hocr.rs +216 -216
- data/vendor/kreuzberg/src/ocr/mod.rs +58 -58
- data/vendor/kreuzberg/src/ocr/processor.rs +863 -863
- data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -4
- data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -144
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +452 -452
- data/vendor/kreuzberg/src/ocr/types.rs +393 -393
- data/vendor/kreuzberg/src/ocr/utils.rs +47 -47
- data/vendor/kreuzberg/src/ocr/validation.rs +206 -206
- data/vendor/kreuzberg/src/panic_context.rs +154 -154
- data/vendor/kreuzberg/src/pdf/bindings.rs +44 -44
- data/vendor/kreuzberg/src/pdf/bundled.rs +346 -346
- data/vendor/kreuzberg/src/pdf/error.rs +130 -130
- data/vendor/kreuzberg/src/pdf/images.rs +139 -139
- data/vendor/kreuzberg/src/pdf/metadata.rs +489 -489
- data/vendor/kreuzberg/src/pdf/mod.rs +68 -68
- data/vendor/kreuzberg/src/pdf/rendering.rs +368 -368
- data/vendor/kreuzberg/src/pdf/table.rs +420 -420
- data/vendor/kreuzberg/src/pdf/text.rs +240 -240
- data/vendor/kreuzberg/src/plugins/extractor.rs +1044 -1044
- data/vendor/kreuzberg/src/plugins/mod.rs +212 -212
- data/vendor/kreuzberg/src/plugins/ocr.rs +639 -639
- data/vendor/kreuzberg/src/plugins/processor.rs +650 -650
- data/vendor/kreuzberg/src/plugins/registry.rs +1339 -1339
- data/vendor/kreuzberg/src/plugins/traits.rs +258 -258
- data/vendor/kreuzberg/src/plugins/validator.rs +967 -967
- data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -1470
- data/vendor/kreuzberg/src/text/mod.rs +25 -25
- data/vendor/kreuzberg/src/text/quality.rs +697 -697
- data/vendor/kreuzberg/src/text/quality_processor.rs +219 -219
- data/vendor/kreuzberg/src/text/string_utils.rs +217 -217
- data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -164
- data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -100
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -796
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -902
- data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -160
- data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -619
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -147
- data/vendor/kreuzberg/src/types.rs +1055 -1055
- data/vendor/kreuzberg/src/utils/mod.rs +17 -17
- data/vendor/kreuzberg/src/utils/quality.rs +959 -959
- data/vendor/kreuzberg/src/utils/string_utils.rs +381 -381
- data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -53
- data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -482
- data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -261
- data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -400
- data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -1205
- data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -280
- data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -425
- data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -172
- data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -622
- data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -1300
- data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -175
- data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -734
- data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -37
- data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -100
- data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -801
- data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -693
- data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -111
- data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -162
- data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -226
- data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -41
- data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -196
- data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -227
- data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -181
- data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -791
- data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -47
- data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -760
- data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -634
- data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -136
- data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -84
- data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -681
- data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -64
- data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -51
- data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -476
- data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -163
- data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -1
- data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -101
- data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -477
- data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -490
- data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -415
- data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -223
- data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -331
- data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -562
- data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -436
- data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -561
- data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -193
- data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -448
- data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -32
- data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -33
- data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -76
- data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -129
- data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -54
- data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -118
- data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -149
- data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -506
- data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -75
- data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -519
- data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -647
- data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -62
- data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -796
- data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -31
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -52
- data/vendor/kreuzberg/tests/api_tests.rs +966 -966
- data/vendor/kreuzberg/tests/archive_integration.rs +545 -545
- data/vendor/kreuzberg/tests/batch_orchestration.rs +556 -556
- data/vendor/kreuzberg/tests/batch_processing.rs +318 -318
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -421
- data/vendor/kreuzberg/tests/concurrency_stress.rs +533 -533
- data/vendor/kreuzberg/tests/config_features.rs +612 -612
- data/vendor/kreuzberg/tests/config_loading_tests.rs +416 -416
- data/vendor/kreuzberg/tests/core_integration.rs +510 -510
- data/vendor/kreuzberg/tests/csv_integration.rs +414 -414
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +500 -500
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -122
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -370
- data/vendor/kreuzberg/tests/email_integration.rs +327 -327
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -275
- data/vendor/kreuzberg/tests/error_handling.rs +402 -402
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -228
- data/vendor/kreuzberg/tests/format_integration.rs +164 -164
- data/vendor/kreuzberg/tests/helpers/mod.rs +142 -142
- data/vendor/kreuzberg/tests/html_table_test.rs +551 -551
- data/vendor/kreuzberg/tests/image_integration.rs +255 -255
- data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -139
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -639
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -704
- data/vendor/kreuzberg/tests/keywords_integration.rs +479 -479
- data/vendor/kreuzberg/tests/keywords_quality.rs +509 -509
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -496
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -490
- data/vendor/kreuzberg/tests/mime_detection.rs +429 -429
- data/vendor/kreuzberg/tests/ocr_configuration.rs +514 -514
- data/vendor/kreuzberg/tests/ocr_errors.rs +698 -698
- data/vendor/kreuzberg/tests/ocr_quality.rs +629 -629
- data/vendor/kreuzberg/tests/ocr_stress.rs +469 -469
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +674 -674
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -616
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -822
- data/vendor/kreuzberg/tests/pdf_integration.rs +45 -45
- data/vendor/kreuzberg/tests/pdfium_linking.rs +374 -374
- data/vendor/kreuzberg/tests/pipeline_integration.rs +1436 -1436
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +776 -776
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +560 -560
- data/vendor/kreuzberg/tests/plugin_system.rs +927 -927
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -783
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +587 -587
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +694 -694
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +775 -775
- data/vendor/kreuzberg/tests/security_validation.rs +416 -416
- data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -888
- data/vendor/kreuzberg/tests/test_fastembed.rs +631 -631
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1260 -1260
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +648 -648
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -87
- data/vendor/kreuzberg-ffi/Cargo.toml +3 -3
- data/vendor/kreuzberg-ffi/README.md +851 -851
- data/vendor/kreuzberg-ffi/build.rs +176 -176
- data/vendor/kreuzberg-ffi/cbindgen.toml +27 -27
- data/vendor/kreuzberg-ffi/kreuzberg-ffi-install.pc +12 -12
- data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +12 -12
- data/vendor/kreuzberg-ffi/kreuzberg.h +1087 -1087
- data/vendor/kreuzberg-ffi/src/lib.rs +3616 -3616
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +247 -247
- data/vendor/kreuzberg-ffi/tests.disabled/README.md +48 -48
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +299 -299
- data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +346 -346
- data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +232 -232
- data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +470 -470
- data/vendor/kreuzberg-tesseract/.commitlintrc.json +13 -13
- data/vendor/kreuzberg-tesseract/.crate-ignore +2 -2
- data/vendor/kreuzberg-tesseract/Cargo.lock +2933 -2933
- data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
- data/vendor/kreuzberg-tesseract/LICENSE +22 -22
- data/vendor/kreuzberg-tesseract/README.md +399 -399
- data/vendor/kreuzberg-tesseract/build.rs +1354 -1354
- data/vendor/kreuzberg-tesseract/patches/README.md +71 -71
- data/vendor/kreuzberg-tesseract/patches/tesseract.diff +199 -199
- data/vendor/kreuzberg-tesseract/src/api.rs +1371 -1371
- data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +77 -77
- data/vendor/kreuzberg-tesseract/src/enums.rs +297 -297
- data/vendor/kreuzberg-tesseract/src/error.rs +81 -81
- data/vendor/kreuzberg-tesseract/src/lib.rs +145 -145
- data/vendor/kreuzberg-tesseract/src/monitor.rs +57 -57
- data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +197 -197
- data/vendor/kreuzberg-tesseract/src/page_iterator.rs +253 -253
- data/vendor/kreuzberg-tesseract/src/result_iterator.rs +286 -286
- data/vendor/kreuzberg-tesseract/src/result_renderer.rs +183 -183
- data/vendor/kreuzberg-tesseract/tests/integration_test.rs +211 -211
- data/vendor/rb-sys/.cargo_vcs_info.json +5 -5
- data/vendor/rb-sys/Cargo.lock +393 -393
- data/vendor/rb-sys/Cargo.toml +70 -70
- data/vendor/rb-sys/Cargo.toml.orig +57 -57
- data/vendor/rb-sys/LICENSE-APACHE +190 -190
- data/vendor/rb-sys/LICENSE-MIT +21 -21
- data/vendor/rb-sys/build/features.rs +111 -111
- data/vendor/rb-sys/build/main.rs +286 -286
- data/vendor/rb-sys/build/stable_api_config.rs +155 -155
- data/vendor/rb-sys/build/version.rs +50 -50
- data/vendor/rb-sys/readme.md +36 -36
- data/vendor/rb-sys/src/bindings.rs +21 -21
- data/vendor/rb-sys/src/hidden.rs +11 -11
- data/vendor/rb-sys/src/lib.rs +35 -35
- data/vendor/rb-sys/src/macros.rs +371 -371
- data/vendor/rb-sys/src/memory.rs +53 -53
- data/vendor/rb-sys/src/ruby_abi_version.rs +38 -38
- data/vendor/rb-sys/src/special_consts.rs +31 -31
- data/vendor/rb-sys/src/stable_api/compiled.c +179 -179
- data/vendor/rb-sys/src/stable_api/compiled.rs +257 -257
- data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +324 -324
- data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +332 -332
- data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +325 -325
- data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +323 -323
- data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +339 -339
- data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +339 -339
- data/vendor/rb-sys/src/stable_api.rs +260 -260
- data/vendor/rb-sys/src/symbol.rs +31 -31
- data/vendor/rb-sys/src/tracking_allocator.rs +330 -330
- data/vendor/rb-sys/src/utils.rs +89 -89
- data/vendor/rb-sys/src/value_type.rs +7 -7
- metadata +7 -80
|
@@ -1,491 +1,491 @@
|
|
|
1
|
-
//! Image extraction functionality.
|
|
2
|
-
//!
|
|
3
|
-
//! This module provides functions for extracting metadata and EXIF data from images,
|
|
4
|
-
//! including support for multi-frame TIFF files.
|
|
5
|
-
|
|
6
|
-
use crate::error::{KreuzbergError, Result};
|
|
7
|
-
use exif::{In, Reader, Tag};
|
|
8
|
-
use image::ImageReader;
|
|
9
|
-
use std::collections::HashMap;
|
|
10
|
-
use std::io::Cursor;
|
|
11
|
-
|
|
12
|
-
/// Basic properties of an image together with any EXIF tags read from it.
#[derive(Clone, Debug)]
pub struct ImageMetadata {
    /// Horizontal size of the image, in pixels.
    pub width: u32,
    /// Vertical size of the image, in pixels.
    pub height: u32,
    /// Upper-case name of the image format (for example `"PNG"` or `"JPEG"`).
    pub format: String,
    /// EXIF tag names mapped to their display values; empty when no EXIF data was found.
    pub exif_data: HashMap<String, String>,
}
|
|
24
|
-
|
|
25
|
-
/// Extract metadata from image bytes.
|
|
26
|
-
///
|
|
27
|
-
/// Extracts dimensions, format, and EXIF data from the image.
|
|
28
|
-
pub fn extract_image_metadata(bytes: &[u8]) -> Result<ImageMetadata> {
|
|
29
|
-
let reader = ImageReader::new(Cursor::new(bytes))
|
|
30
|
-
.with_guessed_format()
|
|
31
|
-
.map_err(|e| KreuzbergError::parsing(format!("Failed to read image format: {}", e)))?;
|
|
32
|
-
|
|
33
|
-
let format = reader
|
|
34
|
-
.format()
|
|
35
|
-
.ok_or_else(|| KreuzbergError::parsing("Could not determine image format".to_string()))?;
|
|
36
|
-
|
|
37
|
-
let image = reader
|
|
38
|
-
.decode()
|
|
39
|
-
.map_err(|e| KreuzbergError::parsing(format!("Failed to decode image: {}", e)))?;
|
|
40
|
-
|
|
41
|
-
let width = image.width();
|
|
42
|
-
let height = image.height();
|
|
43
|
-
let format_str = format!("{:?}", format).to_uppercase();
|
|
44
|
-
|
|
45
|
-
let exif_data = extract_exif_data(bytes);
|
|
46
|
-
|
|
47
|
-
Ok(ImageMetadata {
|
|
48
|
-
width,
|
|
49
|
-
height,
|
|
50
|
-
format: format_str,
|
|
51
|
-
exif_data,
|
|
52
|
-
})
|
|
53
|
-
}
|
|
54
|
-
|
|
55
|
-
/// Extract EXIF data from image bytes.
|
|
56
|
-
///
|
|
57
|
-
/// Returns a HashMap of EXIF tags and their values.
|
|
58
|
-
/// If EXIF data is not available or cannot be parsed, returns an empty HashMap.
|
|
59
|
-
fn extract_exif_data(bytes: &[u8]) -> HashMap<String, String> {
|
|
60
|
-
let mut exif_map = HashMap::new();
|
|
61
|
-
|
|
62
|
-
let exif_reader = match Reader::new().read_from_container(&mut Cursor::new(bytes)) {
|
|
63
|
-
Ok(reader) => reader,
|
|
64
|
-
Err(_) => return exif_map,
|
|
65
|
-
};
|
|
66
|
-
|
|
67
|
-
let common_tags = [
|
|
68
|
-
(Tag::Make, "Make"),
|
|
69
|
-
(Tag::Model, "Model"),
|
|
70
|
-
(Tag::DateTime, "DateTime"),
|
|
71
|
-
(Tag::DateTimeOriginal, "DateTimeOriginal"),
|
|
72
|
-
(Tag::DateTimeDigitized, "DateTimeDigitized"),
|
|
73
|
-
(Tag::Software, "Software"),
|
|
74
|
-
(Tag::Orientation, "Orientation"),
|
|
75
|
-
(Tag::XResolution, "XResolution"),
|
|
76
|
-
(Tag::YResolution, "YResolution"),
|
|
77
|
-
(Tag::ResolutionUnit, "ResolutionUnit"),
|
|
78
|
-
(Tag::ExposureTime, "ExposureTime"),
|
|
79
|
-
(Tag::FNumber, "FNumber"),
|
|
80
|
-
(Tag::PhotographicSensitivity, "ISO"),
|
|
81
|
-
(Tag::FocalLength, "FocalLength"),
|
|
82
|
-
(Tag::Flash, "Flash"),
|
|
83
|
-
(Tag::WhiteBalance, "WhiteBalance"),
|
|
84
|
-
(Tag::GPSLatitude, "GPSLatitude"),
|
|
85
|
-
(Tag::GPSLongitude, "GPSLongitude"),
|
|
86
|
-
(Tag::GPSAltitude, "GPSAltitude"),
|
|
87
|
-
];
|
|
88
|
-
|
|
89
|
-
for (tag, field_name) in common_tags {
|
|
90
|
-
if let Some(field) = exif_reader.get_field(tag, In::PRIMARY) {
|
|
91
|
-
exif_map.insert(field_name.to_string(), field.display_value().to_string());
|
|
92
|
-
}
|
|
93
|
-
}
|
|
94
|
-
|
|
95
|
-
exif_map
|
|
96
|
-
}
|
|
97
|
-
|
|
98
|
-
/// Result of OCR extraction from an image with optional page tracking.
|
|
99
|
-
#[derive(Debug, Clone)]
|
|
100
|
-
pub struct ImageOcrResult {
|
|
101
|
-
/// Extracted text content
|
|
102
|
-
pub content: String,
|
|
103
|
-
/// Character byte boundaries per frame (for multi-frame TIFFs)
|
|
104
|
-
pub boundaries: Option<Vec<crate::types::PageBoundary>>,
|
|
105
|
-
/// Per-frame content information
|
|
106
|
-
pub page_contents: Option<Vec<crate::types::PageContent>>,
|
|
107
|
-
}
|
|
108
|
-
|
|
109
|
-
/// Count the frames (pages) contained in a TIFF file.
///
/// The count starts at one for the image the decoder opens on, and grows by
/// one for every successful advance to a following image. Any error while
/// advancing ends the count.
///
/// # Arguments
/// * `bytes` - Raw TIFF file bytes
///
/// # Errors
/// Returns a parsing error when a TIFF decoder cannot be created for `bytes`.
#[cfg(feature = "ocr")]
fn detect_tiff_frame_count(bytes: &[u8]) -> Result<usize> {
    use tiff::decoder::Decoder;

    let mut tiff_decoder = Decoder::new(Cursor::new(bytes))
        .map_err(|e| KreuzbergError::parsing(format!("TIFF decode: {}", e)))?;

    // Yields one item per successful `next_image()` and stops at the first error.
    let following_frames = std::iter::from_fn(|| tiff_decoder.next_image().ok()).count();

    Ok(1 + following_frames)
}
|
|
131
|
-
|
|
132
|
-
/// Attach per-frame page information to already-extracted OCR text.
///
/// This function does not run OCR itself; it receives the finished text in
/// `ocr_result` and decides whether it can be split into pages:
/// - If `page_config` is `None`, or `mime_type` does not contain "tiff"
///   (case-insensitive), the text is returned as-is without page data.
/// - Otherwise the TIFF frames are counted. A single-frame TIFF is also
///   returned without page data.
/// - For a multi-frame TIFF the text is divided into `frame_count` roughly
///   equal byte ranges, one per frame. This is an even split by length, not a
///   mapping of text to the frame it was actually recognised on. Split points
///   are moved forward to the next UTF-8 character boundary so slicing never
///   panics, and the last frame always receives the remainder.
///
/// # Arguments
/// * `bytes` - Image file bytes
/// * `mime_type` - MIME type (e.g., "image/tiff")
/// * `ocr_result` - Text produced by the OCR backend
/// * `page_config` - Optional page configuration; its presence enables page tracking
///
/// # Returns
/// An [`ImageOcrResult`] holding the text and, for multi-frame TIFFs with page
/// tracking enabled, the page boundaries and per-page contents.
///
/// # Errors
/// Propagates the parsing error from frame counting when page tracking is
/// requested for data that cannot be opened as a TIFF.
#[cfg(feature = "ocr")]
pub fn extract_text_from_image_with_ocr(
    bytes: &[u8],
    mime_type: &str,
    ocr_result: String,
    page_config: Option<&crate::core::config::PageConfig>,
) -> Result<ImageOcrResult> {
    let is_tiff = mime_type.to_lowercase().contains("tiff");
    let should_track_pages = page_config.is_some() && is_tiff;

    // Only inspect the TIFF structure when page tracking can actually apply.
    let frame_count = if should_track_pages {
        detect_tiff_frame_count(bytes)?
    } else {
        1
    };

    if frame_count <= 1 {
        return Ok(ImageOcrResult {
            content: ocr_result,
            boundaries: None,
            page_contents: None,
        });
    }

    let content_len = ocr_result.len();
    // `frame_count` is at least 2 here, so the division cannot be by zero.
    let content_per_frame = content_len / frame_count;

    let mut boundaries = Vec::with_capacity(frame_count);
    let mut page_contents = Vec::with_capacity(frame_count);
    let mut byte_offset = 0;

    for frame_num in 1..=frame_count {
        let frame_end = if frame_num == frame_count {
            // The final frame absorbs whatever the integer division left over.
            content_len
        } else {
            let raw_end = (frame_num * content_per_frame).min(content_len);
            // Advance to the next char boundary so the slice below is valid UTF-8.
            (raw_end..=content_len)
                .find(|&i| ocr_result.is_char_boundary(i))
                .unwrap_or(content_len)
        };

        boundaries.push(crate::types::PageBoundary {
            byte_start: byte_offset,
            byte_end: frame_end,
            page_number: frame_num,
        });

        page_contents.push(crate::types::PageContent {
            page_number: frame_num,
            content: ocr_result[byte_offset..frame_end].to_string(),
            tables: vec![],
            images: vec![],
        });

        byte_offset = frame_end;
    }

    Ok(ImageOcrResult {
        content: ocr_result,
        boundaries: Some(boundaries),
        page_contents: Some(page_contents),
    })
}
|
|
219
|
-
|
|
220
|
-
#[cfg(test)]
|
|
221
|
-
mod tests {
|
|
222
|
-
use super::*;
|
|
223
|
-
use image::{ImageBuffer, ImageFormat, Rgb, RgbImage};
|
|
224
|
-
use std::io::Cursor;
|
|
225
|
-
|
|
226
|
-
fn create_test_image(width: u32, height: u32, format: ImageFormat) -> Vec<u8> {
|
|
227
|
-
let img: RgbImage = ImageBuffer::from_fn(width, height, |x, y| {
|
|
228
|
-
let r = ((x as f32 / width as f32) * 255.0) as u8;
|
|
229
|
-
let g = ((y as f32 / height as f32) * 255.0) as u8;
|
|
230
|
-
let b = 128;
|
|
231
|
-
Rgb([r, g, b])
|
|
232
|
-
});
|
|
233
|
-
|
|
234
|
-
let mut bytes: Vec<u8> = Vec::new();
|
|
235
|
-
let mut cursor = Cursor::new(&mut bytes);
|
|
236
|
-
img.write_to(&mut cursor, format).unwrap();
|
|
237
|
-
bytes
|
|
238
|
-
}
|
|
239
|
-
|
|
240
|
-
#[test]
fn test_extract_png_image_returns_correct_metadata() {
    // A 100x80 PNG must report its size and the "PNG" format name.
    let encoded = create_test_image(100, 80, ImageFormat::Png);
    let outcome = extract_image_metadata(&encoded);
    assert!(outcome.is_ok());

    let meta = outcome.unwrap();
    assert_eq!((meta.width, meta.height), (100, 80));
    assert_eq!(meta.format, "PNG");
}
|
|
251
|
-
|
|
252
|
-
#[test]
fn test_extract_jpeg_image_returns_correct_metadata() {
    // A 200x150 JPEG must report its size and the "JPEG" format name.
    let encoded = create_test_image(200, 150, ImageFormat::Jpeg);
    let outcome = extract_image_metadata(&encoded);
    assert!(outcome.is_ok());

    let meta = outcome.unwrap();
    assert_eq!((meta.width, meta.height), (200, 150));
    assert_eq!(meta.format, "JPEG");
}
|
|
263
|
-
|
|
264
|
-
#[test]
fn test_extract_webp_image_returns_correct_metadata() {
    // A 120x90 WebP must report its size and the "WEBP" format name.
    let encoded = create_test_image(120, 90, ImageFormat::WebP);
    let outcome = extract_image_metadata(&encoded);
    assert!(outcome.is_ok());

    let meta = outcome.unwrap();
    assert_eq!((meta.width, meta.height), (120, 90));
    assert_eq!(meta.format, "WEBP");
}
|
|
275
|
-
|
|
276
|
-
#[test]
fn test_extract_bmp_image_returns_correct_metadata() {
    // A 50x50 BMP must report its size and the "BMP" format name.
    let encoded = create_test_image(50, 50, ImageFormat::Bmp);
    let outcome = extract_image_metadata(&encoded);
    assert!(outcome.is_ok());

    let meta = outcome.unwrap();
    assert_eq!((meta.width, meta.height), (50, 50));
    assert_eq!(meta.format, "BMP");
}
|
|
287
|
-
|
|
288
|
-
/// A generated 180x120 TIFF must report its exact size and the `TIFF` format tag.
#[test]
fn test_extract_tiff_image_returns_correct_metadata() {
    let tiff = create_test_image(180, 120, ImageFormat::Tiff);
    let outcome = extract_image_metadata(&tiff);
    assert!(outcome.is_ok());

    let info = outcome.unwrap();
    assert_eq!(info.width, 180);
    assert_eq!(info.height, 120);
    assert_eq!(info.format, "TIFF");
}
|
|
299
|
-
|
|
300
|
-
/// A generated 64x64 GIF must report its exact size and the `GIF` format tag.
#[test]
fn test_extract_gif_image_returns_correct_metadata() {
    let gif = create_test_image(64, 64, ImageFormat::Gif);
    let outcome = extract_image_metadata(&gif);
    assert!(outcome.is_ok());

    let info = outcome.unwrap();
    assert_eq!(info.width, 64);
    assert_eq!(info.height, 64);
    assert_eq!(info.format, "GIF");
}
|
|
311
|
-
|
|
312
|
-
/// A very wide, very short image (1000x10) keeps its 100:1 aspect ratio in the metadata.
#[test]
fn test_extract_image_extreme_aspect_ratio() {
    let banner = create_test_image(1000, 10, ImageFormat::Png);
    let outcome = extract_image_metadata(&banner);
    assert!(outcome.is_ok());

    let info = outcome.unwrap();
    assert_eq!(info.width, 1000);
    assert_eq!(info.height, 10);
    // Integer division: 1000 / 10 == 100.
    assert!(info.width / info.height >= 100);
}
|
|
323
|
-
|
|
324
|
-
/// Width and height of a 640x480 image are reported unchanged.
#[test]
fn test_extract_image_dimensions_correctly() {
    let vga = create_test_image(640, 480, ImageFormat::Png);
    let outcome = extract_image_metadata(&vga);
    assert!(outcome.is_ok());

    let info = outcome.unwrap();
    assert_eq!(info.width, 640);
    assert_eq!(info.height, 480);
}
|
|
334
|
-
|
|
335
|
-
/// The format tag follows the encoded container, not the pixel content.
#[test]
fn test_extract_image_format_correctly() {
    let as_png = create_test_image(100, 100, ImageFormat::Png);
    let as_jpeg = create_test_image(100, 100, ImageFormat::Jpeg);

    let png_info = extract_image_metadata(&as_png).unwrap();
    let jpeg_info = extract_image_metadata(&as_jpeg).unwrap();

    assert_eq!(png_info.format, "PNG");
    assert_eq!(jpeg_info.format, "JPEG");
}
|
|
346
|
-
|
|
347
|
-
/// A freshly encoded PNG carries no EXIF block, so the EXIF map must be empty.
#[test]
fn test_extract_image_without_exif_returns_empty_map() {
    let png = create_test_image(100, 100, ImageFormat::Png);
    let outcome = extract_image_metadata(&png);
    assert!(outcome.is_ok());

    let info = outcome.unwrap();
    assert!(info.exif_data.is_empty());
}
|
|
356
|
-
|
|
357
|
-
/// Runs EXIF extraction on a JPEG produced by `create_test_image`.
///
/// Despite the test name, the fixture has no EXIF segment, so the expected
/// outcome asserted here is an empty EXIF map.
#[test]
fn test_extract_exif_data_from_jpeg_with_exif() {
    let jpeg = create_test_image(100, 100, ImageFormat::Jpeg);
    let outcome = extract_image_metadata(&jpeg);
    assert!(outcome.is_ok());

    let info = outcome.unwrap();
    assert_eq!(info.exif_data.len(), 0);
}
|
|
366
|
-
|
|
367
|
-
/// Six arbitrary bytes match no known image signature and must be rejected.
#[test]
fn test_extract_image_metadata_invalid_returns_error() {
    let garbage: [u8; 6] = [0, 1, 2, 3, 4, 5];
    let outcome = extract_image_metadata(&garbage);
    assert!(outcome.is_err());
}
|
|
373
|
-
|
|
374
|
-
/// A PNG whose header has been overwritten must be rejected.
///
/// Bytes 20..50 cover the IHDR height, bit depth, colour type and CRC fields
/// plus the start of the following chunk, so the decoder can no longer
/// validate the header. The previous assertion (`is_ok() || is_err()`) was a
/// tautology and could never fail.
#[test]
fn test_extract_image_corrupted_data_returns_error() {
    let mut bytes = create_test_image(100, 100, ImageFormat::Png);
    // Guard the precondition so the corruption below is guaranteed to apply.
    assert!(bytes.len() > 50, "fixture too small to corrupt");

    for byte in bytes.iter_mut().take(50).skip(20) {
        *byte = 0xFF;
    }

    let result = extract_image_metadata(&bytes);
    assert!(result.is_err(), "corrupted PNG header must not decode");
}
|
|
386
|
-
|
|
387
|
-
/// An empty buffer has no detectable format and must be rejected.
#[test]
fn test_extract_image_empty_bytes_returns_error() {
    let nothing: Vec<u8> = Vec::new();
    let outcome = extract_image_metadata(&nothing);
    assert!(outcome.is_err());
}
|
|
393
|
-
|
|
394
|
-
/// A 12-byte signature box of a format the reader does not handle must be rejected.
#[test]
fn test_extract_image_unsupported_format_returns_error() {
    let signature_only: [u8; 12] = [
        0x00, 0x00, 0x00, 0x0C, 0x6A, 0x50, 0x20, 0x20, 0x0D, 0x0A, 0x87, 0x0A,
    ];
    let outcome = extract_image_metadata(&signature_only);
    assert!(outcome.is_err());
}
|
|
400
|
-
|
|
401
|
-
/// The smallest possible image (a single pixel) is still described correctly.
#[test]
fn test_extract_very_small_image_1x1_pixel() {
    let pixel = create_test_image(1, 1, ImageFormat::Png);
    let outcome = extract_image_metadata(&pixel);
    assert!(outcome.is_ok());

    let info = outcome.unwrap();
    assert_eq!(info.width, 1);
    assert_eq!(info.height, 1);
    assert_eq!(info.format, "PNG");
}
|
|
412
|
-
|
|
413
|
-
/// A 2048x1536 image reports its full dimensions.
#[test]
fn test_extract_large_image_dimensions() {
    let large = create_test_image(2048, 1536, ImageFormat::Png);
    let outcome = extract_image_metadata(&large);
    assert!(outcome.is_ok());

    let info = outcome.unwrap();
    assert_eq!(info.width, 2048);
    assert_eq!(info.height, 1536);
}
|
|
423
|
-
|
|
424
|
-
/// An image encoded without any metadata yields an empty EXIF map.
#[test]
fn test_extract_image_with_no_metadata_has_empty_exif() {
    let plain = create_test_image(100, 100, ImageFormat::Png);
    let outcome = extract_image_metadata(&plain);
    assert!(outcome.is_ok());

    let info = outcome.unwrap();
    assert!(info.exif_data.is_empty());
}
|
|
433
|
-
|
|
434
|
-
/// Calling the EXIF helper directly on a generated PNG returns an empty map.
#[test]
fn test_extract_exif_data_returns_empty_map_for_non_jpeg() {
    let png = create_test_image(100, 100, ImageFormat::Png);
    let tags = extract_exif_data(&png);
    assert!(tags.is_empty());
}
|
|
440
|
-
|
|
441
|
-
/// A portrait image (400x800) keeps height greater than width.
#[test]
fn test_extract_rectangular_image_portrait_orientation() {
    let portrait = create_test_image(400, 800, ImageFormat::Jpeg);
    let outcome = extract_image_metadata(&portrait);
    assert!(outcome.is_ok());

    let info = outcome.unwrap();
    assert_eq!(info.width, 400);
    assert_eq!(info.height, 800);
    assert!(info.height > info.width);
}
|
|
452
|
-
|
|
453
|
-
/// A landscape image (800x400) keeps width greater than height.
#[test]
fn test_extract_rectangular_image_landscape_orientation() {
    let landscape = create_test_image(800, 400, ImageFormat::Png);
    let outcome = extract_image_metadata(&landscape);
    assert!(outcome.is_ok());

    let info = outcome.unwrap();
    assert_eq!(info.width, 800);
    assert_eq!(info.height, 400);
    assert!(info.width > info.height);
}
|
|
464
|
-
|
|
465
|
-
/// A square image (512x512) reports identical width and height.
#[test]
fn test_extract_square_image_equal_dimensions() {
    let square = create_test_image(512, 512, ImageFormat::Png);
    let outcome = extract_image_metadata(&square);
    assert!(outcome.is_ok());

    let info = outcome.unwrap();
    assert_eq!(info.width, 512);
    assert_eq!(info.height, 512);
    assert_eq!(info.width, info.height);
}
|
|
476
|
-
|
|
477
|
-
/// Format tags are always reported in upper case, including `WEBP`.
#[test]
fn test_extract_metadata_preserves_format_case() {
    let as_png = create_test_image(100, 100, ImageFormat::Png);
    let as_jpeg = create_test_image(100, 100, ImageFormat::Jpeg);
    let as_webp = create_test_image(100, 100, ImageFormat::WebP);

    let png_info = extract_image_metadata(&as_png).unwrap();
    let jpeg_info = extract_image_metadata(&as_jpeg).unwrap();
    let webp_info = extract_image_metadata(&as_webp).unwrap();

    assert_eq!(png_info.format, "PNG");
    assert_eq!(jpeg_info.format, "JPEG");
    assert_eq!(webp_info.format, "WEBP");
}
|
|
491
|
-
}
|
|
1
|
+
//! Image extraction functionality.
|
|
2
|
+
//!
|
|
3
|
+
//! This module provides functions for extracting metadata and EXIF data from images,
|
|
4
|
+
//! including support for multi-frame TIFF files.
|
|
5
|
+
|
|
6
|
+
use crate::error::{KreuzbergError, Result};
|
|
7
|
+
use exif::{In, Reader, Tag};
|
|
8
|
+
use image::ImageReader;
|
|
9
|
+
use std::collections::HashMap;
|
|
10
|
+
use std::io::Cursor;
|
|
11
|
+
|
|
12
|
+
/// Basic facts about an image: pixel dimensions, container format and EXIF tags.
#[derive(Clone, Debug)]
pub struct ImageMetadata {
    /// Width of the image, in pixels.
    pub width: u32,
    /// Height of the image, in pixels.
    pub height: u32,
    /// Upper-case name of the detected image format, e.g. `"PNG"` or `"JPEG"`.
    pub format: String,
    /// EXIF tag names mapped to their display values; empty when no EXIF data was found.
    pub exif_data: HashMap<String, String>,
}
|
|
24
|
+
|
|
25
|
+
/// Extract metadata from image bytes.
|
|
26
|
+
///
|
|
27
|
+
/// Extracts dimensions, format, and EXIF data from the image.
|
|
28
|
+
pub fn extract_image_metadata(bytes: &[u8]) -> Result<ImageMetadata> {
|
|
29
|
+
let reader = ImageReader::new(Cursor::new(bytes))
|
|
30
|
+
.with_guessed_format()
|
|
31
|
+
.map_err(|e| KreuzbergError::parsing(format!("Failed to read image format: {}", e)))?;
|
|
32
|
+
|
|
33
|
+
let format = reader
|
|
34
|
+
.format()
|
|
35
|
+
.ok_or_else(|| KreuzbergError::parsing("Could not determine image format".to_string()))?;
|
|
36
|
+
|
|
37
|
+
let image = reader
|
|
38
|
+
.decode()
|
|
39
|
+
.map_err(|e| KreuzbergError::parsing(format!("Failed to decode image: {}", e)))?;
|
|
40
|
+
|
|
41
|
+
let width = image.width();
|
|
42
|
+
let height = image.height();
|
|
43
|
+
let format_str = format!("{:?}", format).to_uppercase();
|
|
44
|
+
|
|
45
|
+
let exif_data = extract_exif_data(bytes);
|
|
46
|
+
|
|
47
|
+
Ok(ImageMetadata {
|
|
48
|
+
width,
|
|
49
|
+
height,
|
|
50
|
+
format: format_str,
|
|
51
|
+
exif_data,
|
|
52
|
+
})
|
|
53
|
+
}
|
|
54
|
+
|
|
55
|
+
/// Extract EXIF data from image bytes.
|
|
56
|
+
///
|
|
57
|
+
/// Returns a HashMap of EXIF tags and their values.
|
|
58
|
+
/// If EXIF data is not available or cannot be parsed, returns an empty HashMap.
|
|
59
|
+
fn extract_exif_data(bytes: &[u8]) -> HashMap<String, String> {
|
|
60
|
+
let mut exif_map = HashMap::new();
|
|
61
|
+
|
|
62
|
+
let exif_reader = match Reader::new().read_from_container(&mut Cursor::new(bytes)) {
|
|
63
|
+
Ok(reader) => reader,
|
|
64
|
+
Err(_) => return exif_map,
|
|
65
|
+
};
|
|
66
|
+
|
|
67
|
+
let common_tags = [
|
|
68
|
+
(Tag::Make, "Make"),
|
|
69
|
+
(Tag::Model, "Model"),
|
|
70
|
+
(Tag::DateTime, "DateTime"),
|
|
71
|
+
(Tag::DateTimeOriginal, "DateTimeOriginal"),
|
|
72
|
+
(Tag::DateTimeDigitized, "DateTimeDigitized"),
|
|
73
|
+
(Tag::Software, "Software"),
|
|
74
|
+
(Tag::Orientation, "Orientation"),
|
|
75
|
+
(Tag::XResolution, "XResolution"),
|
|
76
|
+
(Tag::YResolution, "YResolution"),
|
|
77
|
+
(Tag::ResolutionUnit, "ResolutionUnit"),
|
|
78
|
+
(Tag::ExposureTime, "ExposureTime"),
|
|
79
|
+
(Tag::FNumber, "FNumber"),
|
|
80
|
+
(Tag::PhotographicSensitivity, "ISO"),
|
|
81
|
+
(Tag::FocalLength, "FocalLength"),
|
|
82
|
+
(Tag::Flash, "Flash"),
|
|
83
|
+
(Tag::WhiteBalance, "WhiteBalance"),
|
|
84
|
+
(Tag::GPSLatitude, "GPSLatitude"),
|
|
85
|
+
(Tag::GPSLongitude, "GPSLongitude"),
|
|
86
|
+
(Tag::GPSAltitude, "GPSAltitude"),
|
|
87
|
+
];
|
|
88
|
+
|
|
89
|
+
for (tag, field_name) in common_tags {
|
|
90
|
+
if let Some(field) = exif_reader.get_field(tag, In::PRIMARY) {
|
|
91
|
+
exif_map.insert(field_name.to_string(), field.display_value().to_string());
|
|
92
|
+
}
|
|
93
|
+
}
|
|
94
|
+
|
|
95
|
+
exif_map
|
|
96
|
+
}
|
|
97
|
+
|
|
98
|
+
/// Result of OCR extraction from an image with optional page tracking.
|
|
99
|
+
#[derive(Debug, Clone)]
|
|
100
|
+
pub struct ImageOcrResult {
|
|
101
|
+
/// Extracted text content
|
|
102
|
+
pub content: String,
|
|
103
|
+
/// Character byte boundaries per frame (for multi-frame TIFFs)
|
|
104
|
+
pub boundaries: Option<Vec<crate::types::PageBoundary>>,
|
|
105
|
+
/// Per-frame content information
|
|
106
|
+
pub page_contents: Option<Vec<crate::types::PageContent>>,
|
|
107
|
+
}
|
|
108
|
+
|
|
109
|
+
/// Count the frames (pages) stored in a TIFF file.
///
/// A decoder is opened on the bytes, which positions it on the first frame.
/// It is then advanced until moving to the next frame fails. Any failure to
/// advance is treated as the end of the file.
///
/// # Arguments
/// * `bytes` - Raw TIFF file bytes
///
/// # Returns
/// The number of frames (at least 1) when a decoder can be opened on the
/// data, or a parsing error otherwise.
#[cfg(feature = "ocr")]
fn detect_tiff_frame_count(bytes: &[u8]) -> Result<usize> {
    use tiff::decoder::Decoder;

    let opened = Decoder::new(Cursor::new(bytes));
    let mut decoder = opened.map_err(|e| KreuzbergError::parsing(format!("TIFF decode: {}", e)))?;

    // The decoder starts on frame one, so counting begins at 1.
    let mut frames = 1;
    loop {
        match decoder.next_image() {
            Ok(_) => frames += 1,
            Err(_) => break,
        }
    }
    Ok(frames)
}
|
|
131
|
+
|
|
132
|
+
/// Package OCR text for an image, adding per-frame page tracking for multi-frame TIFFs.
///
/// Behaviour:
/// - Page tracking is attempted only when `page_config` is `Some` and the MIME
///   type contains "tiff" (case-insensitive). Only the presence of
///   `page_config` is checked here.
/// - When tracking is off, or the TIFF has a single frame, the text is
///   returned unchanged with no boundaries.
/// - Otherwise the text is divided into `frame_count` consecutive slices of
///   roughly equal byte length. Each cut is moved forward to the next UTF-8
///   character boundary and the last frame takes the remainder. The split is
///   positional: it is derived from the text length, not from per-frame OCR.
///
/// # Arguments
/// * `bytes` - Image file bytes (inspected only to count TIFF frames)
/// * `mime_type` - MIME type (e.g., "image/tiff")
/// * `ocr_result` - Text already produced by the OCR backend
/// * `page_config` - Optional page configuration; enables boundary tracking when present
///
/// # Returns
/// An [`ImageOcrResult`] holding the full text plus, for multi-frame TIFFs
/// with tracking enabled, the per-frame boundaries and page contents.
///
/// # Errors
/// Returns a parsing error if page tracking is requested and the TIFF frame
/// count cannot be determined.
#[cfg(feature = "ocr")]
pub fn extract_text_from_image_with_ocr(
    bytes: &[u8],
    mime_type: &str,
    ocr_result: String,
    page_config: Option<&crate::core::config::PageConfig>,
) -> Result<ImageOcrResult> {
    let is_tiff = mime_type.to_lowercase().contains("tiff");
    let should_track_pages = page_config.is_some() && is_tiff;

    // The TIFF structure is only inspected when tracking was requested, so
    // non-TIFF inputs never reach the frame counter.
    let frame_count = if should_track_pages {
        detect_tiff_frame_count(bytes)?
    } else {
        1
    };

    if frame_count <= 1 {
        return Ok(ImageOcrResult {
            content: ocr_result,
            boundaries: None,
            page_contents: None,
        });
    }

    let content_len = ocr_result.len();
    // `frame_count >= 2` is guaranteed by the early return above, so the
    // division cannot be by zero.
    let content_per_frame = content_len / frame_count;

    let mut boundaries = Vec::with_capacity(frame_count);
    let mut page_contents = Vec::with_capacity(frame_count);
    let mut byte_offset = 0;

    for frame_num in 1..=frame_count {
        let frame_end = if frame_num == frame_count {
            // The final frame absorbs whatever the integer division left over.
            content_len
        } else {
            let raw_end = (frame_num * content_per_frame).min(content_len);
            // Never cut inside a multi-byte character: advance to the next boundary.
            (raw_end..=content_len)
                .find(|&i| ocr_result.is_char_boundary(i))
                .unwrap_or(content_len)
        };

        boundaries.push(crate::types::PageBoundary {
            byte_start: byte_offset,
            byte_end: frame_end,
            page_number: frame_num,
        });

        page_contents.push(crate::types::PageContent {
            page_number: frame_num,
            content: ocr_result[byte_offset..frame_end].to_string(),
            tables: vec![],
            images: vec![],
        });

        byte_offset = frame_end;
    }

    Ok(ImageOcrResult {
        content: ocr_result,
        boundaries: Some(boundaries),
        page_contents: Some(page_contents),
    })
}
|
|
219
|
+
|
|
220
|
+
#[cfg(test)]
mod tests {
    use super::*;
    use image::{ImageBuffer, ImageFormat, Rgb, RgbImage};
    use std::io::Cursor;

    /// Encode a synthetic RGB gradient of the given size in the requested format.
    ///
    /// Red ramps with x, green ramps with y and blue is fixed at 128. The
    /// encoded bytes are returned; encoding failures panic, which is
    /// acceptable for a test fixture.
    fn create_test_image(width: u32, height: u32, format: ImageFormat) -> Vec<u8> {
        let img: RgbImage = ImageBuffer::from_fn(width, height, |x, y| {
            let r = ((x as f32 / width as f32) * 255.0) as u8;
            let g = ((y as f32 / height as f32) * 255.0) as u8;
            let b = 128;
            Rgb([r, g, b])
        });

        let mut bytes: Vec<u8> = Vec::new();
        let mut cursor = Cursor::new(&mut bytes);
        img.write_to(&mut cursor, format).unwrap();
        bytes
    }

    /// A generated 100x80 PNG reports its exact size and the `PNG` tag.
    #[test]
    fn test_extract_png_image_returns_correct_metadata() {
        let bytes = create_test_image(100, 80, ImageFormat::Png);
        let result = extract_image_metadata(&bytes);

        assert!(result.is_ok());
        let metadata = result.unwrap();
        assert_eq!(metadata.width, 100);
        assert_eq!(metadata.height, 80);
        assert_eq!(metadata.format, "PNG");
    }

    /// A generated 200x150 JPEG reports its exact size and the `JPEG` tag.
    #[test]
    fn test_extract_jpeg_image_returns_correct_metadata() {
        let bytes = create_test_image(200, 150, ImageFormat::Jpeg);
        let result = extract_image_metadata(&bytes);

        assert!(result.is_ok());
        let metadata = result.unwrap();
        assert_eq!(metadata.width, 200);
        assert_eq!(metadata.height, 150);
        assert_eq!(metadata.format, "JPEG");
    }

    /// A generated 120x90 WebP reports its exact size and the `WEBP` tag.
    #[test]
    fn test_extract_webp_image_returns_correct_metadata() {
        let bytes = create_test_image(120, 90, ImageFormat::WebP);
        let result = extract_image_metadata(&bytes);

        assert!(result.is_ok());
        let metadata = result.unwrap();
        assert_eq!(metadata.width, 120);
        assert_eq!(metadata.height, 90);
        assert_eq!(metadata.format, "WEBP");
    }

    /// A generated 50x50 BMP reports its exact size and the `BMP` tag.
    #[test]
    fn test_extract_bmp_image_returns_correct_metadata() {
        let bytes = create_test_image(50, 50, ImageFormat::Bmp);
        let result = extract_image_metadata(&bytes);

        assert!(result.is_ok());
        let metadata = result.unwrap();
        assert_eq!(metadata.width, 50);
        assert_eq!(metadata.height, 50);
        assert_eq!(metadata.format, "BMP");
    }

    /// A generated 180x120 TIFF reports its exact size and the `TIFF` tag.
    #[test]
    fn test_extract_tiff_image_returns_correct_metadata() {
        let bytes = create_test_image(180, 120, ImageFormat::Tiff);
        let result = extract_image_metadata(&bytes);

        assert!(result.is_ok());
        let metadata = result.unwrap();
        assert_eq!(metadata.width, 180);
        assert_eq!(metadata.height, 120);
        assert_eq!(metadata.format, "TIFF");
    }

    /// A generated 64x64 GIF reports its exact size and the `GIF` tag.
    #[test]
    fn test_extract_gif_image_returns_correct_metadata() {
        let bytes = create_test_image(64, 64, ImageFormat::Gif);
        let result = extract_image_metadata(&bytes);

        assert!(result.is_ok());
        let metadata = result.unwrap();
        assert_eq!(metadata.width, 64);
        assert_eq!(metadata.height, 64);
        assert_eq!(metadata.format, "GIF");
    }

    /// A very wide, very short image keeps its 100:1 aspect ratio.
    #[test]
    fn test_extract_image_extreme_aspect_ratio() {
        let bytes = create_test_image(1000, 10, ImageFormat::Png);
        let result = extract_image_metadata(&bytes);

        assert!(result.is_ok());
        let metadata = result.unwrap();
        assert_eq!(metadata.width, 1000);
        assert_eq!(metadata.height, 10);
        assert!(metadata.width / metadata.height >= 100);
    }

    /// Width and height of a 640x480 image are reported unchanged.
    #[test]
    fn test_extract_image_dimensions_correctly() {
        let bytes = create_test_image(640, 480, ImageFormat::Png);
        let result = extract_image_metadata(&bytes);

        assert!(result.is_ok());
        let metadata = result.unwrap();
        assert_eq!(metadata.width, 640);
        assert_eq!(metadata.height, 480);
    }

    /// The format tag follows the encoded container.
    #[test]
    fn test_extract_image_format_correctly() {
        let png_bytes = create_test_image(100, 100, ImageFormat::Png);
        let jpeg_bytes = create_test_image(100, 100, ImageFormat::Jpeg);

        let png_metadata = extract_image_metadata(&png_bytes).unwrap();
        let jpeg_metadata = extract_image_metadata(&jpeg_bytes).unwrap();

        assert_eq!(png_metadata.format, "PNG");
        assert_eq!(jpeg_metadata.format, "JPEG");
    }

    /// A freshly encoded PNG has no EXIF block, so the map is empty.
    #[test]
    fn test_extract_image_without_exif_returns_empty_map() {
        let bytes = create_test_image(100, 100, ImageFormat::Png);
        let result = extract_image_metadata(&bytes);

        assert!(result.is_ok());
        let metadata = result.unwrap();
        assert!(metadata.exif_data.is_empty());
    }

    /// Runs EXIF extraction on a generated JPEG.
    ///
    /// Despite the name, the fixture carries no EXIF segment, so the expected
    /// result asserted here is an empty map.
    #[test]
    fn test_extract_exif_data_from_jpeg_with_exif() {
        let bytes = create_test_image(100, 100, ImageFormat::Jpeg);
        let result = extract_image_metadata(&bytes);

        assert!(result.is_ok());
        let metadata = result.unwrap();
        assert_eq!(metadata.exif_data.len(), 0);
    }

    /// Six arbitrary bytes match no image signature and are rejected.
    #[test]
    fn test_extract_image_metadata_invalid_returns_error() {
        let invalid_bytes = vec![0, 1, 2, 3, 4, 5];
        let result = extract_image_metadata(&invalid_bytes);
        assert!(result.is_err());
    }

    /// A PNG whose header has been overwritten must be rejected.
    ///
    /// Bytes 20..50 cover the IHDR height, bit depth, colour type and CRC
    /// fields plus the start of the following chunk. The previous assertion
    /// (`is_ok() || is_err()`) was a tautology and could never fail.
    #[test]
    fn test_extract_image_corrupted_data_returns_error() {
        let mut bytes = create_test_image(100, 100, ImageFormat::Png);
        // Guard the precondition so the corruption below is guaranteed to apply.
        assert!(bytes.len() > 50, "fixture too small to corrupt");

        for byte in bytes.iter_mut().take(50).skip(20) {
            *byte = 0xFF;
        }

        let result = extract_image_metadata(&bytes);
        assert!(result.is_err(), "corrupted PNG header must not decode");
    }

    /// An empty buffer has no detectable format and is rejected.
    #[test]
    fn test_extract_image_empty_bytes_returns_error() {
        let empty_bytes: Vec<u8> = Vec::new();
        let result = extract_image_metadata(&empty_bytes);
        assert!(result.is_err());
    }

    /// A 12-byte signature box of a format the reader does not handle is rejected.
    #[test]
    fn test_extract_image_unsupported_format_returns_error() {
        let unsupported_bytes = vec![0x00, 0x00, 0x00, 0x0C, 0x6A, 0x50, 0x20, 0x20, 0x0D, 0x0A, 0x87, 0x0A];
        let result = extract_image_metadata(&unsupported_bytes);
        assert!(result.is_err());
    }

    /// A single-pixel image is still described correctly.
    #[test]
    fn test_extract_very_small_image_1x1_pixel() {
        let bytes = create_test_image(1, 1, ImageFormat::Png);
        let result = extract_image_metadata(&bytes);

        assert!(result.is_ok());
        let metadata = result.unwrap();
        assert_eq!(metadata.width, 1);
        assert_eq!(metadata.height, 1);
        assert_eq!(metadata.format, "PNG");
    }

    /// A 2048x1536 image reports its full dimensions.
    #[test]
    fn test_extract_large_image_dimensions() {
        let bytes = create_test_image(2048, 1536, ImageFormat::Png);
        let result = extract_image_metadata(&bytes);

        assert!(result.is_ok());
        let metadata = result.unwrap();
        assert_eq!(metadata.width, 2048);
        assert_eq!(metadata.height, 1536);
    }

    /// An image encoded without metadata yields an empty EXIF map.
    #[test]
    fn test_extract_image_with_no_metadata_has_empty_exif() {
        let bytes = create_test_image(100, 100, ImageFormat::Png);
        let result = extract_image_metadata(&bytes);

        assert!(result.is_ok());
        let metadata = result.unwrap();
        assert!(metadata.exif_data.is_empty());
    }

    /// Calling the EXIF helper directly on a generated PNG returns an empty map.
    #[test]
    fn test_extract_exif_data_returns_empty_map_for_non_jpeg() {
        let png_bytes = create_test_image(100, 100, ImageFormat::Png);
        let exif_data = extract_exif_data(&png_bytes);
        assert!(exif_data.is_empty());
    }

    /// A portrait image (400x800) keeps height greater than width.
    #[test]
    fn test_extract_rectangular_image_portrait_orientation() {
        let bytes = create_test_image(400, 800, ImageFormat::Jpeg);
        let result = extract_image_metadata(&bytes);

        assert!(result.is_ok());
        let metadata = result.unwrap();
        assert_eq!(metadata.width, 400);
        assert_eq!(metadata.height, 800);
        assert!(metadata.height > metadata.width);
    }

    /// A landscape image (800x400) keeps width greater than height.
    #[test]
    fn test_extract_rectangular_image_landscape_orientation() {
        let bytes = create_test_image(800, 400, ImageFormat::Png);
        let result = extract_image_metadata(&bytes);

        assert!(result.is_ok());
        let metadata = result.unwrap();
        assert_eq!(metadata.width, 800);
        assert_eq!(metadata.height, 400);
        assert!(metadata.width > metadata.height);
    }

    /// A square image (512x512) reports identical width and height.
    #[test]
    fn test_extract_square_image_equal_dimensions() {
        let bytes = create_test_image(512, 512, ImageFormat::Png);
        let result = extract_image_metadata(&bytes);

        assert!(result.is_ok());
        let metadata = result.unwrap();
        assert_eq!(metadata.width, 512);
        assert_eq!(metadata.height, 512);
        assert_eq!(metadata.width, metadata.height);
    }

    /// Format tags are always reported in upper case, including `WEBP`.
    #[test]
    fn test_extract_metadata_preserves_format_case() {
        let png_bytes = create_test_image(100, 100, ImageFormat::Png);
        let jpeg_bytes = create_test_image(100, 100, ImageFormat::Jpeg);
        let webp_bytes = create_test_image(100, 100, ImageFormat::WebP);

        let png_meta = extract_image_metadata(&png_bytes).unwrap();
        let jpeg_meta = extract_image_metadata(&jpeg_bytes).unwrap();
        let webp_meta = extract_image_metadata(&webp_bytes).unwrap();

        assert_eq!(png_meta.format, "PNG");
        assert_eq!(jpeg_meta.format, "JPEG");
        assert_eq!(webp_meta.format, "WEBP");
    }
}
|