kreuzberg 4.0.0.pre.rc.13 → 4.0.0.pre.rc.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +14 -14
- data/.rspec +3 -3
- data/.rubocop.yaml +1 -1
- data/.rubocop.yml +538 -538
- data/Gemfile +8 -8
- data/Gemfile.lock +105 -2
- data/README.md +454 -454
- data/Rakefile +33 -25
- data/Steepfile +47 -47
- data/examples/async_patterns.rb +341 -341
- data/ext/kreuzberg_rb/extconf.rb +45 -45
- data/ext/kreuzberg_rb/native/.cargo/config.toml +2 -2
- data/ext/kreuzberg_rb/native/Cargo.lock +6940 -6941
- data/ext/kreuzberg_rb/native/Cargo.toml +54 -54
- data/ext/kreuzberg_rb/native/README.md +425 -425
- data/ext/kreuzberg_rb/native/build.rs +15 -15
- data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -11
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -14
- data/ext/kreuzberg_rb/native/include/strings.h +20 -20
- data/ext/kreuzberg_rb/native/include/unistd.h +47 -47
- data/ext/kreuzberg_rb/native/src/lib.rs +3158 -3158
- data/extconf.rb +28 -28
- data/kreuzberg.gemspec +214 -214
- data/lib/kreuzberg/api_proxy.rb +142 -142
- data/lib/kreuzberg/cache_api.rb +81 -81
- data/lib/kreuzberg/cli.rb +55 -55
- data/lib/kreuzberg/cli_proxy.rb +127 -127
- data/lib/kreuzberg/config.rb +724 -724
- data/lib/kreuzberg/error_context.rb +80 -80
- data/lib/kreuzberg/errors.rb +118 -118
- data/lib/kreuzberg/extraction_api.rb +340 -340
- data/lib/kreuzberg/mcp_proxy.rb +186 -186
- data/lib/kreuzberg/ocr_backend_protocol.rb +113 -113
- data/lib/kreuzberg/post_processor_protocol.rb +86 -86
- data/lib/kreuzberg/result.rb +279 -279
- data/lib/kreuzberg/setup_lib_path.rb +80 -80
- data/lib/kreuzberg/validator_protocol.rb +89 -89
- data/lib/kreuzberg/version.rb +5 -5
- data/lib/kreuzberg.rb +109 -109
- data/lib/{pdfium.dll → libpdfium.dylib} +0 -0
- data/sig/kreuzberg/internal.rbs +184 -184
- data/sig/kreuzberg.rbs +546 -546
- data/spec/binding/cache_spec.rb +227 -227
- data/spec/binding/cli_proxy_spec.rb +85 -85
- data/spec/binding/cli_spec.rb +55 -55
- data/spec/binding/config_spec.rb +345 -345
- data/spec/binding/config_validation_spec.rb +283 -283
- data/spec/binding/error_handling_spec.rb +213 -213
- data/spec/binding/errors_spec.rb +66 -66
- data/spec/binding/plugins/ocr_backend_spec.rb +307 -307
- data/spec/binding/plugins/postprocessor_spec.rb +269 -269
- data/spec/binding/plugins/validator_spec.rb +274 -274
- data/spec/fixtures/config.toml +39 -39
- data/spec/fixtures/config.yaml +41 -41
- data/spec/fixtures/invalid_config.toml +4 -4
- data/spec/smoke/package_spec.rb +178 -178
- data/spec/spec_helper.rb +42 -42
- data/vendor/Cargo.toml +1 -1
- data/vendor/kreuzberg/Cargo.toml +5 -5
- data/vendor/kreuzberg/README.md +230 -230
- data/vendor/kreuzberg/benches/otel_overhead.rs +48 -48
- data/vendor/kreuzberg/build.rs +843 -843
- data/vendor/kreuzberg/src/api/error.rs +81 -81
- data/vendor/kreuzberg/src/api/handlers.rs +199 -199
- data/vendor/kreuzberg/src/api/mod.rs +79 -79
- data/vendor/kreuzberg/src/api/server.rs +353 -353
- data/vendor/kreuzberg/src/api/types.rs +170 -170
- data/vendor/kreuzberg/src/cache/mod.rs +1167 -1167
- data/vendor/kreuzberg/src/chunking/mod.rs +1877 -1877
- data/vendor/kreuzberg/src/chunking/processor.rs +220 -220
- data/vendor/kreuzberg/src/core/batch_mode.rs +95 -95
- data/vendor/kreuzberg/src/core/config.rs +1080 -1080
- data/vendor/kreuzberg/src/core/extractor.rs +1156 -1156
- data/vendor/kreuzberg/src/core/io.rs +329 -329
- data/vendor/kreuzberg/src/core/mime.rs +605 -605
- data/vendor/kreuzberg/src/core/mod.rs +47 -47
- data/vendor/kreuzberg/src/core/pipeline.rs +1184 -1184
- data/vendor/kreuzberg/src/embeddings.rs +500 -500
- data/vendor/kreuzberg/src/error.rs +431 -431
- data/vendor/kreuzberg/src/extraction/archive.rs +954 -954
- data/vendor/kreuzberg/src/extraction/docx.rs +398 -398
- data/vendor/kreuzberg/src/extraction/email.rs +854 -854
- data/vendor/kreuzberg/src/extraction/excel.rs +688 -688
- data/vendor/kreuzberg/src/extraction/html.rs +601 -601
- data/vendor/kreuzberg/src/extraction/image.rs +491 -491
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +574 -574
- data/vendor/kreuzberg/src/extraction/markdown.rs +213 -213
- data/vendor/kreuzberg/src/extraction/mod.rs +81 -81
- data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -398
- data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -247
- data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -240
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -130
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +284 -284
- data/vendor/kreuzberg/src/extraction/pptx.rs +3100 -3100
- data/vendor/kreuzberg/src/extraction/structured.rs +490 -490
- data/vendor/kreuzberg/src/extraction/table.rs +328 -328
- data/vendor/kreuzberg/src/extraction/text.rs +269 -269
- data/vendor/kreuzberg/src/extraction/xml.rs +333 -333
- data/vendor/kreuzberg/src/extractors/archive.rs +447 -447
- data/vendor/kreuzberg/src/extractors/bibtex.rs +470 -470
- data/vendor/kreuzberg/src/extractors/docbook.rs +504 -504
- data/vendor/kreuzberg/src/extractors/docx.rs +400 -400
- data/vendor/kreuzberg/src/extractors/email.rs +157 -157
- data/vendor/kreuzberg/src/extractors/epub.rs +708 -708
- data/vendor/kreuzberg/src/extractors/excel.rs +345 -345
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +492 -492
- data/vendor/kreuzberg/src/extractors/html.rs +407 -407
- data/vendor/kreuzberg/src/extractors/image.rs +219 -219
- data/vendor/kreuzberg/src/extractors/jats.rs +1054 -1054
- data/vendor/kreuzberg/src/extractors/jupyter.rs +368 -368
- data/vendor/kreuzberg/src/extractors/latex.rs +653 -653
- data/vendor/kreuzberg/src/extractors/markdown.rs +701 -701
- data/vendor/kreuzberg/src/extractors/mod.rs +429 -429
- data/vendor/kreuzberg/src/extractors/odt.rs +628 -628
- data/vendor/kreuzberg/src/extractors/opml.rs +635 -635
- data/vendor/kreuzberg/src/extractors/orgmode.rs +529 -529
- data/vendor/kreuzberg/src/extractors/pdf.rs +749 -749
- data/vendor/kreuzberg/src/extractors/pptx.rs +267 -267
- data/vendor/kreuzberg/src/extractors/rst.rs +577 -577
- data/vendor/kreuzberg/src/extractors/rtf.rs +809 -809
- data/vendor/kreuzberg/src/extractors/security.rs +484 -484
- data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -367
- data/vendor/kreuzberg/src/extractors/structured.rs +142 -142
- data/vendor/kreuzberg/src/extractors/text.rs +265 -265
- data/vendor/kreuzberg/src/extractors/typst.rs +651 -651
- data/vendor/kreuzberg/src/extractors/xml.rs +147 -147
- data/vendor/kreuzberg/src/image/dpi.rs +164 -164
- data/vendor/kreuzberg/src/image/mod.rs +6 -6
- data/vendor/kreuzberg/src/image/preprocessing.rs +417 -417
- data/vendor/kreuzberg/src/image/resize.rs +89 -89
- data/vendor/kreuzberg/src/keywords/config.rs +154 -154
- data/vendor/kreuzberg/src/keywords/mod.rs +237 -237
- data/vendor/kreuzberg/src/keywords/processor.rs +275 -275
- data/vendor/kreuzberg/src/keywords/rake.rs +293 -293
- data/vendor/kreuzberg/src/keywords/types.rs +68 -68
- data/vendor/kreuzberg/src/keywords/yake.rs +163 -163
- data/vendor/kreuzberg/src/language_detection/mod.rs +985 -985
- data/vendor/kreuzberg/src/language_detection/processor.rs +219 -219
- data/vendor/kreuzberg/src/lib.rs +113 -113
- data/vendor/kreuzberg/src/mcp/mod.rs +35 -35
- data/vendor/kreuzberg/src/mcp/server.rs +2076 -2076
- data/vendor/kreuzberg/src/ocr/cache.rs +469 -469
- data/vendor/kreuzberg/src/ocr/error.rs +37 -37
- data/vendor/kreuzberg/src/ocr/hocr.rs +216 -216
- data/vendor/kreuzberg/src/ocr/mod.rs +58 -58
- data/vendor/kreuzberg/src/ocr/processor.rs +863 -863
- data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -4
- data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -144
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +452 -452
- data/vendor/kreuzberg/src/ocr/types.rs +393 -393
- data/vendor/kreuzberg/src/ocr/utils.rs +47 -47
- data/vendor/kreuzberg/src/ocr/validation.rs +206 -206
- data/vendor/kreuzberg/src/panic_context.rs +154 -154
- data/vendor/kreuzberg/src/pdf/bindings.rs +44 -44
- data/vendor/kreuzberg/src/pdf/bundled.rs +346 -346
- data/vendor/kreuzberg/src/pdf/error.rs +130 -130
- data/vendor/kreuzberg/src/pdf/images.rs +139 -139
- data/vendor/kreuzberg/src/pdf/metadata.rs +489 -489
- data/vendor/kreuzberg/src/pdf/mod.rs +68 -68
- data/vendor/kreuzberg/src/pdf/rendering.rs +368 -368
- data/vendor/kreuzberg/src/pdf/table.rs +420 -420
- data/vendor/kreuzberg/src/pdf/text.rs +240 -240
- data/vendor/kreuzberg/src/plugins/extractor.rs +1044 -1044
- data/vendor/kreuzberg/src/plugins/mod.rs +212 -212
- data/vendor/kreuzberg/src/plugins/ocr.rs +639 -639
- data/vendor/kreuzberg/src/plugins/processor.rs +650 -650
- data/vendor/kreuzberg/src/plugins/registry.rs +1339 -1339
- data/vendor/kreuzberg/src/plugins/traits.rs +258 -258
- data/vendor/kreuzberg/src/plugins/validator.rs +967 -967
- data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -1470
- data/vendor/kreuzberg/src/text/mod.rs +25 -25
- data/vendor/kreuzberg/src/text/quality.rs +697 -697
- data/vendor/kreuzberg/src/text/quality_processor.rs +219 -219
- data/vendor/kreuzberg/src/text/string_utils.rs +217 -217
- data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -164
- data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -100
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -796
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -902
- data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -160
- data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -619
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -147
- data/vendor/kreuzberg/src/types.rs +1055 -1055
- data/vendor/kreuzberg/src/utils/mod.rs +17 -17
- data/vendor/kreuzberg/src/utils/quality.rs +959 -959
- data/vendor/kreuzberg/src/utils/string_utils.rs +381 -381
- data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -53
- data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -482
- data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -261
- data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -400
- data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -1205
- data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -280
- data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -425
- data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -172
- data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -622
- data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -1300
- data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -175
- data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -734
- data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -37
- data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -100
- data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -801
- data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -693
- data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -111
- data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -162
- data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -226
- data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -41
- data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -196
- data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -227
- data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -181
- data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -791
- data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -47
- data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -760
- data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -634
- data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -136
- data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -84
- data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -681
- data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -64
- data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -51
- data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -476
- data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -163
- data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -1
- data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -101
- data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -477
- data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -490
- data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -415
- data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -223
- data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -331
- data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -562
- data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -436
- data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -561
- data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -193
- data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -448
- data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -32
- data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -33
- data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -76
- data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -129
- data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -54
- data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -118
- data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -149
- data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -506
- data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -75
- data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -519
- data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -647
- data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -62
- data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -796
- data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -31
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -52
- data/vendor/kreuzberg/tests/api_tests.rs +966 -966
- data/vendor/kreuzberg/tests/archive_integration.rs +545 -545
- data/vendor/kreuzberg/tests/batch_orchestration.rs +556 -556
- data/vendor/kreuzberg/tests/batch_processing.rs +318 -318
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -421
- data/vendor/kreuzberg/tests/concurrency_stress.rs +533 -533
- data/vendor/kreuzberg/tests/config_features.rs +612 -612
- data/vendor/kreuzberg/tests/config_loading_tests.rs +416 -416
- data/vendor/kreuzberg/tests/core_integration.rs +510 -510
- data/vendor/kreuzberg/tests/csv_integration.rs +414 -414
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +500 -500
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -122
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -370
- data/vendor/kreuzberg/tests/email_integration.rs +327 -327
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -275
- data/vendor/kreuzberg/tests/error_handling.rs +402 -402
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -228
- data/vendor/kreuzberg/tests/format_integration.rs +164 -164
- data/vendor/kreuzberg/tests/helpers/mod.rs +142 -142
- data/vendor/kreuzberg/tests/html_table_test.rs +551 -551
- data/vendor/kreuzberg/tests/image_integration.rs +255 -255
- data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -139
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -639
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -704
- data/vendor/kreuzberg/tests/keywords_integration.rs +479 -479
- data/vendor/kreuzberg/tests/keywords_quality.rs +509 -509
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -496
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -490
- data/vendor/kreuzberg/tests/mime_detection.rs +429 -429
- data/vendor/kreuzberg/tests/ocr_configuration.rs +514 -514
- data/vendor/kreuzberg/tests/ocr_errors.rs +698 -698
- data/vendor/kreuzberg/tests/ocr_quality.rs +629 -629
- data/vendor/kreuzberg/tests/ocr_stress.rs +469 -469
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +674 -674
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -616
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -822
- data/vendor/kreuzberg/tests/pdf_integration.rs +45 -45
- data/vendor/kreuzberg/tests/pdfium_linking.rs +374 -374
- data/vendor/kreuzberg/tests/pipeline_integration.rs +1436 -1436
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +776 -776
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +560 -560
- data/vendor/kreuzberg/tests/plugin_system.rs +927 -927
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -783
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +587 -587
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +694 -694
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +775 -775
- data/vendor/kreuzberg/tests/security_validation.rs +416 -416
- data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -888
- data/vendor/kreuzberg/tests/test_fastembed.rs +631 -631
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1260 -1260
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +648 -648
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -87
- data/vendor/kreuzberg-ffi/Cargo.toml +1 -1
- data/vendor/kreuzberg-ffi/README.md +851 -851
- data/vendor/kreuzberg-ffi/build.rs +176 -176
- data/vendor/kreuzberg-ffi/cbindgen.toml +27 -27
- data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +12 -12
- data/vendor/kreuzberg-ffi/kreuzberg.h +1087 -1087
- data/vendor/kreuzberg-ffi/src/lib.rs +3616 -3616
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +247 -247
- data/vendor/kreuzberg-ffi/tests.disabled/README.md +48 -48
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +299 -299
- data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +346 -346
- data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +232 -232
- data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +470 -470
- data/vendor/kreuzberg-tesseract/.commitlintrc.json +13 -13
- data/vendor/kreuzberg-tesseract/.crate-ignore +2 -2
- data/vendor/kreuzberg-tesseract/Cargo.lock +2933 -2933
- data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
- data/vendor/kreuzberg-tesseract/LICENSE +22 -22
- data/vendor/kreuzberg-tesseract/README.md +399 -399
- data/vendor/kreuzberg-tesseract/build.rs +1354 -1354
- data/vendor/kreuzberg-tesseract/patches/README.md +71 -71
- data/vendor/kreuzberg-tesseract/patches/tesseract.diff +199 -199
- data/vendor/kreuzberg-tesseract/src/api.rs +1371 -1371
- data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +77 -77
- data/vendor/kreuzberg-tesseract/src/enums.rs +297 -297
- data/vendor/kreuzberg-tesseract/src/error.rs +81 -81
- data/vendor/kreuzberg-tesseract/src/lib.rs +145 -145
- data/vendor/kreuzberg-tesseract/src/monitor.rs +57 -57
- data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +197 -197
- data/vendor/kreuzberg-tesseract/src/page_iterator.rs +253 -253
- data/vendor/kreuzberg-tesseract/src/result_iterator.rs +286 -286
- data/vendor/kreuzberg-tesseract/src/result_renderer.rs +183 -183
- data/vendor/kreuzberg-tesseract/tests/integration_test.rs +211 -211
- data/vendor/rb-sys/.cargo_vcs_info.json +5 -5
- data/vendor/rb-sys/Cargo.lock +393 -393
- data/vendor/rb-sys/Cargo.toml +70 -70
- data/vendor/rb-sys/Cargo.toml.orig +57 -57
- data/vendor/rb-sys/LICENSE-APACHE +190 -190
- data/vendor/rb-sys/LICENSE-MIT +21 -21
- data/vendor/rb-sys/build/features.rs +111 -111
- data/vendor/rb-sys/build/main.rs +286 -286
- data/vendor/rb-sys/build/stable_api_config.rs +155 -155
- data/vendor/rb-sys/build/version.rs +50 -50
- data/vendor/rb-sys/readme.md +36 -36
- data/vendor/rb-sys/src/bindings.rs +21 -21
- data/vendor/rb-sys/src/hidden.rs +11 -11
- data/vendor/rb-sys/src/lib.rs +35 -35
- data/vendor/rb-sys/src/macros.rs +371 -371
- data/vendor/rb-sys/src/memory.rs +53 -53
- data/vendor/rb-sys/src/ruby_abi_version.rs +38 -38
- data/vendor/rb-sys/src/special_consts.rs +31 -31
- data/vendor/rb-sys/src/stable_api/compiled.c +179 -179
- data/vendor/rb-sys/src/stable_api/compiled.rs +257 -257
- data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +324 -324
- data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +332 -332
- data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +325 -325
- data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +323 -323
- data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +339 -339
- data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +339 -339
- data/vendor/rb-sys/src/stable_api.rs +260 -260
- data/vendor/rb-sys/src/symbol.rs +31 -31
- data/vendor/rb-sys/src/tracking_allocator.rs +330 -330
- data/vendor/rb-sys/src/utils.rs +89 -89
- data/vendor/rb-sys/src/value_type.rs +7 -7
- metadata +73 -4
- data/vendor/kreuzberg-ffi/kreuzberg-ffi-install.pc +0 -12
|
@@ -1,1055 +1,1055 @@
|
|
|
1
|
-
use serde::{Deserialize, Serialize};
|
|
2
|
-
use std::collections::HashMap;
|
|
3
|
-
|
|
4
|
-
#[cfg(feature = "pdf")]
|
|
5
|
-
use crate::pdf::metadata::PdfMetadata;
|
|
6
|
-
|
|
7
|
-
// ============================================================================
|
|
8
|
-
// ============================================================================
|
|
9
|
-
|
|
10
|
-
/// General extraction result used by the core extraction API.
|
|
11
|
-
///
|
|
12
|
-
/// This is the main result type returned by all extraction functions.
|
|
13
|
-
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
14
|
-
pub struct ExtractionResult {
|
|
15
|
-
pub content: String,
|
|
16
|
-
pub mime_type: String,
|
|
17
|
-
pub metadata: Metadata,
|
|
18
|
-
pub tables: Vec<Table>,
|
|
19
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
20
|
-
pub detected_languages: Option<Vec<String>>,
|
|
21
|
-
|
|
22
|
-
/// Text chunks when chunking is enabled.
|
|
23
|
-
///
|
|
24
|
-
/// When chunking configuration is provided, the content is split into
|
|
25
|
-
/// overlapping chunks for efficient processing. Each chunk contains the text,
|
|
26
|
-
/// optional embeddings (if enabled), and metadata about its position.
|
|
27
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
28
|
-
pub chunks: Option<Vec<Chunk>>,
|
|
29
|
-
|
|
30
|
-
/// Extracted images from the document.
|
|
31
|
-
///
|
|
32
|
-
/// When image extraction is enabled via `ImageExtractionConfig`, this field
|
|
33
|
-
/// contains all images found in the document with their raw data and metadata.
|
|
34
|
-
/// Each image may optionally contain a nested `ocr_result` if OCR was performed.
|
|
35
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
36
|
-
pub images: Option<Vec<ExtractedImage>>,
|
|
37
|
-
|
|
38
|
-
/// Per-page content when page extraction is enabled.
|
|
39
|
-
///
|
|
40
|
-
/// When page extraction is configured, the document is split into per-page content
|
|
41
|
-
/// with tables and images mapped to their respective pages.
|
|
42
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
43
|
-
pub pages: Option<Vec<PageContent>>,
|
|
44
|
-
}
|
|
45
|
-
|
|
46
|
-
/// Format-specific metadata (discriminated union).
|
|
47
|
-
///
|
|
48
|
-
/// Only one format type can exist per extraction result. This provides
|
|
49
|
-
/// type-safe, clean metadata without nested optionals.
|
|
50
|
-
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
51
|
-
#[serde(tag = "format_type", rename_all = "snake_case")]
|
|
52
|
-
pub enum FormatMetadata {
|
|
53
|
-
#[cfg(feature = "pdf")]
|
|
54
|
-
Pdf(PdfMetadata),
|
|
55
|
-
Excel(ExcelMetadata),
|
|
56
|
-
Email(EmailMetadata),
|
|
57
|
-
Pptx(PptxMetadata),
|
|
58
|
-
Archive(ArchiveMetadata),
|
|
59
|
-
Image(ImageMetadata),
|
|
60
|
-
Xml(XmlMetadata),
|
|
61
|
-
Text(TextMetadata),
|
|
62
|
-
Html(Box<HtmlMetadata>),
|
|
63
|
-
Ocr(OcrMetadata),
|
|
64
|
-
}
|
|
65
|
-
|
|
66
|
-
/// Extraction result metadata.
|
|
67
|
-
///
|
|
68
|
-
/// Contains common fields applicable to all formats, format-specific metadata
|
|
69
|
-
/// via a discriminated union, and additional custom fields from postprocessors.
|
|
70
|
-
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
|
|
71
|
-
pub struct Metadata {
|
|
72
|
-
/// Document title
|
|
73
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
74
|
-
pub title: Option<String>,
|
|
75
|
-
|
|
76
|
-
/// Document subject or description
|
|
77
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
78
|
-
pub subject: Option<String>,
|
|
79
|
-
|
|
80
|
-
/// Primary author(s) - always Vec for consistency
|
|
81
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
82
|
-
pub authors: Option<Vec<String>>,
|
|
83
|
-
|
|
84
|
-
/// Keywords/tags - always Vec for consistency
|
|
85
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
86
|
-
pub keywords: Option<Vec<String>>,
|
|
87
|
-
|
|
88
|
-
/// Primary language (ISO 639 code)
|
|
89
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
90
|
-
pub language: Option<String>,
|
|
91
|
-
|
|
92
|
-
/// Creation timestamp (ISO 8601 format)
|
|
93
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
94
|
-
pub created_at: Option<String>,
|
|
95
|
-
|
|
96
|
-
/// Last modification timestamp (ISO 8601 format)
|
|
97
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
98
|
-
pub modified_at: Option<String>,
|
|
99
|
-
|
|
100
|
-
/// User who created the document
|
|
101
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
102
|
-
pub created_by: Option<String>,
|
|
103
|
-
|
|
104
|
-
/// User who last modified the document
|
|
105
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
106
|
-
pub modified_by: Option<String>,
|
|
107
|
-
|
|
108
|
-
/// Page/slide/sheet structure with boundaries
|
|
109
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
110
|
-
pub pages: Option<PageStructure>,
|
|
111
|
-
|
|
112
|
-
/// Document date (DEPRECATED - use created_at/modified_at instead)
|
|
113
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
114
|
-
pub date: Option<String>,
|
|
115
|
-
|
|
116
|
-
/// Format-specific metadata (discriminated union)
|
|
117
|
-
///
|
|
118
|
-
/// Contains detailed metadata specific to the document format.
|
|
119
|
-
/// Serializes with a `format_type` discriminator field.
|
|
120
|
-
#[serde(flatten, skip_serializing_if = "Option::is_none")]
|
|
121
|
-
pub format: Option<FormatMetadata>,
|
|
122
|
-
|
|
123
|
-
/// Image preprocessing metadata (when OCR preprocessing was applied)
|
|
124
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
125
|
-
pub image_preprocessing: Option<ImagePreprocessingMetadata>,
|
|
126
|
-
|
|
127
|
-
/// JSON schema (for structured data extraction)
|
|
128
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
129
|
-
pub json_schema: Option<serde_json::Value>,
|
|
130
|
-
|
|
131
|
-
/// Error metadata (for batch operations)
|
|
132
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
133
|
-
pub error: Option<ErrorMetadata>,
|
|
134
|
-
|
|
135
|
-
/// Additional custom fields from postprocessors.
|
|
136
|
-
///
|
|
137
|
-
/// This flattened HashMap allows Python/TypeScript postprocessors to add
|
|
138
|
-
/// arbitrary fields (entity extraction, keyword extraction, etc.).
|
|
139
|
-
/// Fields are merged at the root level during serialization.
|
|
140
|
-
#[serde(flatten)]
|
|
141
|
-
pub additional: HashMap<String, serde_json::Value>,
|
|
142
|
-
}
|
|
143
|
-
|
|
144
|
-
/// Unified page structure for documents.
|
|
145
|
-
///
|
|
146
|
-
/// Supports different page types (PDF pages, PPTX slides, Excel sheets)
|
|
147
|
-
/// with character offset boundaries for chunk-to-page mapping.
|
|
148
|
-
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
149
|
-
pub struct PageStructure {
|
|
150
|
-
/// Total number of pages/slides/sheets
|
|
151
|
-
pub total_count: usize,
|
|
152
|
-
|
|
153
|
-
/// Type of paginated unit
|
|
154
|
-
pub unit_type: PageUnitType,
|
|
155
|
-
|
|
156
|
-
/// Character offset boundaries for each page
|
|
157
|
-
///
|
|
158
|
-
/// Maps character ranges in the extracted content to page numbers.
|
|
159
|
-
/// Used for chunk page range calculation.
|
|
160
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
161
|
-
pub boundaries: Option<Vec<PageBoundary>>,
|
|
162
|
-
|
|
163
|
-
/// Detailed per-page metadata (optional, only when needed)
|
|
164
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
165
|
-
pub pages: Option<Vec<PageInfo>>,
|
|
166
|
-
}
|
|
167
|
-
|
|
168
|
-
/// Type of paginated unit in a document.
|
|
169
|
-
///
|
|
170
|
-
/// Distinguishes between different types of "pages" (PDF pages, presentation slides, spreadsheet sheets).
|
|
171
|
-
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
|
172
|
-
#[serde(rename_all = "snake_case")]
|
|
173
|
-
pub enum PageUnitType {
|
|
174
|
-
/// Standard document pages (PDF, DOCX, images)
|
|
175
|
-
Page,
|
|
176
|
-
/// Presentation slides (PPTX, ODP)
|
|
177
|
-
Slide,
|
|
178
|
-
/// Spreadsheet sheets (XLSX, ODS)
|
|
179
|
-
Sheet,
|
|
180
|
-
}
|
|
181
|
-
|
|
182
|
-
/// Byte offset boundary for a page.
|
|
183
|
-
///
|
|
184
|
-
/// Tracks where a specific page's content starts and ends in the main content string,
|
|
185
|
-
/// enabling mapping from byte positions to page numbers. Offsets are guaranteed to be
|
|
186
|
-
/// at valid UTF-8 character boundaries when using standard String methods (push_str, push, etc.).
|
|
187
|
-
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
188
|
-
pub struct PageBoundary {
|
|
189
|
-
/// Byte offset where this page starts in the content string (UTF-8 valid boundary, inclusive)
|
|
190
|
-
pub byte_start: usize,
|
|
191
|
-
/// Byte offset where this page ends in the content string (UTF-8 valid boundary, exclusive)
|
|
192
|
-
pub byte_end: usize,
|
|
193
|
-
/// Page number (1-indexed)
|
|
194
|
-
pub page_number: usize,
|
|
195
|
-
}
|
|
196
|
-
|
|
197
|
-
/// Metadata for individual page/slide/sheet.
|
|
198
|
-
///
|
|
199
|
-
/// Captures per-page information including dimensions, content counts,
|
|
200
|
-
/// and visibility state (for presentations).
|
|
201
|
-
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
202
|
-
pub struct PageInfo {
|
|
203
|
-
/// Page number (1-indexed)
|
|
204
|
-
pub number: usize,
|
|
205
|
-
|
|
206
|
-
/// Page title (usually for presentations)
|
|
207
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
208
|
-
pub title: Option<String>,
|
|
209
|
-
|
|
210
|
-
/// Dimensions in points (PDF) or pixels (images): (width, height)
|
|
211
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
212
|
-
pub dimensions: Option<(f64, f64)>,
|
|
213
|
-
|
|
214
|
-
/// Number of images on this page
|
|
215
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
216
|
-
pub image_count: Option<usize>,
|
|
217
|
-
|
|
218
|
-
/// Number of tables on this page
|
|
219
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
220
|
-
pub table_count: Option<usize>,
|
|
221
|
-
|
|
222
|
-
/// Whether this page is hidden (e.g., in presentations)
|
|
223
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
224
|
-
pub hidden: Option<bool>,
|
|
225
|
-
}
|
|
226
|
-
|
|
227
|
-
/// Content for a single page/slide.
|
|
228
|
-
///
|
|
229
|
-
/// When page extraction is enabled, documents are split into per-page content
|
|
230
|
-
/// with associated tables and images mapped to each page.
|
|
231
|
-
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
232
|
-
pub struct PageContent {
|
|
233
|
-
/// Page number (1-indexed)
|
|
234
|
-
pub page_number: usize,
|
|
235
|
-
|
|
236
|
-
/// Text content for this page
|
|
237
|
-
pub content: String,
|
|
238
|
-
|
|
239
|
-
/// Tables found on this page
|
|
240
|
-
#[serde(skip_serializing_if = "Vec::is_empty", default)]
|
|
241
|
-
pub tables: Vec<Table>,
|
|
242
|
-
|
|
243
|
-
/// Images found on this page
|
|
244
|
-
#[serde(skip_serializing_if = "Vec::is_empty", default)]
|
|
245
|
-
pub images: Vec<ExtractedImage>,
|
|
246
|
-
}
|
|
247
|
-
|
|
248
|
-
/// Excel/spreadsheet metadata.
|
|
249
|
-
///
|
|
250
|
-
/// Contains information about sheets in Excel, LibreOffice Calc, and other
|
|
251
|
-
/// spreadsheet formats (.xlsx, .xls, .ods, etc.).
|
|
252
|
-
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
253
|
-
pub struct ExcelMetadata {
|
|
254
|
-
/// Total number of sheets in the workbook
|
|
255
|
-
pub sheet_count: usize,
|
|
256
|
-
/// Names of all sheets in order
|
|
257
|
-
pub sheet_names: Vec<String>,
|
|
258
|
-
}
|
|
259
|
-
|
|
260
|
-
/// Email metadata extracted from .eml and .msg files.
|
|
261
|
-
///
|
|
262
|
-
/// Includes sender/recipient information, message ID, and attachment list.
|
|
263
|
-
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
264
|
-
pub struct EmailMetadata {
|
|
265
|
-
/// Sender's email address
|
|
266
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
267
|
-
pub from_email: Option<String>,
|
|
268
|
-
|
|
269
|
-
/// Sender's display name
|
|
270
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
271
|
-
pub from_name: Option<String>,
|
|
272
|
-
|
|
273
|
-
/// Primary recipients
|
|
274
|
-
pub to_emails: Vec<String>,
|
|
275
|
-
/// CC recipients
|
|
276
|
-
pub cc_emails: Vec<String>,
|
|
277
|
-
/// BCC recipients
|
|
278
|
-
pub bcc_emails: Vec<String>,
|
|
279
|
-
|
|
280
|
-
/// Message-ID header value
|
|
281
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
282
|
-
pub message_id: Option<String>,
|
|
283
|
-
|
|
284
|
-
/// List of attachment filenames
|
|
285
|
-
pub attachments: Vec<String>,
|
|
286
|
-
}
|
|
287
|
-
|
|
288
|
-
/// Archive (ZIP/TAR/7Z) metadata.
|
|
289
|
-
///
|
|
290
|
-
/// Extracted from compressed archive files containing file lists and size information.
|
|
291
|
-
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
292
|
-
pub struct ArchiveMetadata {
|
|
293
|
-
/// Archive format ("ZIP", "TAR", "7Z", etc.)
|
|
294
|
-
pub format: String,
|
|
295
|
-
/// Total number of files in the archive
|
|
296
|
-
pub file_count: usize,
|
|
297
|
-
/// List of file paths within the archive
|
|
298
|
-
pub file_list: Vec<String>,
|
|
299
|
-
/// Total uncompressed size in bytes
|
|
300
|
-
pub total_size: usize,
|
|
301
|
-
|
|
302
|
-
/// Compressed size in bytes (if available)
|
|
303
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
304
|
-
pub compressed_size: Option<usize>,
|
|
305
|
-
}
|
|
306
|
-
|
|
307
|
-
/// Image metadata extracted from image files.
|
|
308
|
-
///
|
|
309
|
-
/// Includes dimensions, format, and EXIF data.
|
|
310
|
-
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
311
|
-
pub struct ImageMetadata {
|
|
312
|
-
/// Image width in pixels
|
|
313
|
-
pub width: u32,
|
|
314
|
-
/// Image height in pixels
|
|
315
|
-
pub height: u32,
|
|
316
|
-
/// Image format (e.g., "PNG", "JPEG", "TIFF")
|
|
317
|
-
pub format: String,
|
|
318
|
-
/// EXIF metadata tags
|
|
319
|
-
pub exif: HashMap<String, String>,
|
|
320
|
-
}
|
|
321
|
-
|
|
322
|
-
/// XML metadata extracted during XML parsing.
|
|
323
|
-
///
|
|
324
|
-
/// Provides statistics about XML document structure.
|
|
325
|
-
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
326
|
-
pub struct XmlMetadata {
|
|
327
|
-
/// Total number of XML elements processed
|
|
328
|
-
pub element_count: usize,
|
|
329
|
-
/// List of unique element tag names (sorted)
|
|
330
|
-
pub unique_elements: Vec<String>,
|
|
331
|
-
}
|
|
332
|
-
|
|
333
|
-
/// Text/Markdown metadata.
|
|
334
|
-
///
|
|
335
|
-
/// Extracted from plain text and Markdown files. Includes word counts and,
|
|
336
|
-
/// for Markdown, structural elements like headers and links.
|
|
337
|
-
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
338
|
-
pub struct TextMetadata {
|
|
339
|
-
/// Number of lines in the document
|
|
340
|
-
pub line_count: usize,
|
|
341
|
-
/// Number of words
|
|
342
|
-
pub word_count: usize,
|
|
343
|
-
/// Number of characters
|
|
344
|
-
pub character_count: usize,
|
|
345
|
-
|
|
346
|
-
/// Markdown headers (headings text only, for Markdown files)
|
|
347
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
348
|
-
pub headers: Option<Vec<String>>,
|
|
349
|
-
|
|
350
|
-
/// Markdown links as (text, url) tuples (for Markdown files)
|
|
351
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
352
|
-
pub links: Option<Vec<(String, String)>>,
|
|
353
|
-
|
|
354
|
-
/// Code blocks as (language, code) tuples (for Markdown files)
|
|
355
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
356
|
-
pub code_blocks: Option<Vec<(String, String)>>,
|
|
357
|
-
}
|
|
358
|
-
|
|
359
|
-
/// HTML metadata extracted from HTML documents.
|
|
360
|
-
///
|
|
361
|
-
/// Includes meta tags, Open Graph data, Twitter Card metadata, and link relations.
|
|
362
|
-
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
|
|
363
|
-
pub struct HtmlMetadata {
|
|
364
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
365
|
-
pub title: Option<String>,
|
|
366
|
-
|
|
367
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
368
|
-
pub description: Option<String>,
|
|
369
|
-
|
|
370
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
371
|
-
pub keywords: Option<String>,
|
|
372
|
-
|
|
373
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
374
|
-
pub author: Option<String>,
|
|
375
|
-
|
|
376
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
377
|
-
pub canonical: Option<String>,
|
|
378
|
-
|
|
379
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
380
|
-
pub base_href: Option<String>,
|
|
381
|
-
|
|
382
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
383
|
-
pub og_title: Option<String>,
|
|
384
|
-
|
|
385
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
386
|
-
pub og_description: Option<String>,
|
|
387
|
-
|
|
388
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
389
|
-
pub og_image: Option<String>,
|
|
390
|
-
|
|
391
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
392
|
-
pub og_url: Option<String>,
|
|
393
|
-
|
|
394
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
395
|
-
pub og_type: Option<String>,
|
|
396
|
-
|
|
397
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
398
|
-
pub og_site_name: Option<String>,
|
|
399
|
-
|
|
400
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
401
|
-
pub twitter_card: Option<String>,
|
|
402
|
-
|
|
403
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
404
|
-
pub twitter_title: Option<String>,
|
|
405
|
-
|
|
406
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
407
|
-
pub twitter_description: Option<String>,
|
|
408
|
-
|
|
409
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
410
|
-
pub twitter_image: Option<String>,
|
|
411
|
-
|
|
412
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
413
|
-
pub twitter_site: Option<String>,
|
|
414
|
-
|
|
415
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
416
|
-
pub twitter_creator: Option<String>,
|
|
417
|
-
|
|
418
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
419
|
-
pub link_author: Option<String>,
|
|
420
|
-
|
|
421
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
422
|
-
pub link_license: Option<String>,
|
|
423
|
-
|
|
424
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
425
|
-
pub link_alternate: Option<String>,
|
|
426
|
-
}
|
|
427
|
-
|
|
428
|
-
/// OCR processing metadata.
|
|
429
|
-
///
|
|
430
|
-
/// Captures information about OCR processing configuration and results.
|
|
431
|
-
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
432
|
-
pub struct OcrMetadata {
|
|
433
|
-
/// OCR language code(s) used
|
|
434
|
-
pub language: String,
|
|
435
|
-
/// Tesseract Page Segmentation Mode (PSM)
|
|
436
|
-
pub psm: i32,
|
|
437
|
-
/// Output format (e.g., "text", "hocr")
|
|
438
|
-
pub output_format: String,
|
|
439
|
-
/// Number of tables detected
|
|
440
|
-
pub table_count: usize,
|
|
441
|
-
|
|
442
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
443
|
-
pub table_rows: Option<usize>,
|
|
444
|
-
|
|
445
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
446
|
-
pub table_cols: Option<usize>,
|
|
447
|
-
}
|
|
448
|
-
|
|
449
|
-
/// Error metadata (for batch operations).
|
|
450
|
-
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
451
|
-
pub struct ErrorMetadata {
|
|
452
|
-
pub error_type: String,
|
|
453
|
-
pub message: String,
|
|
454
|
-
}
|
|
455
|
-
|
|
456
|
-
/// Extracted table structure.
|
|
457
|
-
///
|
|
458
|
-
/// Represents a table detected and extracted from a document (PDF, image, etc.).
|
|
459
|
-
/// Tables are converted to both structured cell data and Markdown format.
|
|
460
|
-
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
461
|
-
pub struct Table {
|
|
462
|
-
/// Table cells as a 2D vector (rows × columns)
|
|
463
|
-
pub cells: Vec<Vec<String>>,
|
|
464
|
-
/// Markdown representation of the table
|
|
465
|
-
pub markdown: String,
|
|
466
|
-
/// Page number where the table was found (1-indexed)
|
|
467
|
-
pub page_number: usize,
|
|
468
|
-
}
|
|
469
|
-
|
|
470
|
-
/// A text chunk with optional embedding and metadata.
|
|
471
|
-
///
|
|
472
|
-
/// Chunks are created when chunking is enabled in `ExtractionConfig`. Each chunk
|
|
473
|
-
/// contains the text content, optional embedding vector (if embedding generation
|
|
474
|
-
/// is configured), and metadata about its position in the document.
|
|
475
|
-
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
476
|
-
pub struct Chunk {
|
|
477
|
-
/// The text content of this chunk.
|
|
478
|
-
pub content: String,
|
|
479
|
-
|
|
480
|
-
/// Optional embedding vector for this chunk.
|
|
481
|
-
///
|
|
482
|
-
/// Only populated when `EmbeddingConfig` is provided in chunking configuration.
|
|
483
|
-
/// The dimensionality depends on the chosen embedding model.
|
|
484
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
485
|
-
pub embedding: Option<Vec<f32>>,
|
|
486
|
-
|
|
487
|
-
/// Metadata about this chunk's position and properties.
|
|
488
|
-
pub metadata: ChunkMetadata,
|
|
489
|
-
}
|
|
490
|
-
|
|
491
|
-
/// Metadata about a chunk's position in the original document.
|
|
492
|
-
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
493
|
-
pub struct ChunkMetadata {
|
|
494
|
-
/// Byte offset where this chunk starts in the original text (UTF-8 valid boundary).
|
|
495
|
-
pub byte_start: usize,
|
|
496
|
-
|
|
497
|
-
/// Byte offset where this chunk ends in the original text (UTF-8 valid boundary).
|
|
498
|
-
pub byte_end: usize,
|
|
499
|
-
|
|
500
|
-
/// Number of tokens in this chunk (if available).
|
|
501
|
-
///
|
|
502
|
-
/// This is calculated by the embedding model's tokenizer if embeddings are enabled.
|
|
503
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
504
|
-
pub token_count: Option<usize>,
|
|
505
|
-
|
|
506
|
-
/// Zero-based index of this chunk in the document.
|
|
507
|
-
pub chunk_index: usize,
|
|
508
|
-
|
|
509
|
-
/// Total number of chunks in the document.
|
|
510
|
-
pub total_chunks: usize,
|
|
511
|
-
|
|
512
|
-
/// First page number this chunk spans (1-indexed).
|
|
513
|
-
///
|
|
514
|
-
/// Only populated when page tracking is enabled in extraction configuration.
|
|
515
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
516
|
-
pub first_page: Option<usize>,
|
|
517
|
-
|
|
518
|
-
/// Last page number this chunk spans (1-indexed, equal to first_page for single-page chunks).
|
|
519
|
-
///
|
|
520
|
-
/// Only populated when page tracking is enabled in extraction configuration.
|
|
521
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
522
|
-
pub last_page: Option<usize>,
|
|
523
|
-
}
|
|
524
|
-
|
|
525
|
-
/// Extracted image from a document.
|
|
526
|
-
///
|
|
527
|
-
/// Contains raw image data, metadata, and optional nested OCR results.
|
|
528
|
-
/// Raw bytes allow cross-language compatibility - users can convert to
|
|
529
|
-
/// PIL.Image (Python), Sharp (Node.js), or other formats as needed.
|
|
530
|
-
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
531
|
-
pub struct ExtractedImage {
|
|
532
|
-
/// Raw image data (PNG, JPEG, WebP, etc. bytes)
|
|
533
|
-
pub data: Vec<u8>,
|
|
534
|
-
|
|
535
|
-
/// Image format (e.g., "jpeg", "png", "webp")
|
|
536
|
-
pub format: String,
|
|
537
|
-
|
|
538
|
-
/// Zero-indexed position of this image in the document/page
|
|
539
|
-
pub image_index: usize,
|
|
540
|
-
|
|
541
|
-
/// Page/slide number where image was found (1-indexed)
|
|
542
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
543
|
-
pub page_number: Option<usize>,
|
|
544
|
-
|
|
545
|
-
/// Image width in pixels
|
|
546
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
547
|
-
pub width: Option<u32>,
|
|
548
|
-
|
|
549
|
-
/// Image height in pixels
|
|
550
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
551
|
-
pub height: Option<u32>,
|
|
552
|
-
|
|
553
|
-
/// Colorspace information (e.g., "RGB", "CMYK", "Gray")
|
|
554
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
555
|
-
pub colorspace: Option<String>,
|
|
556
|
-
|
|
557
|
-
/// Bits per color component (e.g., 8, 16)
|
|
558
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
559
|
-
pub bits_per_component: Option<u32>,
|
|
560
|
-
|
|
561
|
-
/// Whether this image is a mask image
|
|
562
|
-
#[serde(default)]
|
|
563
|
-
pub is_mask: bool,
|
|
564
|
-
|
|
565
|
-
/// Optional description of the image
|
|
566
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
567
|
-
pub description: Option<String>,
|
|
568
|
-
|
|
569
|
-
/// Nested OCR extraction result (if image was OCRed)
|
|
570
|
-
///
|
|
571
|
-
/// When OCR is performed on this image, the result is embedded here
|
|
572
|
-
/// rather than in a separate collection, making the relationship explicit.
|
|
573
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
574
|
-
pub ocr_result: Option<Box<ExtractionResult>>,
|
|
575
|
-
}
|
|
576
|
-
|
|
577
|
-
/// Excel workbook representation.
|
|
578
|
-
///
|
|
579
|
-
/// Contains all sheets from an Excel file (.xlsx, .xls, etc.) with
|
|
580
|
-
/// extracted content and metadata.
|
|
581
|
-
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
582
|
-
pub struct ExcelWorkbook {
|
|
583
|
-
/// All sheets in the workbook
|
|
584
|
-
pub sheets: Vec<ExcelSheet>,
|
|
585
|
-
/// Workbook-level metadata (author, creation date, etc.)
|
|
586
|
-
pub metadata: HashMap<String, String>,
|
|
587
|
-
}
|
|
588
|
-
|
|
589
|
-
/// Single Excel worksheet.
|
|
590
|
-
///
|
|
591
|
-
/// Represents one sheet from an Excel workbook with its content
|
|
592
|
-
/// converted to Markdown format and dimensional statistics.
|
|
593
|
-
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
594
|
-
pub struct ExcelSheet {
|
|
595
|
-
/// Sheet name as it appears in Excel
|
|
596
|
-
pub name: String,
|
|
597
|
-
/// Sheet content converted to Markdown tables
|
|
598
|
-
pub markdown: String,
|
|
599
|
-
/// Number of rows
|
|
600
|
-
pub row_count: usize,
|
|
601
|
-
/// Number of columns
|
|
602
|
-
pub col_count: usize,
|
|
603
|
-
/// Total number of non-empty cells
|
|
604
|
-
pub cell_count: usize,
|
|
605
|
-
}
|
|
606
|
-
|
|
607
|
-
/// XML extraction result.
|
|
608
|
-
///
|
|
609
|
-
/// Contains extracted text content from XML files along with
|
|
610
|
-
/// structural statistics about the XML document.
|
|
611
|
-
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
612
|
-
pub struct XmlExtractionResult {
|
|
613
|
-
/// Extracted text content (XML structure filtered out)
|
|
614
|
-
pub content: String,
|
|
615
|
-
/// Total number of XML elements processed
|
|
616
|
-
pub element_count: usize,
|
|
617
|
-
/// List of unique element names found (sorted)
|
|
618
|
-
pub unique_elements: Vec<String>,
|
|
619
|
-
}
|
|
620
|
-
|
|
621
|
-
/// Plain text and Markdown extraction result.
|
|
622
|
-
///
|
|
623
|
-
/// Contains the extracted text along with statistics and,
|
|
624
|
-
/// for Markdown files, structural elements like headers and links.
|
|
625
|
-
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
626
|
-
pub struct TextExtractionResult {
|
|
627
|
-
/// Extracted text content
|
|
628
|
-
pub content: String,
|
|
629
|
-
/// Number of lines
|
|
630
|
-
pub line_count: usize,
|
|
631
|
-
/// Number of words
|
|
632
|
-
pub word_count: usize,
|
|
633
|
-
/// Number of characters
|
|
634
|
-
pub character_count: usize,
|
|
635
|
-
/// Markdown headers (text only, Markdown files only)
|
|
636
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
637
|
-
pub headers: Option<Vec<String>>,
|
|
638
|
-
/// Markdown links as (text, URL) tuples (Markdown files only)
|
|
639
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
640
|
-
pub links: Option<Vec<(String, String)>>,
|
|
641
|
-
/// Code blocks as (language, code) tuples (Markdown files only)
|
|
642
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
643
|
-
pub code_blocks: Option<Vec<(String, String)>>,
|
|
644
|
-
}
|
|
645
|
-
|
|
646
|
-
/// PowerPoint (PPTX) extraction result.
|
|
647
|
-
///
|
|
648
|
-
/// Contains extracted slide content, metadata, and embedded images/tables.
|
|
649
|
-
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
650
|
-
pub struct PptxExtractionResult {
|
|
651
|
-
/// Extracted text content from all slides
|
|
652
|
-
pub content: String,
|
|
653
|
-
/// Presentation metadata
|
|
654
|
-
pub metadata: PptxMetadata,
|
|
655
|
-
/// Total number of slides
|
|
656
|
-
pub slide_count: usize,
|
|
657
|
-
/// Total number of embedded images
|
|
658
|
-
pub image_count: usize,
|
|
659
|
-
/// Total number of tables
|
|
660
|
-
pub table_count: usize,
|
|
661
|
-
/// Extracted images from the presentation
|
|
662
|
-
pub images: Vec<ExtractedImage>,
|
|
663
|
-
/// Slide structure with boundaries (when page tracking is enabled)
|
|
664
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
665
|
-
pub page_structure: Option<PageStructure>,
|
|
666
|
-
/// Per-slide content (when page tracking is enabled)
|
|
667
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
668
|
-
pub page_contents: Option<Vec<PageContent>>,
|
|
669
|
-
}
|
|
670
|
-
|
|
671
|
-
/// PowerPoint presentation metadata.
|
|
672
|
-
///
|
|
673
|
-
/// Contains PPTX-specific metadata. Common fields like title, author, and description
|
|
674
|
-
/// are now in the base `Metadata` struct.
|
|
675
|
-
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
676
|
-
pub struct PptxMetadata {
|
|
677
|
-
/// List of fonts used in the presentation
|
|
678
|
-
#[serde(skip_serializing_if = "Vec::is_empty", default)]
|
|
679
|
-
pub fonts: Vec<String>,
|
|
680
|
-
}
|
|
681
|
-
|
|
682
|
-
/// Email extraction result.
|
|
683
|
-
///
|
|
684
|
-
/// Complete representation of an extracted email message (.eml or .msg)
|
|
685
|
-
/// including headers, body content, and attachments.
|
|
686
|
-
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
687
|
-
pub struct EmailExtractionResult {
|
|
688
|
-
/// Email subject line
|
|
689
|
-
pub subject: Option<String>,
|
|
690
|
-
/// Sender email address
|
|
691
|
-
pub from_email: Option<String>,
|
|
692
|
-
/// Primary recipient email addresses
|
|
693
|
-
pub to_emails: Vec<String>,
|
|
694
|
-
/// CC recipient email addresses
|
|
695
|
-
pub cc_emails: Vec<String>,
|
|
696
|
-
/// BCC recipient email addresses
|
|
697
|
-
pub bcc_emails: Vec<String>,
|
|
698
|
-
/// Email date/timestamp
|
|
699
|
-
pub date: Option<String>,
|
|
700
|
-
/// Message-ID header value
|
|
701
|
-
pub message_id: Option<String>,
|
|
702
|
-
/// Plain text version of the email body
|
|
703
|
-
pub plain_text: Option<String>,
|
|
704
|
-
/// HTML version of the email body
|
|
705
|
-
pub html_content: Option<String>,
|
|
706
|
-
/// Cleaned/processed text content
|
|
707
|
-
pub cleaned_text: String,
|
|
708
|
-
/// List of email attachments
|
|
709
|
-
pub attachments: Vec<EmailAttachment>,
|
|
710
|
-
/// Additional email headers and metadata
|
|
711
|
-
pub metadata: HashMap<String, String>,
|
|
712
|
-
}
|
|
713
|
-
|
|
714
|
-
/// Email attachment representation.
|
|
715
|
-
///
|
|
716
|
-
/// Contains metadata and optionally the content of an email attachment.
|
|
717
|
-
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
718
|
-
pub struct EmailAttachment {
|
|
719
|
-
/// Attachment name (from Content-Disposition header)
|
|
720
|
-
pub name: Option<String>,
|
|
721
|
-
/// Filename of the attachment
|
|
722
|
-
pub filename: Option<String>,
|
|
723
|
-
/// MIME type of the attachment
|
|
724
|
-
pub mime_type: Option<String>,
|
|
725
|
-
/// Size in bytes
|
|
726
|
-
pub size: Option<usize>,
|
|
727
|
-
/// Whether this attachment is an image
|
|
728
|
-
pub is_image: bool,
|
|
729
|
-
/// Attachment data (if extracted)
|
|
730
|
-
pub data: Option<Vec<u8>>,
|
|
731
|
-
}
|
|
732
|
-
|
|
733
|
-
/// OCR extraction result.
|
|
734
|
-
///
|
|
735
|
-
/// Result of performing OCR on an image or scanned document,
|
|
736
|
-
/// including recognized text and detected tables.
|
|
737
|
-
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
738
|
-
pub struct OcrExtractionResult {
|
|
739
|
-
/// Recognized text content
|
|
740
|
-
pub content: String,
|
|
741
|
-
/// Original MIME type of the processed image
|
|
742
|
-
pub mime_type: String,
|
|
743
|
-
/// OCR processing metadata (confidence scores, language, etc.)
|
|
744
|
-
pub metadata: HashMap<String, serde_json::Value>,
|
|
745
|
-
/// Tables detected and extracted via OCR
|
|
746
|
-
pub tables: Vec<OcrTable>,
|
|
747
|
-
}
|
|
748
|
-
|
|
749
|
-
/// Table detected via OCR.
|
|
750
|
-
///
|
|
751
|
-
/// Represents a table structure recognized during OCR processing.
|
|
752
|
-
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
753
|
-
pub struct OcrTable {
|
|
754
|
-
/// Table cells as a 2D vector (rows × columns)
|
|
755
|
-
pub cells: Vec<Vec<String>>,
|
|
756
|
-
/// Markdown representation of the table
|
|
757
|
-
pub markdown: String,
|
|
758
|
-
/// Page number where the table was found (1-indexed)
|
|
759
|
-
pub page_number: usize,
|
|
760
|
-
}
|
|
761
|
-
|
|
762
|
-
/// Image preprocessing configuration for OCR.
|
|
763
|
-
///
|
|
764
|
-
/// These settings control how images are preprocessed before OCR to improve
|
|
765
|
-
/// text recognition quality. Different preprocessing strategies work better
|
|
766
|
-
/// for different document types.
|
|
767
|
-
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
768
|
-
#[serde(default)]
|
|
769
|
-
pub struct ImagePreprocessingConfig {
|
|
770
|
-
/// Target DPI for the image (300 is standard, 600 for small text).
|
|
771
|
-
pub target_dpi: i32,
|
|
772
|
-
|
|
773
|
-
/// Auto-detect and correct image rotation.
|
|
774
|
-
pub auto_rotate: bool,
|
|
775
|
-
|
|
776
|
-
/// Correct skew (tilted images).
|
|
777
|
-
pub deskew: bool,
|
|
778
|
-
|
|
779
|
-
/// Remove noise from the image.
|
|
780
|
-
pub denoise: bool,
|
|
781
|
-
|
|
782
|
-
/// Enhance contrast for better text visibility.
|
|
783
|
-
pub contrast_enhance: bool,
|
|
784
|
-
|
|
785
|
-
/// Binarization method: "otsu", "sauvola", "adaptive".
|
|
786
|
-
pub binarization_method: String,
|
|
787
|
-
|
|
788
|
-
/// Invert colors (white text on black → black on white).
|
|
789
|
-
pub invert_colors: bool,
|
|
790
|
-
}
|
|
791
|
-
|
|
792
|
-
impl Default for ImagePreprocessingConfig {
|
|
793
|
-
fn default() -> Self {
|
|
794
|
-
Self {
|
|
795
|
-
target_dpi: 300,
|
|
796
|
-
auto_rotate: true,
|
|
797
|
-
deskew: true,
|
|
798
|
-
denoise: false,
|
|
799
|
-
contrast_enhance: false,
|
|
800
|
-
binarization_method: "otsu".to_string(),
|
|
801
|
-
invert_colors: false,
|
|
802
|
-
}
|
|
803
|
-
}
|
|
804
|
-
}
|
|
805
|
-
|
|
806
|
-
/// Tesseract OCR configuration.
|
|
807
|
-
///
|
|
808
|
-
/// Provides fine-grained control over Tesseract OCR engine parameters.
|
|
809
|
-
/// Most users can use the defaults, but these settings allow optimization
|
|
810
|
-
/// for specific document types (invoices, handwriting, etc.).
|
|
811
|
-
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
812
|
-
#[serde(default)]
|
|
813
|
-
pub struct TesseractConfig {
|
|
814
|
-
/// Language code (e.g., "eng", "deu", "fra")
|
|
815
|
-
pub language: String,
|
|
816
|
-
|
|
817
|
-
/// Page Segmentation Mode (0-13).
|
|
818
|
-
///
|
|
819
|
-
/// Common values:
|
|
820
|
-
/// - 3: Fully automatic page segmentation (default)
|
|
821
|
-
/// - 6: Assume a single uniform block of text
|
|
822
|
-
/// - 11: Sparse text with no particular order
|
|
823
|
-
pub psm: i32,
|
|
824
|
-
|
|
825
|
-
/// Output format ("text" or "markdown")
|
|
826
|
-
pub output_format: String,
|
|
827
|
-
|
|
828
|
-
/// OCR Engine Mode (0-3).
|
|
829
|
-
///
|
|
830
|
-
/// - 0: Legacy engine only
|
|
831
|
-
/// - 1: Neural nets (LSTM) only (usually best)
|
|
832
|
-
/// - 2: Legacy + LSTM
|
|
833
|
-
/// - 3: Default (based on what's available)
|
|
834
|
-
pub oem: i32,
|
|
835
|
-
|
|
836
|
-
/// Minimum confidence threshold (0.0-100.0).
|
|
837
|
-
///
|
|
838
|
-
/// Words with confidence below this threshold may be rejected or flagged.
|
|
839
|
-
pub min_confidence: f64,
|
|
840
|
-
|
|
841
|
-
/// Image preprocessing configuration.
|
|
842
|
-
///
|
|
843
|
-
/// Controls how images are preprocessed before OCR. Can significantly
|
|
844
|
-
/// improve quality for scanned documents or low-quality images.
|
|
845
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
846
|
-
pub preprocessing: Option<ImagePreprocessingConfig>,
|
|
847
|
-
|
|
848
|
-
/// Enable automatic table detection and reconstruction
|
|
849
|
-
pub enable_table_detection: bool,
|
|
850
|
-
|
|
851
|
-
/// Minimum confidence threshold for table detection (0.0-1.0)
|
|
852
|
-
pub table_min_confidence: f64,
|
|
853
|
-
|
|
854
|
-
/// Column threshold for table detection (pixels)
|
|
855
|
-
pub table_column_threshold: i32,
|
|
856
|
-
|
|
857
|
-
/// Row threshold ratio for table detection (0.0-1.0)
|
|
858
|
-
pub table_row_threshold_ratio: f64,
|
|
859
|
-
|
|
860
|
-
/// Enable OCR result caching
|
|
861
|
-
pub use_cache: bool,
|
|
862
|
-
|
|
863
|
-
/// Use pre-adapted templates for character classification
|
|
864
|
-
pub classify_use_pre_adapted_templates: bool,
|
|
865
|
-
|
|
866
|
-
/// Enable N-gram language model
|
|
867
|
-
pub language_model_ngram_on: bool,
|
|
868
|
-
|
|
869
|
-
/// Don't reject good words during block-level processing
|
|
870
|
-
pub tessedit_dont_blkrej_good_wds: bool,
|
|
871
|
-
|
|
872
|
-
/// Don't reject good words during row-level processing
|
|
873
|
-
pub tessedit_dont_rowrej_good_wds: bool,
|
|
874
|
-
|
|
875
|
-
/// Enable dictionary correction
|
|
876
|
-
pub tessedit_enable_dict_correction: bool,
|
|
877
|
-
|
|
878
|
-
/// Whitelist of allowed characters (empty = all allowed)
|
|
879
|
-
pub tessedit_char_whitelist: String,
|
|
880
|
-
|
|
881
|
-
/// Blacklist of forbidden characters (empty = none forbidden)
|
|
882
|
-
pub tessedit_char_blacklist: String,
|
|
883
|
-
|
|
884
|
-
/// Use primary language params model
|
|
885
|
-
pub tessedit_use_primary_params_model: bool,
|
|
886
|
-
|
|
887
|
-
/// Variable-width space detection
|
|
888
|
-
pub textord_space_size_is_variable: bool,
|
|
889
|
-
|
|
890
|
-
/// Use adaptive thresholding method
|
|
891
|
-
pub thresholding_method: bool,
|
|
892
|
-
}
|
|
893
|
-
|
|
894
|
-
impl Default for TesseractConfig {
|
|
895
|
-
fn default() -> Self {
|
|
896
|
-
Self {
|
|
897
|
-
language: "eng".to_string(),
|
|
898
|
-
psm: 3,
|
|
899
|
-
output_format: "markdown".to_string(),
|
|
900
|
-
oem: 3,
|
|
901
|
-
min_confidence: 0.0,
|
|
902
|
-
preprocessing: None,
|
|
903
|
-
enable_table_detection: true,
|
|
904
|
-
table_min_confidence: 0.0,
|
|
905
|
-
table_column_threshold: 50,
|
|
906
|
-
table_row_threshold_ratio: 0.5,
|
|
907
|
-
use_cache: true,
|
|
908
|
-
classify_use_pre_adapted_templates: true,
|
|
909
|
-
language_model_ngram_on: false,
|
|
910
|
-
tessedit_dont_blkrej_good_wds: true,
|
|
911
|
-
tessedit_dont_rowrej_good_wds: true,
|
|
912
|
-
tessedit_enable_dict_correction: true,
|
|
913
|
-
tessedit_char_whitelist: String::new(),
|
|
914
|
-
tessedit_char_blacklist: String::new(),
|
|
915
|
-
tessedit_use_primary_params_model: true,
|
|
916
|
-
textord_space_size_is_variable: true,
|
|
917
|
-
thresholding_method: false,
|
|
918
|
-
}
|
|
919
|
-
}
|
|
920
|
-
}
|
|
921
|
-
|
|
922
|
-
/// Image preprocessing metadata.
|
|
923
|
-
///
|
|
924
|
-
/// Tracks the transformations applied to an image during OCR preprocessing,
|
|
925
|
-
/// including DPI normalization, resizing, and resampling.
|
|
926
|
-
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
927
|
-
pub struct ImagePreprocessingMetadata {
|
|
928
|
-
/// Original image dimensions (width, height) in pixels
|
|
929
|
-
pub original_dimensions: (usize, usize),
|
|
930
|
-
/// Original image DPI (horizontal, vertical)
|
|
931
|
-
pub original_dpi: (f64, f64),
|
|
932
|
-
/// Target DPI from configuration
|
|
933
|
-
pub target_dpi: i32,
|
|
934
|
-
/// Scaling factor applied to the image
|
|
935
|
-
pub scale_factor: f64,
|
|
936
|
-
/// Whether DPI was auto-adjusted based on content
|
|
937
|
-
pub auto_adjusted: bool,
|
|
938
|
-
/// Final DPI after processing
|
|
939
|
-
pub final_dpi: i32,
|
|
940
|
-
/// New dimensions after resizing (if resized)
|
|
941
|
-
pub new_dimensions: Option<(usize, usize)>,
|
|
942
|
-
/// Resampling algorithm used ("LANCZOS3", "CATMULLROM", etc.)
|
|
943
|
-
pub resample_method: String,
|
|
944
|
-
/// Whether dimensions were clamped to max_image_dimension
|
|
945
|
-
pub dimension_clamped: bool,
|
|
946
|
-
/// Calculated optimal DPI (if auto_adjust_dpi enabled)
|
|
947
|
-
pub calculated_dpi: Option<i32>,
|
|
948
|
-
/// Whether resize was skipped (dimensions already optimal)
|
|
949
|
-
pub skipped_resize: bool,
|
|
950
|
-
/// Error message if resize failed
|
|
951
|
-
pub resize_error: Option<String>,
|
|
952
|
-
}
|
|
953
|
-
|
|
954
|
-
/// Image extraction configuration (internal use).
|
|
955
|
-
///
|
|
956
|
-
/// **Note:** This is an internal type used for image preprocessing.
|
|
957
|
-
/// For the main extraction configuration, see [`crate::core::config::ExtractionConfig`].
|
|
958
|
-
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
959
|
-
pub struct ExtractionConfig {
|
|
960
|
-
/// Target DPI for image normalization
|
|
961
|
-
pub target_dpi: i32,
|
|
962
|
-
/// Maximum image dimension (width or height)
|
|
963
|
-
pub max_image_dimension: i32,
|
|
964
|
-
/// Whether to auto-adjust DPI based on content
|
|
965
|
-
pub auto_adjust_dpi: bool,
|
|
966
|
-
/// Minimum DPI threshold
|
|
967
|
-
pub min_dpi: i32,
|
|
968
|
-
/// Maximum DPI threshold
|
|
969
|
-
pub max_dpi: i32,
|
|
970
|
-
}
|
|
971
|
-
|
|
972
|
-
impl Default for ExtractionConfig {
|
|
973
|
-
fn default() -> Self {
|
|
974
|
-
Self {
|
|
975
|
-
target_dpi: 300,
|
|
976
|
-
max_image_dimension: 4096,
|
|
977
|
-
auto_adjust_dpi: true,
|
|
978
|
-
min_dpi: 72,
|
|
979
|
-
max_dpi: 600,
|
|
980
|
-
}
|
|
981
|
-
}
|
|
982
|
-
}
|
|
983
|
-
|
|
984
|
-
/// Cache statistics.
|
|
985
|
-
///
|
|
986
|
-
/// Provides information about the extraction result cache,
|
|
987
|
-
/// including size, file count, and age distribution.
|
|
988
|
-
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
989
|
-
pub struct CacheStats {
|
|
990
|
-
/// Total number of cached files
|
|
991
|
-
pub total_files: usize,
|
|
992
|
-
/// Total cache size in megabytes
|
|
993
|
-
pub total_size_mb: f64,
|
|
994
|
-
/// Available disk space in megabytes
|
|
995
|
-
pub available_space_mb: f64,
|
|
996
|
-
/// Age of the oldest cached file in days
|
|
997
|
-
pub oldest_file_age_days: f64,
|
|
998
|
-
/// Age of the newest cached file in days
|
|
999
|
-
pub newest_file_age_days: f64,
|
|
1000
|
-
}
|
|
1001
|
-
|
|
1002
|
-
/// LibreOffice conversion result.
|
|
1003
|
-
///
|
|
1004
|
-
/// Result of converting a legacy office document (e.g., .doc, .ppt)
|
|
1005
|
-
/// to a modern format using LibreOffice.
|
|
1006
|
-
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
1007
|
-
pub struct LibreOfficeConversionResult {
|
|
1008
|
-
/// Converted file bytes
|
|
1009
|
-
pub converted_bytes: Vec<u8>,
|
|
1010
|
-
/// Original format identifier
|
|
1011
|
-
pub original_format: String,
|
|
1012
|
-
/// Target format identifier
|
|
1013
|
-
pub target_format: String,
|
|
1014
|
-
/// Target MIME type after conversion
|
|
1015
|
-
pub target_mime: String,
|
|
1016
|
-
}
|
|
1017
|
-
|
|
1018
|
-
#[cfg(test)]
|
|
1019
|
-
mod tests {
|
|
1020
|
-
use super::*;
|
|
1021
|
-
|
|
1022
|
-
#[test]
|
|
1023
|
-
fn test_metadata_serialization_with_format() {
|
|
1024
|
-
let mut metadata = Metadata {
|
|
1025
|
-
format: Some(FormatMetadata::Text(TextMetadata {
|
|
1026
|
-
line_count: 1,
|
|
1027
|
-
word_count: 2,
|
|
1028
|
-
character_count: 13,
|
|
1029
|
-
headers: None,
|
|
1030
|
-
links: None,
|
|
1031
|
-
code_blocks: None,
|
|
1032
|
-
})),
|
|
1033
|
-
..Default::default()
|
|
1034
|
-
};
|
|
1035
|
-
|
|
1036
|
-
metadata
|
|
1037
|
-
.additional
|
|
1038
|
-
.insert("quality_score".to_string(), serde_json::json!(1.0));
|
|
1039
|
-
|
|
1040
|
-
let json = serde_json::to_value(&metadata).unwrap();
|
|
1041
|
-
println!("Serialized metadata: {}", serde_json::to_string_pretty(&json).unwrap());
|
|
1042
|
-
|
|
1043
|
-
assert!(
|
|
1044
|
-
json.get("format_type").is_some(),
|
|
1045
|
-
"format_type should be present in serialized JSON"
|
|
1046
|
-
);
|
|
1047
|
-
assert_eq!(json.get("format_type").unwrap(), "text");
|
|
1048
|
-
|
|
1049
|
-
assert_eq!(json.get("line_count").unwrap(), 1);
|
|
1050
|
-
assert_eq!(json.get("word_count").unwrap(), 2);
|
|
1051
|
-
assert_eq!(json.get("character_count").unwrap(), 13);
|
|
1052
|
-
|
|
1053
|
-
assert_eq!(json.get("quality_score").unwrap(), 1.0);
|
|
1054
|
-
}
|
|
1055
|
-
}
|
|
1
|
+
use serde::{Deserialize, Serialize};
|
|
2
|
+
use std::collections::HashMap;
|
|
3
|
+
|
|
4
|
+
#[cfg(feature = "pdf")]
|
|
5
|
+
use crate::pdf::metadata::PdfMetadata;
|
|
6
|
+
|
|
7
|
+
// ============================================================================
|
|
8
|
+
// ============================================================================
|
|
9
|
+
|
|
10
|
+
/// General extraction result used by the core extraction API.
|
|
11
|
+
///
|
|
12
|
+
/// This is the main result type returned by all extraction functions.
|
|
13
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
14
|
+
pub struct ExtractionResult {
|
|
15
|
+
pub content: String,
|
|
16
|
+
pub mime_type: String,
|
|
17
|
+
pub metadata: Metadata,
|
|
18
|
+
pub tables: Vec<Table>,
|
|
19
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
20
|
+
pub detected_languages: Option<Vec<String>>,
|
|
21
|
+
|
|
22
|
+
/// Text chunks when chunking is enabled.
|
|
23
|
+
///
|
|
24
|
+
/// When chunking configuration is provided, the content is split into
|
|
25
|
+
/// overlapping chunks for efficient processing. Each chunk contains the text,
|
|
26
|
+
/// optional embeddings (if enabled), and metadata about its position.
|
|
27
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
28
|
+
pub chunks: Option<Vec<Chunk>>,
|
|
29
|
+
|
|
30
|
+
/// Extracted images from the document.
|
|
31
|
+
///
|
|
32
|
+
/// When image extraction is enabled via `ImageExtractionConfig`, this field
|
|
33
|
+
/// contains all images found in the document with their raw data and metadata.
|
|
34
|
+
/// Each image may optionally contain a nested `ocr_result` if OCR was performed.
|
|
35
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
36
|
+
pub images: Option<Vec<ExtractedImage>>,
|
|
37
|
+
|
|
38
|
+
/// Per-page content when page extraction is enabled.
|
|
39
|
+
///
|
|
40
|
+
/// When page extraction is configured, the document is split into per-page content
|
|
41
|
+
/// with tables and images mapped to their respective pages.
|
|
42
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
43
|
+
pub pages: Option<Vec<PageContent>>,
|
|
44
|
+
}
|
|
45
|
+
|
|
46
|
+
/// Format-specific metadata (discriminated union).
|
|
47
|
+
///
|
|
48
|
+
/// Only one format type can exist per extraction result. This provides
|
|
49
|
+
/// type-safe, clean metadata without nested optionals.
|
|
50
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
51
|
+
#[serde(tag = "format_type", rename_all = "snake_case")]
|
|
52
|
+
pub enum FormatMetadata {
|
|
53
|
+
#[cfg(feature = "pdf")]
|
|
54
|
+
Pdf(PdfMetadata),
|
|
55
|
+
Excel(ExcelMetadata),
|
|
56
|
+
Email(EmailMetadata),
|
|
57
|
+
Pptx(PptxMetadata),
|
|
58
|
+
Archive(ArchiveMetadata),
|
|
59
|
+
Image(ImageMetadata),
|
|
60
|
+
Xml(XmlMetadata),
|
|
61
|
+
Text(TextMetadata),
|
|
62
|
+
Html(Box<HtmlMetadata>),
|
|
63
|
+
Ocr(OcrMetadata),
|
|
64
|
+
}
|
|
65
|
+
|
|
66
|
+
/// Extraction result metadata.
|
|
67
|
+
///
|
|
68
|
+
/// Contains common fields applicable to all formats, format-specific metadata
|
|
69
|
+
/// via a discriminated union, and additional custom fields from postprocessors.
|
|
70
|
+
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
|
|
71
|
+
pub struct Metadata {
|
|
72
|
+
/// Document title
|
|
73
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
74
|
+
pub title: Option<String>,
|
|
75
|
+
|
|
76
|
+
/// Document subject or description
|
|
77
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
78
|
+
pub subject: Option<String>,
|
|
79
|
+
|
|
80
|
+
/// Primary author(s) - always Vec for consistency
|
|
81
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
82
|
+
pub authors: Option<Vec<String>>,
|
|
83
|
+
|
|
84
|
+
/// Keywords/tags - always Vec for consistency
|
|
85
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
86
|
+
pub keywords: Option<Vec<String>>,
|
|
87
|
+
|
|
88
|
+
/// Primary language (ISO 639 code)
|
|
89
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
90
|
+
pub language: Option<String>,
|
|
91
|
+
|
|
92
|
+
/// Creation timestamp (ISO 8601 format)
|
|
93
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
94
|
+
pub created_at: Option<String>,
|
|
95
|
+
|
|
96
|
+
/// Last modification timestamp (ISO 8601 format)
|
|
97
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
98
|
+
pub modified_at: Option<String>,
|
|
99
|
+
|
|
100
|
+
/// User who created the document
|
|
101
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
102
|
+
pub created_by: Option<String>,
|
|
103
|
+
|
|
104
|
+
/// User who last modified the document
|
|
105
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
106
|
+
pub modified_by: Option<String>,
|
|
107
|
+
|
|
108
|
+
/// Page/slide/sheet structure with boundaries
|
|
109
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
110
|
+
pub pages: Option<PageStructure>,
|
|
111
|
+
|
|
112
|
+
/// Document date (DEPRECATED - use created_at/modified_at instead)
|
|
113
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
114
|
+
pub date: Option<String>,
|
|
115
|
+
|
|
116
|
+
/// Format-specific metadata (discriminated union)
|
|
117
|
+
///
|
|
118
|
+
/// Contains detailed metadata specific to the document format.
|
|
119
|
+
/// Serializes with a `format_type` discriminator field.
|
|
120
|
+
#[serde(flatten, skip_serializing_if = "Option::is_none")]
|
|
121
|
+
pub format: Option<FormatMetadata>,
|
|
122
|
+
|
|
123
|
+
/// Image preprocessing metadata (when OCR preprocessing was applied)
|
|
124
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
125
|
+
pub image_preprocessing: Option<ImagePreprocessingMetadata>,
|
|
126
|
+
|
|
127
|
+
/// JSON schema (for structured data extraction)
|
|
128
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
129
|
+
pub json_schema: Option<serde_json::Value>,
|
|
130
|
+
|
|
131
|
+
/// Error metadata (for batch operations)
|
|
132
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
133
|
+
pub error: Option<ErrorMetadata>,
|
|
134
|
+
|
|
135
|
+
/// Additional custom fields from postprocessors.
|
|
136
|
+
///
|
|
137
|
+
/// This flattened HashMap allows Python/TypeScript postprocessors to add
|
|
138
|
+
/// arbitrary fields (entity extraction, keyword extraction, etc.).
|
|
139
|
+
/// Fields are merged at the root level during serialization.
|
|
140
|
+
#[serde(flatten)]
|
|
141
|
+
pub additional: HashMap<String, serde_json::Value>,
|
|
142
|
+
}
|
|
143
|
+
|
|
144
|
+
/// Unified page structure for documents.
|
|
145
|
+
///
|
|
146
|
+
/// Supports different page types (PDF pages, PPTX slides, Excel sheets)
|
|
147
|
+
/// with byte offset boundaries for chunk-to-page mapping.
|
|
148
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
149
|
+
pub struct PageStructure {
|
|
150
|
+
/// Total number of pages/slides/sheets
|
|
151
|
+
pub total_count: usize,
|
|
152
|
+
|
|
153
|
+
/// Type of paginated unit
|
|
154
|
+
pub unit_type: PageUnitType,
|
|
155
|
+
|
|
156
|
+
/// Byte offset boundaries for each page
|
|
157
|
+
///
|
|
158
|
+
/// Maps byte ranges in the extracted content to page numbers.
|
|
159
|
+
/// Used for chunk page range calculation.
|
|
160
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
161
|
+
pub boundaries: Option<Vec<PageBoundary>>,
|
|
162
|
+
|
|
163
|
+
/// Detailed per-page metadata (optional, only when needed)
|
|
164
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
165
|
+
pub pages: Option<Vec<PageInfo>>,
|
|
166
|
+
}
|
|
167
|
+
|
|
168
|
+
/// Type of paginated unit in a document.
|
|
169
|
+
///
|
|
170
|
+
/// Distinguishes between different types of "pages" (PDF pages, presentation slides, spreadsheet sheets).
|
|
171
|
+
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
|
172
|
+
#[serde(rename_all = "snake_case")]
|
|
173
|
+
pub enum PageUnitType {
|
|
174
|
+
/// Standard document pages (PDF, DOCX, images)
|
|
175
|
+
Page,
|
|
176
|
+
/// Presentation slides (PPTX, ODP)
|
|
177
|
+
Slide,
|
|
178
|
+
/// Spreadsheet sheets (XLSX, ODS)
|
|
179
|
+
Sheet,
|
|
180
|
+
}
|
|
181
|
+
|
|
182
|
+
/// Byte offset boundary for a page.
|
|
183
|
+
///
|
|
184
|
+
/// Tracks where a specific page's content starts and ends in the main content string,
|
|
185
|
+
/// enabling mapping from byte positions to page numbers. Offsets are guaranteed to be
|
|
186
|
+
/// at valid UTF-8 character boundaries when using standard String methods (push_str, push, etc.).
|
|
187
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
188
|
+
pub struct PageBoundary {
|
|
189
|
+
/// Byte offset where this page starts in the content string (UTF-8 valid boundary, inclusive)
|
|
190
|
+
pub byte_start: usize,
|
|
191
|
+
/// Byte offset where this page ends in the content string (UTF-8 valid boundary, exclusive)
|
|
192
|
+
pub byte_end: usize,
|
|
193
|
+
/// Page number (1-indexed)
|
|
194
|
+
pub page_number: usize,
|
|
195
|
+
}
|
|
196
|
+
|
|
197
|
+
/// Metadata for individual page/slide/sheet.
|
|
198
|
+
///
|
|
199
|
+
/// Captures per-page information including dimensions, content counts,
|
|
200
|
+
/// and visibility state (for presentations).
|
|
201
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
202
|
+
pub struct PageInfo {
|
|
203
|
+
/// Page number (1-indexed)
|
|
204
|
+
pub number: usize,
|
|
205
|
+
|
|
206
|
+
/// Page title (usually for presentations)
|
|
207
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
208
|
+
pub title: Option<String>,
|
|
209
|
+
|
|
210
|
+
/// Dimensions in points (PDF) or pixels (images): (width, height)
|
|
211
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
212
|
+
pub dimensions: Option<(f64, f64)>,
|
|
213
|
+
|
|
214
|
+
/// Number of images on this page
|
|
215
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
216
|
+
pub image_count: Option<usize>,
|
|
217
|
+
|
|
218
|
+
/// Number of tables on this page
|
|
219
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
220
|
+
pub table_count: Option<usize>,
|
|
221
|
+
|
|
222
|
+
/// Whether this page is hidden (e.g., in presentations)
|
|
223
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
224
|
+
pub hidden: Option<bool>,
|
|
225
|
+
}
|
|
226
|
+
|
|
227
|
+
/// Content for a single page/slide.
|
|
228
|
+
///
|
|
229
|
+
/// When page extraction is enabled, documents are split into per-page content
|
|
230
|
+
/// with associated tables and images mapped to each page.
|
|
231
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
232
|
+
pub struct PageContent {
|
|
233
|
+
/// Page number (1-indexed)
|
|
234
|
+
pub page_number: usize,
|
|
235
|
+
|
|
236
|
+
/// Text content for this page
|
|
237
|
+
pub content: String,
|
|
238
|
+
|
|
239
|
+
/// Tables found on this page
|
|
240
|
+
#[serde(skip_serializing_if = "Vec::is_empty", default)]
|
|
241
|
+
pub tables: Vec<Table>,
|
|
242
|
+
|
|
243
|
+
/// Images found on this page
|
|
244
|
+
#[serde(skip_serializing_if = "Vec::is_empty", default)]
|
|
245
|
+
pub images: Vec<ExtractedImage>,
|
|
246
|
+
}
|
|
247
|
+
|
|
248
|
+
/// Excel/spreadsheet metadata.
|
|
249
|
+
///
|
|
250
|
+
/// Contains information about sheets in Excel, LibreOffice Calc, and other
|
|
251
|
+
/// spreadsheet formats (.xlsx, .xls, .ods, etc.).
|
|
252
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
253
|
+
pub struct ExcelMetadata {
|
|
254
|
+
/// Total number of sheets in the workbook
|
|
255
|
+
pub sheet_count: usize,
|
|
256
|
+
/// Names of all sheets in order
|
|
257
|
+
pub sheet_names: Vec<String>,
|
|
258
|
+
}
|
|
259
|
+
|
|
260
|
+
/// Email metadata extracted from .eml and .msg files.
|
|
261
|
+
///
|
|
262
|
+
/// Includes sender/recipient information, message ID, and attachment list.
|
|
263
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
264
|
+
pub struct EmailMetadata {
|
|
265
|
+
/// Sender's email address
|
|
266
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
267
|
+
pub from_email: Option<String>,
|
|
268
|
+
|
|
269
|
+
/// Sender's display name
|
|
270
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
271
|
+
pub from_name: Option<String>,
|
|
272
|
+
|
|
273
|
+
/// Primary recipients
|
|
274
|
+
pub to_emails: Vec<String>,
|
|
275
|
+
/// CC recipients
|
|
276
|
+
pub cc_emails: Vec<String>,
|
|
277
|
+
/// BCC recipients
|
|
278
|
+
pub bcc_emails: Vec<String>,
|
|
279
|
+
|
|
280
|
+
/// Message-ID header value
|
|
281
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
282
|
+
pub message_id: Option<String>,
|
|
283
|
+
|
|
284
|
+
/// List of attachment filenames
|
|
285
|
+
pub attachments: Vec<String>,
|
|
286
|
+
}
|
|
287
|
+
|
|
288
|
+
/// Archive (ZIP/TAR/7Z) metadata.
|
|
289
|
+
///
|
|
290
|
+
/// Extracted from compressed archive files containing file lists and size information.
|
|
291
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
292
|
+
pub struct ArchiveMetadata {
|
|
293
|
+
/// Archive format ("ZIP", "TAR", "7Z", etc.)
|
|
294
|
+
pub format: String,
|
|
295
|
+
/// Total number of files in the archive
|
|
296
|
+
pub file_count: usize,
|
|
297
|
+
/// List of file paths within the archive
|
|
298
|
+
pub file_list: Vec<String>,
|
|
299
|
+
/// Total uncompressed size in bytes
|
|
300
|
+
pub total_size: usize,
|
|
301
|
+
|
|
302
|
+
/// Compressed size in bytes (if available)
|
|
303
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
304
|
+
pub compressed_size: Option<usize>,
|
|
305
|
+
}
|
|
306
|
+
|
|
307
|
+
/// Image metadata extracted from image files.
|
|
308
|
+
///
|
|
309
|
+
/// Includes dimensions, format, and EXIF data.
|
|
310
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
311
|
+
pub struct ImageMetadata {
|
|
312
|
+
/// Image width in pixels
|
|
313
|
+
pub width: u32,
|
|
314
|
+
/// Image height in pixels
|
|
315
|
+
pub height: u32,
|
|
316
|
+
/// Image format (e.g., "PNG", "JPEG", "TIFF")
|
|
317
|
+
pub format: String,
|
|
318
|
+
/// EXIF metadata tags
|
|
319
|
+
pub exif: HashMap<String, String>,
|
|
320
|
+
}
|
|
321
|
+
|
|
322
|
+
/// XML metadata extracted during XML parsing.
|
|
323
|
+
///
|
|
324
|
+
/// Provides statistics about XML document structure.
|
|
325
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
326
|
+
pub struct XmlMetadata {
|
|
327
|
+
/// Total number of XML elements processed
|
|
328
|
+
pub element_count: usize,
|
|
329
|
+
/// List of unique element tag names (sorted)
|
|
330
|
+
pub unique_elements: Vec<String>,
|
|
331
|
+
}
|
|
332
|
+
|
|
333
|
+
/// Text/Markdown metadata.
|
|
334
|
+
///
|
|
335
|
+
/// Extracted from plain text and Markdown files. Includes word counts and,
|
|
336
|
+
/// for Markdown, structural elements like headers and links.
|
|
337
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
338
|
+
pub struct TextMetadata {
|
|
339
|
+
/// Number of lines in the document
|
|
340
|
+
pub line_count: usize,
|
|
341
|
+
/// Number of words
|
|
342
|
+
pub word_count: usize,
|
|
343
|
+
/// Number of characters
|
|
344
|
+
pub character_count: usize,
|
|
345
|
+
|
|
346
|
+
/// Markdown headers (headings text only, for Markdown files)
|
|
347
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
348
|
+
pub headers: Option<Vec<String>>,
|
|
349
|
+
|
|
350
|
+
/// Markdown links as (text, url) tuples (for Markdown files)
|
|
351
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
352
|
+
pub links: Option<Vec<(String, String)>>,
|
|
353
|
+
|
|
354
|
+
/// Code blocks as (language, code) tuples (for Markdown files)
|
|
355
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
356
|
+
pub code_blocks: Option<Vec<(String, String)>>,
|
|
357
|
+
}
|
|
358
|
+
|
|
359
|
+
/// HTML metadata extracted from HTML documents.
|
|
360
|
+
///
|
|
361
|
+
/// Includes meta tags, Open Graph data, Twitter Card metadata, and link relations.
|
|
362
|
+
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
|
|
363
|
+
pub struct HtmlMetadata {
|
|
364
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
365
|
+
pub title: Option<String>,
|
|
366
|
+
|
|
367
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
368
|
+
pub description: Option<String>,
|
|
369
|
+
|
|
370
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
371
|
+
pub keywords: Option<String>,
|
|
372
|
+
|
|
373
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
374
|
+
pub author: Option<String>,
|
|
375
|
+
|
|
376
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
377
|
+
pub canonical: Option<String>,
|
|
378
|
+
|
|
379
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
380
|
+
pub base_href: Option<String>,
|
|
381
|
+
|
|
382
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
383
|
+
pub og_title: Option<String>,
|
|
384
|
+
|
|
385
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
386
|
+
pub og_description: Option<String>,
|
|
387
|
+
|
|
388
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
389
|
+
pub og_image: Option<String>,
|
|
390
|
+
|
|
391
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
392
|
+
pub og_url: Option<String>,
|
|
393
|
+
|
|
394
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
395
|
+
pub og_type: Option<String>,
|
|
396
|
+
|
|
397
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
398
|
+
pub og_site_name: Option<String>,
|
|
399
|
+
|
|
400
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
401
|
+
pub twitter_card: Option<String>,
|
|
402
|
+
|
|
403
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
404
|
+
pub twitter_title: Option<String>,
|
|
405
|
+
|
|
406
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
407
|
+
pub twitter_description: Option<String>,
|
|
408
|
+
|
|
409
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
410
|
+
pub twitter_image: Option<String>,
|
|
411
|
+
|
|
412
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
413
|
+
pub twitter_site: Option<String>,
|
|
414
|
+
|
|
415
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
416
|
+
pub twitter_creator: Option<String>,
|
|
417
|
+
|
|
418
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
419
|
+
pub link_author: Option<String>,
|
|
420
|
+
|
|
421
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
422
|
+
pub link_license: Option<String>,
|
|
423
|
+
|
|
424
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
425
|
+
pub link_alternate: Option<String>,
|
|
426
|
+
}
|
|
427
|
+
|
|
428
|
+
/// OCR processing metadata.
|
|
429
|
+
///
|
|
430
|
+
/// Captures information about OCR processing configuration and results.
|
|
431
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
432
|
+
pub struct OcrMetadata {
|
|
433
|
+
/// OCR language code(s) used
|
|
434
|
+
pub language: String,
|
|
435
|
+
/// Tesseract Page Segmentation Mode (PSM)
|
|
436
|
+
pub psm: i32,
|
|
437
|
+
/// Output format (e.g., "text", "hocr")
|
|
438
|
+
pub output_format: String,
|
|
439
|
+
/// Number of tables detected
|
|
440
|
+
pub table_count: usize,
|
|
441
|
+
|
|
442
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
443
|
+
pub table_rows: Option<usize>,
|
|
444
|
+
|
|
445
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
446
|
+
pub table_cols: Option<usize>,
|
|
447
|
+
}
|
|
448
|
+
|
|
449
|
+
/// Error metadata (for batch operations).
|
|
450
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
451
|
+
pub struct ErrorMetadata {
|
|
452
|
+
pub error_type: String,
|
|
453
|
+
pub message: String,
|
|
454
|
+
}
|
|
455
|
+
|
|
456
|
+
/// Extracted table structure.
|
|
457
|
+
///
|
|
458
|
+
/// Represents a table detected and extracted from a document (PDF, image, etc.).
|
|
459
|
+
/// Tables are converted to both structured cell data and Markdown format.
|
|
460
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
461
|
+
pub struct Table {
|
|
462
|
+
/// Table cells as a 2D vector (rows × columns)
|
|
463
|
+
pub cells: Vec<Vec<String>>,
|
|
464
|
+
/// Markdown representation of the table
|
|
465
|
+
pub markdown: String,
|
|
466
|
+
/// Page number where the table was found (1-indexed)
|
|
467
|
+
pub page_number: usize,
|
|
468
|
+
}
|
|
469
|
+
|
|
470
|
+
/// A text chunk with optional embedding and metadata.
|
|
471
|
+
///
|
|
472
|
+
/// Chunks are created when chunking is enabled in `ExtractionConfig`. Each chunk
|
|
473
|
+
/// contains the text content, optional embedding vector (if embedding generation
|
|
474
|
+
/// is configured), and metadata about its position in the document.
|
|
475
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
476
|
+
pub struct Chunk {
|
|
477
|
+
/// The text content of this chunk.
|
|
478
|
+
pub content: String,
|
|
479
|
+
|
|
480
|
+
/// Optional embedding vector for this chunk.
|
|
481
|
+
///
|
|
482
|
+
/// Only populated when `EmbeddingConfig` is provided in chunking configuration.
|
|
483
|
+
/// The dimensionality depends on the chosen embedding model.
|
|
484
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
485
|
+
pub embedding: Option<Vec<f32>>,
|
|
486
|
+
|
|
487
|
+
/// Metadata about this chunk's position and properties.
|
|
488
|
+
pub metadata: ChunkMetadata,
|
|
489
|
+
}
|
|
490
|
+
|
|
491
|
+
/// Metadata about a chunk's position in the original document.
|
|
492
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
493
|
+
pub struct ChunkMetadata {
|
|
494
|
+
/// Byte offset where this chunk starts in the original text (UTF-8 valid boundary).
|
|
495
|
+
pub byte_start: usize,
|
|
496
|
+
|
|
497
|
+
/// Byte offset where this chunk ends in the original text (UTF-8 valid boundary).
|
|
498
|
+
pub byte_end: usize,
|
|
499
|
+
|
|
500
|
+
/// Number of tokens in this chunk (if available).
|
|
501
|
+
///
|
|
502
|
+
/// This is calculated by the embedding model's tokenizer if embeddings are enabled.
|
|
503
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
504
|
+
pub token_count: Option<usize>,
|
|
505
|
+
|
|
506
|
+
/// Zero-based index of this chunk in the document.
|
|
507
|
+
pub chunk_index: usize,
|
|
508
|
+
|
|
509
|
+
/// Total number of chunks in the document.
|
|
510
|
+
pub total_chunks: usize,
|
|
511
|
+
|
|
512
|
+
/// First page number this chunk spans (1-indexed).
|
|
513
|
+
///
|
|
514
|
+
/// Only populated when page tracking is enabled in extraction configuration.
|
|
515
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
516
|
+
pub first_page: Option<usize>,
|
|
517
|
+
|
|
518
|
+
/// Last page number this chunk spans (1-indexed, equal to first_page for single-page chunks).
|
|
519
|
+
///
|
|
520
|
+
/// Only populated when page tracking is enabled in extraction configuration.
|
|
521
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
522
|
+
pub last_page: Option<usize>,
|
|
523
|
+
}
|
|
524
|
+
|
|
525
|
+
/// Extracted image from a document.
|
|
526
|
+
///
|
|
527
|
+
/// Contains raw image data, metadata, and optional nested OCR results.
|
|
528
|
+
/// Raw bytes allow cross-language compatibility - users can convert to
|
|
529
|
+
/// PIL.Image (Python), Sharp (Node.js), or other formats as needed.
|
|
530
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
531
|
+
pub struct ExtractedImage {
|
|
532
|
+
/// Raw image data (PNG, JPEG, WebP, etc. bytes)
|
|
533
|
+
pub data: Vec<u8>,
|
|
534
|
+
|
|
535
|
+
/// Image format (e.g., "jpeg", "png", "webp")
|
|
536
|
+
pub format: String,
|
|
537
|
+
|
|
538
|
+
/// Zero-indexed position of this image in the document/page
|
|
539
|
+
pub image_index: usize,
|
|
540
|
+
|
|
541
|
+
/// Page/slide number where image was found (1-indexed)
|
|
542
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
543
|
+
pub page_number: Option<usize>,
|
|
544
|
+
|
|
545
|
+
/// Image width in pixels
|
|
546
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
547
|
+
pub width: Option<u32>,
|
|
548
|
+
|
|
549
|
+
/// Image height in pixels
|
|
550
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
551
|
+
pub height: Option<u32>,
|
|
552
|
+
|
|
553
|
+
/// Colorspace information (e.g., "RGB", "CMYK", "Gray")
|
|
554
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
555
|
+
pub colorspace: Option<String>,
|
|
556
|
+
|
|
557
|
+
/// Bits per color component (e.g., 8, 16)
|
|
558
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
559
|
+
pub bits_per_component: Option<u32>,
|
|
560
|
+
|
|
561
|
+
/// Whether this image is a mask image
|
|
562
|
+
#[serde(default)]
|
|
563
|
+
pub is_mask: bool,
|
|
564
|
+
|
|
565
|
+
/// Optional description of the image
|
|
566
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
567
|
+
pub description: Option<String>,
|
|
568
|
+
|
|
569
|
+
/// Nested OCR extraction result (if image was OCRed)
|
|
570
|
+
///
|
|
571
|
+
/// When OCR is performed on this image, the result is embedded here
|
|
572
|
+
/// rather than in a separate collection, making the relationship explicit.
|
|
573
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
574
|
+
pub ocr_result: Option<Box<ExtractionResult>>,
|
|
575
|
+
}
|
|
576
|
+
|
|
577
|
+
/// Excel workbook representation.
|
|
578
|
+
///
|
|
579
|
+
/// Contains all sheets from an Excel file (.xlsx, .xls, etc.) with
|
|
580
|
+
/// extracted content and metadata.
|
|
581
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
582
|
+
pub struct ExcelWorkbook {
|
|
583
|
+
/// All sheets in the workbook
|
|
584
|
+
pub sheets: Vec<ExcelSheet>,
|
|
585
|
+
/// Workbook-level metadata (author, creation date, etc.)
|
|
586
|
+
pub metadata: HashMap<String, String>,
|
|
587
|
+
}
|
|
588
|
+
|
|
589
|
+
/// Single Excel worksheet.
|
|
590
|
+
///
|
|
591
|
+
/// Represents one sheet from an Excel workbook with its content
|
|
592
|
+
/// converted to Markdown format and dimensional statistics.
|
|
593
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
594
|
+
pub struct ExcelSheet {
|
|
595
|
+
/// Sheet name as it appears in Excel
|
|
596
|
+
pub name: String,
|
|
597
|
+
/// Sheet content converted to Markdown tables
|
|
598
|
+
pub markdown: String,
|
|
599
|
+
/// Number of rows
|
|
600
|
+
pub row_count: usize,
|
|
601
|
+
/// Number of columns
|
|
602
|
+
pub col_count: usize,
|
|
603
|
+
/// Total number of non-empty cells
|
|
604
|
+
pub cell_count: usize,
|
|
605
|
+
}
|
|
606
|
+
|
|
607
|
+
/// XML extraction result.
|
|
608
|
+
///
|
|
609
|
+
/// Contains extracted text content from XML files along with
|
|
610
|
+
/// structural statistics about the XML document.
|
|
611
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
612
|
+
pub struct XmlExtractionResult {
|
|
613
|
+
/// Extracted text content (XML structure filtered out)
|
|
614
|
+
pub content: String,
|
|
615
|
+
/// Total number of XML elements processed
|
|
616
|
+
pub element_count: usize,
|
|
617
|
+
/// List of unique element names found (sorted)
|
|
618
|
+
pub unique_elements: Vec<String>,
|
|
619
|
+
}
|
|
620
|
+
|
|
621
|
+
/// Plain text and Markdown extraction result.
|
|
622
|
+
///
|
|
623
|
+
/// Contains the extracted text along with statistics and,
|
|
624
|
+
/// for Markdown files, structural elements like headers and links.
|
|
625
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
626
|
+
pub struct TextExtractionResult {
|
|
627
|
+
/// Extracted text content
|
|
628
|
+
pub content: String,
|
|
629
|
+
/// Number of lines
|
|
630
|
+
pub line_count: usize,
|
|
631
|
+
/// Number of words
|
|
632
|
+
pub word_count: usize,
|
|
633
|
+
/// Number of characters
|
|
634
|
+
pub character_count: usize,
|
|
635
|
+
/// Markdown headers (text only, Markdown files only)
|
|
636
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
637
|
+
pub headers: Option<Vec<String>>,
|
|
638
|
+
/// Markdown links as (text, URL) tuples (Markdown files only)
|
|
639
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
640
|
+
pub links: Option<Vec<(String, String)>>,
|
|
641
|
+
/// Code blocks as (language, code) tuples (Markdown files only)
|
|
642
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
643
|
+
pub code_blocks: Option<Vec<(String, String)>>,
|
|
644
|
+
}
|
|
645
|
+
|
|
646
|
+
/// PowerPoint (PPTX) extraction result.
|
|
647
|
+
///
|
|
648
|
+
/// Contains extracted slide content, metadata, and embedded images/tables.
|
|
649
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
650
|
+
pub struct PptxExtractionResult {
|
|
651
|
+
/// Extracted text content from all slides
|
|
652
|
+
pub content: String,
|
|
653
|
+
/// Presentation metadata
|
|
654
|
+
pub metadata: PptxMetadata,
|
|
655
|
+
/// Total number of slides
|
|
656
|
+
pub slide_count: usize,
|
|
657
|
+
/// Total number of embedded images
|
|
658
|
+
pub image_count: usize,
|
|
659
|
+
/// Total number of tables
|
|
660
|
+
pub table_count: usize,
|
|
661
|
+
/// Extracted images from the presentation
|
|
662
|
+
pub images: Vec<ExtractedImage>,
|
|
663
|
+
/// Slide structure with boundaries (when page tracking is enabled)
|
|
664
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
665
|
+
pub page_structure: Option<PageStructure>,
|
|
666
|
+
/// Per-slide content (when page tracking is enabled)
|
|
667
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
668
|
+
pub page_contents: Option<Vec<PageContent>>,
|
|
669
|
+
}
|
|
670
|
+
|
|
671
|
+
/// PowerPoint presentation metadata.
|
|
672
|
+
///
|
|
673
|
+
/// Contains PPTX-specific metadata. Common fields like title, author, and description
|
|
674
|
+
/// are now in the base `Metadata` struct.
|
|
675
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
676
|
+
pub struct PptxMetadata {
|
|
677
|
+
/// List of fonts used in the presentation
|
|
678
|
+
#[serde(skip_serializing_if = "Vec::is_empty", default)]
|
|
679
|
+
pub fonts: Vec<String>,
|
|
680
|
+
}
|
|
681
|
+
|
|
682
|
+
/// Email extraction result.
|
|
683
|
+
///
|
|
684
|
+
/// Complete representation of an extracted email message (.eml or .msg)
|
|
685
|
+
/// including headers, body content, and attachments.
|
|
686
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
687
|
+
pub struct EmailExtractionResult {
|
|
688
|
+
/// Email subject line
|
|
689
|
+
pub subject: Option<String>,
|
|
690
|
+
/// Sender email address
|
|
691
|
+
pub from_email: Option<String>,
|
|
692
|
+
/// Primary recipient email addresses
|
|
693
|
+
pub to_emails: Vec<String>,
|
|
694
|
+
/// CC recipient email addresses
|
|
695
|
+
pub cc_emails: Vec<String>,
|
|
696
|
+
/// BCC recipient email addresses
|
|
697
|
+
pub bcc_emails: Vec<String>,
|
|
698
|
+
/// Email date/timestamp
|
|
699
|
+
pub date: Option<String>,
|
|
700
|
+
/// Message-ID header value
|
|
701
|
+
pub message_id: Option<String>,
|
|
702
|
+
/// Plain text version of the email body
|
|
703
|
+
pub plain_text: Option<String>,
|
|
704
|
+
/// HTML version of the email body
|
|
705
|
+
pub html_content: Option<String>,
|
|
706
|
+
/// Cleaned/processed text content
|
|
707
|
+
pub cleaned_text: String,
|
|
708
|
+
/// List of email attachments
|
|
709
|
+
pub attachments: Vec<EmailAttachment>,
|
|
710
|
+
/// Additional email headers and metadata
|
|
711
|
+
pub metadata: HashMap<String, String>,
|
|
712
|
+
}
|
|
713
|
+
|
|
714
|
+
/// Email attachment representation.
|
|
715
|
+
///
|
|
716
|
+
/// Contains metadata and optionally the content of an email attachment.
|
|
717
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
718
|
+
pub struct EmailAttachment {
|
|
719
|
+
/// Attachment name (from Content-Disposition header)
|
|
720
|
+
pub name: Option<String>,
|
|
721
|
+
/// Filename of the attachment
|
|
722
|
+
pub filename: Option<String>,
|
|
723
|
+
/// MIME type of the attachment
|
|
724
|
+
pub mime_type: Option<String>,
|
|
725
|
+
/// Size in bytes
|
|
726
|
+
pub size: Option<usize>,
|
|
727
|
+
/// Whether this attachment is an image
|
|
728
|
+
pub is_image: bool,
|
|
729
|
+
/// Attachment data (if extracted)
|
|
730
|
+
pub data: Option<Vec<u8>>,
|
|
731
|
+
}
|
|
732
|
+
|
|
733
|
+
/// OCR extraction result.
|
|
734
|
+
///
|
|
735
|
+
/// Result of performing OCR on an image or scanned document,
|
|
736
|
+
/// including recognized text and detected tables.
|
|
737
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
738
|
+
pub struct OcrExtractionResult {
|
|
739
|
+
/// Recognized text content
|
|
740
|
+
pub content: String,
|
|
741
|
+
/// Original MIME type of the processed image
|
|
742
|
+
pub mime_type: String,
|
|
743
|
+
/// OCR processing metadata (confidence scores, language, etc.)
|
|
744
|
+
pub metadata: HashMap<String, serde_json::Value>,
|
|
745
|
+
/// Tables detected and extracted via OCR
|
|
746
|
+
pub tables: Vec<OcrTable>,
|
|
747
|
+
}
|
|
748
|
+
|
|
749
|
+
/// Table detected via OCR.
|
|
750
|
+
///
|
|
751
|
+
/// Represents a table structure recognized during OCR processing.
|
|
752
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
753
|
+
pub struct OcrTable {
|
|
754
|
+
/// Table cells as a 2D vector (rows × columns)
|
|
755
|
+
pub cells: Vec<Vec<String>>,
|
|
756
|
+
/// Markdown representation of the table
|
|
757
|
+
pub markdown: String,
|
|
758
|
+
/// Page number where the table was found (1-indexed)
|
|
759
|
+
pub page_number: usize,
|
|
760
|
+
}
|
|
761
|
+
|
|
762
|
+
/// Image preprocessing configuration for OCR.
|
|
763
|
+
///
|
|
764
|
+
/// These settings control how images are preprocessed before OCR to improve
|
|
765
|
+
/// text recognition quality. Different preprocessing strategies work better
|
|
766
|
+
/// for different document types.
|
|
767
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
768
|
+
#[serde(default)]
|
|
769
|
+
pub struct ImagePreprocessingConfig {
|
|
770
|
+
/// Target DPI for the image (300 is standard, 600 for small text).
|
|
771
|
+
pub target_dpi: i32,
|
|
772
|
+
|
|
773
|
+
/// Auto-detect and correct image rotation.
|
|
774
|
+
pub auto_rotate: bool,
|
|
775
|
+
|
|
776
|
+
/// Correct skew (tilted images).
|
|
777
|
+
pub deskew: bool,
|
|
778
|
+
|
|
779
|
+
/// Remove noise from the image.
|
|
780
|
+
pub denoise: bool,
|
|
781
|
+
|
|
782
|
+
/// Enhance contrast for better text visibility.
|
|
783
|
+
pub contrast_enhance: bool,
|
|
784
|
+
|
|
785
|
+
/// Binarization method: "otsu", "sauvola", "adaptive".
|
|
786
|
+
pub binarization_method: String,
|
|
787
|
+
|
|
788
|
+
/// Invert colors (white text on black → black on white).
|
|
789
|
+
pub invert_colors: bool,
|
|
790
|
+
}
|
|
791
|
+
|
|
792
|
+
impl Default for ImagePreprocessingConfig {
|
|
793
|
+
fn default() -> Self {
|
|
794
|
+
Self {
|
|
795
|
+
target_dpi: 300,
|
|
796
|
+
auto_rotate: true,
|
|
797
|
+
deskew: true,
|
|
798
|
+
denoise: false,
|
|
799
|
+
contrast_enhance: false,
|
|
800
|
+
binarization_method: "otsu".to_string(),
|
|
801
|
+
invert_colors: false,
|
|
802
|
+
}
|
|
803
|
+
}
|
|
804
|
+
}
|
|
805
|
+
|
|
806
|
+
/// Tesseract OCR configuration.
|
|
807
|
+
///
|
|
808
|
+
/// Provides fine-grained control over Tesseract OCR engine parameters.
|
|
809
|
+
/// Most users can use the defaults, but these settings allow optimization
|
|
810
|
+
/// for specific document types (invoices, handwriting, etc.).
|
|
811
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
812
|
+
#[serde(default)]
|
|
813
|
+
pub struct TesseractConfig {
|
|
814
|
+
/// Language code (e.g., "eng", "deu", "fra")
|
|
815
|
+
pub language: String,
|
|
816
|
+
|
|
817
|
+
/// Page Segmentation Mode (0-13).
|
|
818
|
+
///
|
|
819
|
+
/// Common values:
|
|
820
|
+
/// - 3: Fully automatic page segmentation (default)
|
|
821
|
+
/// - 6: Assume a single uniform block of text
|
|
822
|
+
/// - 11: Sparse text with no particular order
|
|
823
|
+
pub psm: i32,
|
|
824
|
+
|
|
825
|
+
/// Output format ("text" or "markdown")
|
|
826
|
+
pub output_format: String,
|
|
827
|
+
|
|
828
|
+
/// OCR Engine Mode (0-3).
|
|
829
|
+
///
|
|
830
|
+
/// - 0: Legacy engine only
|
|
831
|
+
/// - 1: Neural nets (LSTM) only (usually best)
|
|
832
|
+
/// - 2: Legacy + LSTM
|
|
833
|
+
/// - 3: Default (based on what's available)
|
|
834
|
+
pub oem: i32,
|
|
835
|
+
|
|
836
|
+
/// Minimum confidence threshold (0.0-100.0).
|
|
837
|
+
///
|
|
838
|
+
/// Words with confidence below this threshold may be rejected or flagged.
|
|
839
|
+
pub min_confidence: f64,
|
|
840
|
+
|
|
841
|
+
/// Image preprocessing configuration.
|
|
842
|
+
///
|
|
843
|
+
/// Controls how images are preprocessed before OCR. Can significantly
|
|
844
|
+
/// improve quality for scanned documents or low-quality images.
|
|
845
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
846
|
+
pub preprocessing: Option<ImagePreprocessingConfig>,
|
|
847
|
+
|
|
848
|
+
/// Enable automatic table detection and reconstruction
|
|
849
|
+
pub enable_table_detection: bool,
|
|
850
|
+
|
|
851
|
+
/// Minimum confidence threshold for table detection (0.0-1.0)
|
|
852
|
+
pub table_min_confidence: f64,
|
|
853
|
+
|
|
854
|
+
/// Column threshold for table detection (pixels)
|
|
855
|
+
pub table_column_threshold: i32,
|
|
856
|
+
|
|
857
|
+
/// Row threshold ratio for table detection (0.0-1.0)
|
|
858
|
+
pub table_row_threshold_ratio: f64,
|
|
859
|
+
|
|
860
|
+
/// Enable OCR result caching
|
|
861
|
+
pub use_cache: bool,
|
|
862
|
+
|
|
863
|
+
/// Use pre-adapted templates for character classification
|
|
864
|
+
pub classify_use_pre_adapted_templates: bool,
|
|
865
|
+
|
|
866
|
+
/// Enable N-gram language model
|
|
867
|
+
pub language_model_ngram_on: bool,
|
|
868
|
+
|
|
869
|
+
/// Don't reject good words during block-level processing
|
|
870
|
+
pub tessedit_dont_blkrej_good_wds: bool,
|
|
871
|
+
|
|
872
|
+
/// Don't reject good words during row-level processing
|
|
873
|
+
pub tessedit_dont_rowrej_good_wds: bool,
|
|
874
|
+
|
|
875
|
+
/// Enable dictionary correction
|
|
876
|
+
pub tessedit_enable_dict_correction: bool,
|
|
877
|
+
|
|
878
|
+
/// Whitelist of allowed characters (empty = all allowed)
|
|
879
|
+
pub tessedit_char_whitelist: String,
|
|
880
|
+
|
|
881
|
+
/// Blacklist of forbidden characters (empty = none forbidden)
|
|
882
|
+
pub tessedit_char_blacklist: String,
|
|
883
|
+
|
|
884
|
+
/// Use primary language params model
|
|
885
|
+
pub tessedit_use_primary_params_model: bool,
|
|
886
|
+
|
|
887
|
+
/// Variable-width space detection
|
|
888
|
+
pub textord_space_size_is_variable: bool,
|
|
889
|
+
|
|
890
|
+
/// Use adaptive thresholding method
|
|
891
|
+
pub thresholding_method: bool,
|
|
892
|
+
}
|
|
893
|
+
|
|
894
|
+
impl Default for TesseractConfig {
|
|
895
|
+
fn default() -> Self {
|
|
896
|
+
Self {
|
|
897
|
+
language: "eng".to_string(),
|
|
898
|
+
psm: 3,
|
|
899
|
+
output_format: "markdown".to_string(),
|
|
900
|
+
oem: 3,
|
|
901
|
+
min_confidence: 0.0,
|
|
902
|
+
preprocessing: None,
|
|
903
|
+
enable_table_detection: true,
|
|
904
|
+
table_min_confidence: 0.0,
|
|
905
|
+
table_column_threshold: 50,
|
|
906
|
+
table_row_threshold_ratio: 0.5,
|
|
907
|
+
use_cache: true,
|
|
908
|
+
classify_use_pre_adapted_templates: true,
|
|
909
|
+
language_model_ngram_on: false,
|
|
910
|
+
tessedit_dont_blkrej_good_wds: true,
|
|
911
|
+
tessedit_dont_rowrej_good_wds: true,
|
|
912
|
+
tessedit_enable_dict_correction: true,
|
|
913
|
+
tessedit_char_whitelist: String::new(),
|
|
914
|
+
tessedit_char_blacklist: String::new(),
|
|
915
|
+
tessedit_use_primary_params_model: true,
|
|
916
|
+
textord_space_size_is_variable: true,
|
|
917
|
+
thresholding_method: false,
|
|
918
|
+
}
|
|
919
|
+
}
|
|
920
|
+
}
|
|
921
|
+
|
|
922
|
+
/// Image preprocessing metadata.
|
|
923
|
+
///
|
|
924
|
+
/// Tracks the transformations applied to an image during OCR preprocessing,
|
|
925
|
+
/// including DPI normalization, resizing, and resampling.
|
|
926
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
927
|
+
pub struct ImagePreprocessingMetadata {
|
|
928
|
+
/// Original image dimensions (width, height) in pixels
|
|
929
|
+
pub original_dimensions: (usize, usize),
|
|
930
|
+
/// Original image DPI (horizontal, vertical)
|
|
931
|
+
pub original_dpi: (f64, f64),
|
|
932
|
+
/// Target DPI from configuration
|
|
933
|
+
pub target_dpi: i32,
|
|
934
|
+
/// Scaling factor applied to the image
|
|
935
|
+
pub scale_factor: f64,
|
|
936
|
+
/// Whether DPI was auto-adjusted based on content
|
|
937
|
+
pub auto_adjusted: bool,
|
|
938
|
+
/// Final DPI after processing
|
|
939
|
+
pub final_dpi: i32,
|
|
940
|
+
/// New dimensions after resizing (if resized)
|
|
941
|
+
pub new_dimensions: Option<(usize, usize)>,
|
|
942
|
+
/// Resampling algorithm used ("LANCZOS3", "CATMULLROM", etc.)
|
|
943
|
+
pub resample_method: String,
|
|
944
|
+
/// Whether dimensions were clamped to max_image_dimension
|
|
945
|
+
pub dimension_clamped: bool,
|
|
946
|
+
/// Calculated optimal DPI (if auto_adjust_dpi enabled)
|
|
947
|
+
pub calculated_dpi: Option<i32>,
|
|
948
|
+
/// Whether resize was skipped (dimensions already optimal)
|
|
949
|
+
pub skipped_resize: bool,
|
|
950
|
+
/// Error message if resize failed
|
|
951
|
+
pub resize_error: Option<String>,
|
|
952
|
+
}
|
|
953
|
+
|
|
954
|
+
/// Image extraction configuration (internal use).
|
|
955
|
+
///
|
|
956
|
+
/// **Note:** This is an internal type used for image preprocessing.
|
|
957
|
+
/// For the main extraction configuration, see [`crate::core::config::ExtractionConfig`].
|
|
958
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
959
|
+
pub struct ExtractionConfig {
|
|
960
|
+
/// Target DPI for image normalization
|
|
961
|
+
pub target_dpi: i32,
|
|
962
|
+
/// Maximum image dimension (width or height)
|
|
963
|
+
pub max_image_dimension: i32,
|
|
964
|
+
/// Whether to auto-adjust DPI based on content
|
|
965
|
+
pub auto_adjust_dpi: bool,
|
|
966
|
+
/// Minimum DPI threshold
|
|
967
|
+
pub min_dpi: i32,
|
|
968
|
+
/// Maximum DPI threshold
|
|
969
|
+
pub max_dpi: i32,
|
|
970
|
+
}
|
|
971
|
+
|
|
972
|
+
impl Default for ExtractionConfig {
|
|
973
|
+
fn default() -> Self {
|
|
974
|
+
Self {
|
|
975
|
+
target_dpi: 300,
|
|
976
|
+
max_image_dimension: 4096,
|
|
977
|
+
auto_adjust_dpi: true,
|
|
978
|
+
min_dpi: 72,
|
|
979
|
+
max_dpi: 600,
|
|
980
|
+
}
|
|
981
|
+
}
|
|
982
|
+
}
|
|
983
|
+
|
|
984
|
+
/// Cache statistics.
|
|
985
|
+
///
|
|
986
|
+
/// Provides information about the extraction result cache,
|
|
987
|
+
/// including size, file count, and age distribution.
|
|
988
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
989
|
+
pub struct CacheStats {
|
|
990
|
+
/// Total number of cached files
|
|
991
|
+
pub total_files: usize,
|
|
992
|
+
/// Total cache size in megabytes
|
|
993
|
+
pub total_size_mb: f64,
|
|
994
|
+
/// Available disk space in megabytes
|
|
995
|
+
pub available_space_mb: f64,
|
|
996
|
+
/// Age of the oldest cached file in days
|
|
997
|
+
pub oldest_file_age_days: f64,
|
|
998
|
+
/// Age of the newest cached file in days
|
|
999
|
+
pub newest_file_age_days: f64,
|
|
1000
|
+
}
|
|
1001
|
+
|
|
1002
|
+
/// LibreOffice conversion result.
|
|
1003
|
+
///
|
|
1004
|
+
/// Result of converting a legacy office document (e.g., .doc, .ppt)
|
|
1005
|
+
/// to a modern format using LibreOffice.
|
|
1006
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
1007
|
+
pub struct LibreOfficeConversionResult {
|
|
1008
|
+
/// Converted file bytes
|
|
1009
|
+
pub converted_bytes: Vec<u8>,
|
|
1010
|
+
/// Original format identifier
|
|
1011
|
+
pub original_format: String,
|
|
1012
|
+
/// Target format identifier
|
|
1013
|
+
pub target_format: String,
|
|
1014
|
+
/// Target MIME type after conversion
|
|
1015
|
+
pub target_mime: String,
|
|
1016
|
+
}
|
|
1017
|
+
|
|
1018
|
+
#[cfg(test)]
mod tests {
    use super::*;

    /// Serializes a `Metadata` value carrying text format metadata plus one
    /// `additional` entry, then checks that `format_type`, the text counters,
    /// and the additional entry are all readable as top-level JSON keys.
    #[test]
    fn test_metadata_serialization_with_format() {
        let text_format = TextMetadata {
            line_count: 1,
            word_count: 2,
            character_count: 13,
            headers: None,
            links: None,
            code_blocks: None,
        };

        let mut metadata = Metadata {
            format: Some(FormatMetadata::Text(text_format)),
            ..Default::default()
        };
        metadata
            .additional
            .insert("quality_score".to_string(), serde_json::json!(1.0));

        let json = serde_json::to_value(&metadata).unwrap();
        println!("Serialized metadata: {}", serde_json::to_string_pretty(&json).unwrap());

        // The format discriminator must be present and identify the text format.
        assert!(
            json.get("format_type").is_some(),
            "format_type should be present in serialized JSON"
        );
        assert_eq!(json.get("format_type").unwrap(), "text");

        // Counters from the text metadata.
        assert_eq!(json.get("line_count").unwrap(), 1);
        assert_eq!(json.get("word_count").unwrap(), 2);
        assert_eq!(json.get("character_count").unwrap(), 13);

        // Entry inserted through `additional`.
        assert_eq!(json.get("quality_score").unwrap(), 1.0);
    }
}
|