RubyGems - kreuzberg - Versions diffs - 4.0.0.pre.rc.11 → 4.0.0.pre.rc.13 - Mend

kreuzberg 4.0.0.pre.rc.11 → 4.0.0.pre.rc.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (369) hide show

checksums.yaml +4 -4
data/.gitignore +14 -14
data/.rspec +3 -3
data/.rubocop.yaml +1 -1
data/.rubocop.yml +538 -538
data/Gemfile +8 -8
data/Gemfile.lock +2 -105
data/README.md +454 -454
data/Rakefile +25 -25
data/Steepfile +47 -47
data/examples/async_patterns.rb +341 -341
data/ext/kreuzberg_rb/extconf.rb +45 -45
data/ext/kreuzberg_rb/native/.cargo/config.toml +2 -2
data/ext/kreuzberg_rb/native/Cargo.lock +6941 -6941
data/ext/kreuzberg_rb/native/Cargo.toml +54 -54
data/ext/kreuzberg_rb/native/README.md +425 -425
data/ext/kreuzberg_rb/native/build.rs +15 -15
data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -11
data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -14
data/ext/kreuzberg_rb/native/include/strings.h +20 -20
data/ext/kreuzberg_rb/native/include/unistd.h +47 -47
data/ext/kreuzberg_rb/native/src/lib.rs +3158 -3158
data/extconf.rb +28 -28
data/kreuzberg.gemspec +214 -214
data/lib/kreuzberg/api_proxy.rb +142 -142
data/lib/kreuzberg/cache_api.rb +81 -81
data/lib/kreuzberg/cli.rb +55 -55
data/lib/kreuzberg/cli_proxy.rb +127 -127
data/lib/kreuzberg/config.rb +724 -724
data/lib/kreuzberg/error_context.rb +80 -80
data/lib/kreuzberg/errors.rb +118 -118
data/lib/kreuzberg/extraction_api.rb +340 -340
data/lib/kreuzberg/mcp_proxy.rb +186 -186
data/lib/kreuzberg/ocr_backend_protocol.rb +113 -113
data/lib/kreuzberg/post_processor_protocol.rb +86 -86
data/lib/kreuzberg/result.rb +279 -279
data/lib/kreuzberg/setup_lib_path.rb +80 -80
data/lib/kreuzberg/validator_protocol.rb +89 -89
data/lib/kreuzberg/version.rb +5 -5
data/lib/kreuzberg.rb +109 -109
data/lib/{libpdfium.dylib → pdfium.dll} +0 -0
data/sig/kreuzberg/internal.rbs +184 -184
data/sig/kreuzberg.rbs +546 -546
data/spec/binding/cache_spec.rb +227 -227
data/spec/binding/cli_proxy_spec.rb +85 -85
data/spec/binding/cli_spec.rb +55 -55
data/spec/binding/config_spec.rb +345 -345
data/spec/binding/config_validation_spec.rb +283 -283
data/spec/binding/error_handling_spec.rb +213 -213
data/spec/binding/errors_spec.rb +66 -66
data/spec/binding/plugins/ocr_backend_spec.rb +307 -307
data/spec/binding/plugins/postprocessor_spec.rb +269 -269
data/spec/binding/plugins/validator_spec.rb +274 -274
data/spec/fixtures/config.toml +39 -39
data/spec/fixtures/config.yaml +41 -41
data/spec/fixtures/invalid_config.toml +4 -4
data/spec/smoke/package_spec.rb +178 -178
data/spec/spec_helper.rb +42 -42
data/vendor/Cargo.toml +2 -1
data/vendor/kreuzberg/Cargo.toml +2 -2
data/vendor/kreuzberg/README.md +230 -230
data/vendor/kreuzberg/benches/otel_overhead.rs +48 -48
data/vendor/kreuzberg/build.rs +843 -843
data/vendor/kreuzberg/src/api/error.rs +81 -81
data/vendor/kreuzberg/src/api/handlers.rs +199 -199
data/vendor/kreuzberg/src/api/mod.rs +79 -79
data/vendor/kreuzberg/src/api/server.rs +353 -353
data/vendor/kreuzberg/src/api/types.rs +170 -170
data/vendor/kreuzberg/src/cache/mod.rs +1167 -1167
data/vendor/kreuzberg/src/chunking/mod.rs +1877 -1877
data/vendor/kreuzberg/src/chunking/processor.rs +220 -220
data/vendor/kreuzberg/src/core/batch_mode.rs +95 -95
data/vendor/kreuzberg/src/core/config.rs +1080 -1080
data/vendor/kreuzberg/src/core/extractor.rs +1156 -1156
data/vendor/kreuzberg/src/core/io.rs +329 -329
data/vendor/kreuzberg/src/core/mime.rs +605 -605
data/vendor/kreuzberg/src/core/mod.rs +47 -47
data/vendor/kreuzberg/src/core/pipeline.rs +1184 -1184
data/vendor/kreuzberg/src/embeddings.rs +500 -500
data/vendor/kreuzberg/src/error.rs +431 -431
data/vendor/kreuzberg/src/extraction/archive.rs +954 -954
data/vendor/kreuzberg/src/extraction/docx.rs +398 -398
data/vendor/kreuzberg/src/extraction/email.rs +854 -854
data/vendor/kreuzberg/src/extraction/excel.rs +688 -688
data/vendor/kreuzberg/src/extraction/html.rs +601 -601
data/vendor/kreuzberg/src/extraction/image.rs +491 -491
data/vendor/kreuzberg/src/extraction/libreoffice.rs +574 -562
data/vendor/kreuzberg/src/extraction/markdown.rs +213 -213
data/vendor/kreuzberg/src/extraction/mod.rs +81 -81
data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -398
data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -247
data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -240
data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -130
data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +284 -284
data/vendor/kreuzberg/src/extraction/pptx.rs +3100 -3100
data/vendor/kreuzberg/src/extraction/structured.rs +490 -490
data/vendor/kreuzberg/src/extraction/table.rs +328 -328
data/vendor/kreuzberg/src/extraction/text.rs +269 -269
data/vendor/kreuzberg/src/extraction/xml.rs +333 -333
data/vendor/kreuzberg/src/extractors/archive.rs +447 -447
data/vendor/kreuzberg/src/extractors/bibtex.rs +470 -470
data/vendor/kreuzberg/src/extractors/docbook.rs +504 -504
data/vendor/kreuzberg/src/extractors/docx.rs +400 -400
data/vendor/kreuzberg/src/extractors/email.rs +157 -157
data/vendor/kreuzberg/src/extractors/epub.rs +708 -708
data/vendor/kreuzberg/src/extractors/excel.rs +345 -345
data/vendor/kreuzberg/src/extractors/fictionbook.rs +492 -492
data/vendor/kreuzberg/src/extractors/html.rs +407 -407
data/vendor/kreuzberg/src/extractors/image.rs +219 -219
data/vendor/kreuzberg/src/extractors/jats.rs +1054 -1054
data/vendor/kreuzberg/src/extractors/jupyter.rs +368 -368
data/vendor/kreuzberg/src/extractors/latex.rs +653 -653
data/vendor/kreuzberg/src/extractors/markdown.rs +701 -701
data/vendor/kreuzberg/src/extractors/mod.rs +429 -429
data/vendor/kreuzberg/src/extractors/odt.rs +628 -628
data/vendor/kreuzberg/src/extractors/opml.rs +635 -635
data/vendor/kreuzberg/src/extractors/orgmode.rs +529 -529
data/vendor/kreuzberg/src/extractors/pdf.rs +749 -722
data/vendor/kreuzberg/src/extractors/pptx.rs +267 -267
data/vendor/kreuzberg/src/extractors/rst.rs +577 -577
data/vendor/kreuzberg/src/extractors/rtf.rs +809 -809
data/vendor/kreuzberg/src/extractors/security.rs +484 -484
data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -367
data/vendor/kreuzberg/src/extractors/structured.rs +142 -142
data/vendor/kreuzberg/src/extractors/text.rs +265 -265
data/vendor/kreuzberg/src/extractors/typst.rs +651 -651
data/vendor/kreuzberg/src/extractors/xml.rs +147 -147
data/vendor/kreuzberg/src/image/dpi.rs +164 -164
data/vendor/kreuzberg/src/image/mod.rs +6 -6
data/vendor/kreuzberg/src/image/preprocessing.rs +417 -417
data/vendor/kreuzberg/src/image/resize.rs +89 -89
data/vendor/kreuzberg/src/keywords/config.rs +154 -154
data/vendor/kreuzberg/src/keywords/mod.rs +237 -237
data/vendor/kreuzberg/src/keywords/processor.rs +275 -275
data/vendor/kreuzberg/src/keywords/rake.rs +293 -293
data/vendor/kreuzberg/src/keywords/types.rs +68 -68
data/vendor/kreuzberg/src/keywords/yake.rs +163 -163
data/vendor/kreuzberg/src/language_detection/mod.rs +985 -985
data/vendor/kreuzberg/src/language_detection/processor.rs +219 -219
data/vendor/kreuzberg/src/lib.rs +113 -113
data/vendor/kreuzberg/src/mcp/mod.rs +35 -35
data/vendor/kreuzberg/src/mcp/server.rs +2076 -2076
data/vendor/kreuzberg/src/ocr/cache.rs +469 -469
data/vendor/kreuzberg/src/ocr/error.rs +37 -37
data/vendor/kreuzberg/src/ocr/hocr.rs +216 -216
data/vendor/kreuzberg/src/ocr/mod.rs +58 -58
data/vendor/kreuzberg/src/ocr/processor.rs +863 -863
data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -4
data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -144
data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +452 -452
data/vendor/kreuzberg/src/ocr/types.rs +393 -393
data/vendor/kreuzberg/src/ocr/utils.rs +47 -47
data/vendor/kreuzberg/src/ocr/validation.rs +206 -206
data/vendor/kreuzberg/src/panic_context.rs +154 -154
data/vendor/kreuzberg/src/pdf/bindings.rs +44 -44
data/vendor/kreuzberg/src/pdf/bundled.rs +346 -346
data/vendor/kreuzberg/src/pdf/error.rs +130 -130
data/vendor/kreuzberg/src/pdf/images.rs +139 -139
data/vendor/kreuzberg/src/pdf/metadata.rs +489 -489
data/vendor/kreuzberg/src/pdf/mod.rs +68 -68
data/vendor/kreuzberg/src/pdf/rendering.rs +368 -368
data/vendor/kreuzberg/src/pdf/table.rs +420 -420
data/vendor/kreuzberg/src/pdf/text.rs +240 -240
data/vendor/kreuzberg/src/plugins/extractor.rs +1044 -1044
data/vendor/kreuzberg/src/plugins/mod.rs +212 -212
data/vendor/kreuzberg/src/plugins/ocr.rs +639 -639
data/vendor/kreuzberg/src/plugins/processor.rs +650 -650
data/vendor/kreuzberg/src/plugins/registry.rs +1339 -1339
data/vendor/kreuzberg/src/plugins/traits.rs +258 -258
data/vendor/kreuzberg/src/plugins/validator.rs +967 -967
data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -1470
data/vendor/kreuzberg/src/text/mod.rs +25 -25
data/vendor/kreuzberg/src/text/quality.rs +697 -697
data/vendor/kreuzberg/src/text/quality_processor.rs +219 -219
data/vendor/kreuzberg/src/text/string_utils.rs +217 -217
data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -164
data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -100
data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -796
data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -902
data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -160
data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -619
data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -147
data/vendor/kreuzberg/src/types.rs +1055 -1055
data/vendor/kreuzberg/src/utils/mod.rs +17 -17
data/vendor/kreuzberg/src/utils/quality.rs +959 -959
data/vendor/kreuzberg/src/utils/string_utils.rs +381 -381
data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -53
data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -482
data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -261
data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -400
data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -1205
data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -280
data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -425
data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -172
data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -622
data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -849
data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -1300
data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -175
data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -734
data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -37
data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -100
data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -801
data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -849
data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -693
data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -111
data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -162
data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -226
data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -41
data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -196
data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -227
data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -181
data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -791
data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -47
data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -760
data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -634
data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -136
data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -84
data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -681
data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -64
data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -51
data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -476
data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -163
data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -1
data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -101
data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -477
data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -490
data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -415
data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -223
data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -331
data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -562
data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -436
data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -561
data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -193
data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -420
data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -448
data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -32
data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -33
data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -420
data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -76
data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -129
data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -54
data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -118
data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -149
data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -506
data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -75
data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -519
data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -647
data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -62
data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -796
data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -31
data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -52
data/vendor/kreuzberg/tests/api_tests.rs +966 -966
data/vendor/kreuzberg/tests/archive_integration.rs +545 -545
data/vendor/kreuzberg/tests/batch_orchestration.rs +556 -556
data/vendor/kreuzberg/tests/batch_processing.rs +318 -318
data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -421
data/vendor/kreuzberg/tests/concurrency_stress.rs +533 -533
data/vendor/kreuzberg/tests/config_features.rs +612 -612
data/vendor/kreuzberg/tests/config_loading_tests.rs +416 -416
data/vendor/kreuzberg/tests/core_integration.rs +510 -510
data/vendor/kreuzberg/tests/csv_integration.rs +414 -414
data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +500 -500
data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -122
data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -370
data/vendor/kreuzberg/tests/email_integration.rs +327 -327
data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -275
data/vendor/kreuzberg/tests/error_handling.rs +402 -402
data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -228
data/vendor/kreuzberg/tests/format_integration.rs +164 -164
data/vendor/kreuzberg/tests/helpers/mod.rs +142 -142
data/vendor/kreuzberg/tests/html_table_test.rs +551 -551
data/vendor/kreuzberg/tests/image_integration.rs +255 -255
data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -139
data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -639
data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -704
data/vendor/kreuzberg/tests/keywords_integration.rs +479 -479
data/vendor/kreuzberg/tests/keywords_quality.rs +509 -509
data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -496
data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -490
data/vendor/kreuzberg/tests/mime_detection.rs +429 -429
data/vendor/kreuzberg/tests/ocr_configuration.rs +514 -514
data/vendor/kreuzberg/tests/ocr_errors.rs +698 -698
data/vendor/kreuzberg/tests/ocr_quality.rs +629 -629
data/vendor/kreuzberg/tests/ocr_stress.rs +469 -469
data/vendor/kreuzberg/tests/odt_extractor_tests.rs +674 -674
data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -616
data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -822
data/vendor/kreuzberg/tests/pdf_integration.rs +45 -45
data/vendor/kreuzberg/tests/pdfium_linking.rs +374 -374
data/vendor/kreuzberg/tests/pipeline_integration.rs +1436 -1436
data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +776 -776
data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +560 -560
data/vendor/kreuzberg/tests/plugin_system.rs +927 -927
data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -783
data/vendor/kreuzberg/tests/registry_integration_tests.rs +587 -587
data/vendor/kreuzberg/tests/rst_extractor_tests.rs +694 -694
data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +775 -775
data/vendor/kreuzberg/tests/security_validation.rs +416 -416
data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -888
data/vendor/kreuzberg/tests/test_fastembed.rs +631 -631
data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1260 -1260
data/vendor/kreuzberg/tests/typst_extractor_tests.rs +648 -648
data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -87
data/vendor/kreuzberg-ffi/Cargo.toml +3 -3
data/vendor/kreuzberg-ffi/README.md +851 -851
data/vendor/kreuzberg-ffi/build.rs +176 -176
data/vendor/kreuzberg-ffi/cbindgen.toml +27 -27
data/vendor/kreuzberg-ffi/kreuzberg-ffi-install.pc +12 -12
data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +12 -12
data/vendor/kreuzberg-ffi/kreuzberg.h +1087 -1087
data/vendor/kreuzberg-ffi/src/lib.rs +3616 -3616
data/vendor/kreuzberg-ffi/src/panic_shield.rs +247 -247
data/vendor/kreuzberg-ffi/tests.disabled/README.md +48 -48
data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +299 -299
data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +346 -346
data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +232 -232
data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +470 -470
data/vendor/kreuzberg-tesseract/.commitlintrc.json +13 -13
data/vendor/kreuzberg-tesseract/.crate-ignore +2 -2
data/vendor/kreuzberg-tesseract/Cargo.lock +2933 -2933
data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
data/vendor/kreuzberg-tesseract/LICENSE +22 -22
data/vendor/kreuzberg-tesseract/README.md +399 -399
data/vendor/kreuzberg-tesseract/build.rs +1354 -1354
data/vendor/kreuzberg-tesseract/patches/README.md +71 -71
data/vendor/kreuzberg-tesseract/patches/tesseract.diff +199 -199
data/vendor/kreuzberg-tesseract/src/api.rs +1371 -1371
data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +77 -77
data/vendor/kreuzberg-tesseract/src/enums.rs +297 -297
data/vendor/kreuzberg-tesseract/src/error.rs +81 -81
data/vendor/kreuzberg-tesseract/src/lib.rs +145 -145
data/vendor/kreuzberg-tesseract/src/monitor.rs +57 -57
data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +197 -197
data/vendor/kreuzberg-tesseract/src/page_iterator.rs +253 -253
data/vendor/kreuzberg-tesseract/src/result_iterator.rs +286 -286
data/vendor/kreuzberg-tesseract/src/result_renderer.rs +183 -183
data/vendor/kreuzberg-tesseract/tests/integration_test.rs +211 -211
data/vendor/rb-sys/.cargo_vcs_info.json +5 -5
data/vendor/rb-sys/Cargo.lock +393 -393
data/vendor/rb-sys/Cargo.toml +70 -70
data/vendor/rb-sys/Cargo.toml.orig +57 -57
data/vendor/rb-sys/LICENSE-APACHE +190 -190
data/vendor/rb-sys/LICENSE-MIT +21 -21
data/vendor/rb-sys/build/features.rs +111 -111
data/vendor/rb-sys/build/main.rs +286 -286
data/vendor/rb-sys/build/stable_api_config.rs +155 -155
data/vendor/rb-sys/build/version.rs +50 -50
data/vendor/rb-sys/readme.md +36 -36
data/vendor/rb-sys/src/bindings.rs +21 -21
data/vendor/rb-sys/src/hidden.rs +11 -11
data/vendor/rb-sys/src/lib.rs +35 -35
data/vendor/rb-sys/src/macros.rs +371 -371
data/vendor/rb-sys/src/memory.rs +53 -53
data/vendor/rb-sys/src/ruby_abi_version.rs +38 -38
data/vendor/rb-sys/src/special_consts.rs +31 -31
data/vendor/rb-sys/src/stable_api/compiled.c +179 -179
data/vendor/rb-sys/src/stable_api/compiled.rs +257 -257
data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +324 -324
data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +332 -332
data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +325 -325
data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +323 -323
data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +339 -339
data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +339 -339
data/vendor/rb-sys/src/stable_api.rs +260 -260
data/vendor/rb-sys/src/symbol.rs +31 -31
data/vendor/rb-sys/src/tracking_allocator.rs +330 -330
data/vendor/rb-sys/src/utils.rs +89 -89
data/vendor/rb-sys/src/value_type.rs +7 -7
metadata +7 -80

data/vendor/kreuzberg/src/types.rs CHANGED Viewed

@@ -1,1055 +1,1055 @@
-use serde::{Deserialize, Serialize};
-use std::collections::HashMap;
-#[cfg(feature = "pdf")]
-use crate::pdf::metadata::PdfMetadata;
-// ============================================================================
-// ============================================================================
-/// General extraction result used by the core extraction API.
-///
-/// This is the main result type returned by all extraction functions.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct ExtractionResult {
-    pub content: String,
-    pub mime_type: String,
-    pub metadata: Metadata,
-    pub tables: Vec<Table>,
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub detected_languages: Option<Vec<String>>,
-    /// Text chunks when chunking is enabled.
-    ///
-    /// When chunking configuration is provided, the content is split into
-    /// overlapping chunks for efficient processing. Each chunk contains the text,
-    /// optional embeddings (if enabled), and metadata about its position.
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub chunks: Option<Vec<Chunk>>,
-    /// Extracted images from the document.
-    ///
-    /// When image extraction is enabled via `ImageExtractionConfig`, this field
-    /// contains all images found in the document with their raw data and metadata.
-    /// Each image may optionally contain a nested `ocr_result` if OCR was performed.
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub images: Option<Vec<ExtractedImage>>,
-    /// Per-page content when page extraction is enabled.
-    ///
-    /// When page extraction is configured, the document is split into per-page content
-    /// with tables and images mapped to their respective pages.
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub pages: Option<Vec<PageContent>>,
-}
-/// Format-specific metadata (discriminated union).
-///
-/// Only one format type can exist per extraction result. This provides
-/// type-safe, clean metadata without nested optionals.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-#[serde(tag = "format_type", rename_all = "snake_case")]
-pub enum FormatMetadata {
-    #[cfg(feature = "pdf")]
-    Pdf(PdfMetadata),
-    Excel(ExcelMetadata),
-    Email(EmailMetadata),
-    Pptx(PptxMetadata),
-    Archive(ArchiveMetadata),
-    Image(ImageMetadata),
-    Xml(XmlMetadata),
-    Text(TextMetadata),
-    Html(Box<HtmlMetadata>),
-    Ocr(OcrMetadata),
-}
-/// Extraction result metadata.
-///
-/// Contains common fields applicable to all formats, format-specific metadata
-/// via a discriminated union, and additional custom fields from postprocessors.
-#[derive(Debug, Clone, Serialize, Deserialize, Default)]
-pub struct Metadata {
-    /// Document title
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub title: Option<String>,
-    /// Document subject or description
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub subject: Option<String>,
-    /// Primary author(s) - always Vec for consistency
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub authors: Option<Vec<String>>,
-    /// Keywords/tags - always Vec for consistency
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub keywords: Option<Vec<String>>,
-    /// Primary language (ISO 639 code)
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub language: Option<String>,
-    /// Creation timestamp (ISO 8601 format)
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub created_at: Option<String>,
-    /// Last modification timestamp (ISO 8601 format)
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub modified_at: Option<String>,
-    /// User who created the document
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub created_by: Option<String>,
-    /// User who last modified the document
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub modified_by: Option<String>,
-    /// Page/slide/sheet structure with boundaries
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub pages: Option<PageStructure>,
-    /// Document date (DEPRECATED - use created_at/modified_at instead)
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub date: Option<String>,
-    /// Format-specific metadata (discriminated union)
-    ///
-    /// Contains detailed metadata specific to the document format.
-    /// Serializes with a `format_type` discriminator field.
-    #[serde(flatten, skip_serializing_if = "Option::is_none")]
-    pub format: Option<FormatMetadata>,
-    /// Image preprocessing metadata (when OCR preprocessing was applied)
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub image_preprocessing: Option<ImagePreprocessingMetadata>,
-    /// JSON schema (for structured data extraction)
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub json_schema: Option<serde_json::Value>,
-    /// Error metadata (for batch operations)
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub error: Option<ErrorMetadata>,
-    /// Additional custom fields from postprocessors.
-    ///
-    /// This flattened HashMap allows Python/TypeScript postprocessors to add
-    /// arbitrary fields (entity extraction, keyword extraction, etc.).
-    /// Fields are merged at the root level during serialization.
-    #[serde(flatten)]
-    pub additional: HashMap<String, serde_json::Value>,
-}
-/// Unified page structure for documents.
-///
-/// Supports different page types (PDF pages, PPTX slides, Excel sheets)
-/// with character offset boundaries for chunk-to-page mapping.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct PageStructure {
-    /// Total number of pages/slides/sheets
-    pub total_count: usize,
-    /// Type of paginated unit
-    pub unit_type: PageUnitType,
-    /// Character offset boundaries for each page
-    ///
-    /// Maps character ranges in the extracted content to page numbers.
-    /// Used for chunk page range calculation.
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub boundaries: Option<Vec<PageBoundary>>,
-    /// Detailed per-page metadata (optional, only when needed)
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub pages: Option<Vec<PageInfo>>,
-}
-/// Type of paginated unit in a document.
-///
-/// Distinguishes between different types of "pages" (PDF pages, presentation slides, spreadsheet sheets).
-#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
-#[serde(rename_all = "snake_case")]
-pub enum PageUnitType {
-    /// Standard document pages (PDF, DOCX, images)
-    Page,
-    /// Presentation slides (PPTX, ODP)
-    Slide,
-    /// Spreadsheet sheets (XLSX, ODS)
-    Sheet,
-}
-/// Byte offset boundary for a page.
-///
-/// Tracks where a specific page's content starts and ends in the main content string,
-/// enabling mapping from byte positions to page numbers. Offsets are guaranteed to be
-/// at valid UTF-8 character boundaries when using standard String methods (push_str, push, etc.).
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct PageBoundary {
-    /// Byte offset where this page starts in the content string (UTF-8 valid boundary, inclusive)
-    pub byte_start: usize,
-    /// Byte offset where this page ends in the content string (UTF-8 valid boundary, exclusive)
-    pub byte_end: usize,
-    /// Page number (1-indexed)
-    pub page_number: usize,
-}
-/// Metadata for individual page/slide/sheet.
-///
-/// Captures per-page information including dimensions, content counts,
-/// and visibility state (for presentations).
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct PageInfo {
-    /// Page number (1-indexed)
-    pub number: usize,
-    /// Page title (usually for presentations)
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub title: Option<String>,
-    /// Dimensions in points (PDF) or pixels (images): (width, height)
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub dimensions: Option<(f64, f64)>,
-    /// Number of images on this page
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub image_count: Option<usize>,
-    /// Number of tables on this page
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub table_count: Option<usize>,
-    /// Whether this page is hidden (e.g., in presentations)
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub hidden: Option<bool>,
-}
-/// Content for a single page/slide.
-///
-/// When page extraction is enabled, documents are split into per-page content
-/// with associated tables and images mapped to each page.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct PageContent {
-    /// Page number (1-indexed)
-    pub page_number: usize,
-    /// Text content for this page
-    pub content: String,
-    /// Tables found on this page
-    #[serde(skip_serializing_if = "Vec::is_empty", default)]
-    pub tables: Vec<Table>,
-    /// Images found on this page
-    #[serde(skip_serializing_if = "Vec::is_empty", default)]
-    pub images: Vec<ExtractedImage>,
-}
-/// Excel/spreadsheet metadata.
-///
-/// Contains information about sheets in Excel, LibreOffice Calc, and other
-/// spreadsheet formats (.xlsx, .xls, .ods, etc.).
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct ExcelMetadata {
-    /// Total number of sheets in the workbook
-    pub sheet_count: usize,
-    /// Names of all sheets in order
-    pub sheet_names: Vec<String>,
-}
-/// Email metadata extracted from .eml and .msg files.
-///
-/// Includes sender/recipient information, message ID, and attachment list.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct EmailMetadata {
-    /// Sender's email address
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub from_email: Option<String>,
-    /// Sender's display name
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub from_name: Option<String>,
-    /// Primary recipients
-    pub to_emails: Vec<String>,
-    /// CC recipients
-    pub cc_emails: Vec<String>,
-    /// BCC recipients
-    pub bcc_emails: Vec<String>,
-    /// Message-ID header value
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub message_id: Option<String>,
-    /// List of attachment filenames
-    pub attachments: Vec<String>,
-}
-/// Archive (ZIP/TAR/7Z) metadata.
-///
-/// Extracted from compressed archive files containing file lists and size information.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct ArchiveMetadata {
-    /// Archive format ("ZIP", "TAR", "7Z", etc.)
-    pub format: String,
-    /// Total number of files in the archive
-    pub file_count: usize,
-    /// List of file paths within the archive
-    pub file_list: Vec<String>,
-    /// Total uncompressed size in bytes
-    pub total_size: usize,
-    /// Compressed size in bytes (if available)
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub compressed_size: Option<usize>,
-}
-/// Image metadata extracted from image files.
-///
-/// Includes dimensions, format, and EXIF data.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct ImageMetadata {
-    /// Image width in pixels
-    pub width: u32,
-    /// Image height in pixels
-    pub height: u32,
-    /// Image format (e.g., "PNG", "JPEG", "TIFF")
-    pub format: String,
-    /// EXIF metadata tags
-    pub exif: HashMap<String, String>,
-}
-/// XML metadata extracted during XML parsing.
-///
-/// Provides statistics about XML document structure.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct XmlMetadata {
-    /// Total number of XML elements processed
-    pub element_count: usize,
-    /// List of unique element tag names (sorted)
-    pub unique_elements: Vec<String>,
-}
-/// Text/Markdown metadata.
-///
-/// Extracted from plain text and Markdown files. Includes word counts and,
-/// for Markdown, structural elements like headers and links.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct TextMetadata {
-    /// Number of lines in the document
-    pub line_count: usize,
-    /// Number of words
-    pub word_count: usize,
-    /// Number of characters
-    pub character_count: usize,
-    /// Markdown headers (headings text only, for Markdown files)
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub headers: Option<Vec<String>>,
-    /// Markdown links as (text, url) tuples (for Markdown files)
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub links: Option<Vec<(String, String)>>,
-    /// Code blocks as (language, code) tuples (for Markdown files)
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub code_blocks: Option<Vec<(String, String)>>,
-}
-/// HTML metadata extracted from HTML documents.
-///
-/// Includes meta tags, Open Graph data, Twitter Card metadata, and link relations.
-#[derive(Debug, Clone, Serialize, Deserialize, Default)]
-pub struct HtmlMetadata {
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub title: Option<String>,
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub description: Option<String>,
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub keywords: Option<String>,
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub author: Option<String>,
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub canonical: Option<String>,
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub base_href: Option<String>,
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub og_title: Option<String>,
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub og_description: Option<String>,
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub og_image: Option<String>,
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub og_url: Option<String>,
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub og_type: Option<String>,
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub og_site_name: Option<String>,
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub twitter_card: Option<String>,
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub twitter_title: Option<String>,
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub twitter_description: Option<String>,
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub twitter_image: Option<String>,
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub twitter_site: Option<String>,
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub twitter_creator: Option<String>,
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub link_author: Option<String>,
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub link_license: Option<String>,
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub link_alternate: Option<String>,
-}
-/// OCR processing metadata.
-///
-/// Captures information about OCR processing configuration and results.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct OcrMetadata {
-    /// OCR language code(s) used
-    pub language: String,
-    /// Tesseract Page Segmentation Mode (PSM)
-    pub psm: i32,
-    /// Output format (e.g., "text", "hocr")
-    pub output_format: String,
-    /// Number of tables detected
-    pub table_count: usize,
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub table_rows: Option<usize>,
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub table_cols: Option<usize>,
-}
-/// Error metadata (for batch operations).
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct ErrorMetadata {
-    pub error_type: String,
-    pub message: String,
-}
-/// Extracted table structure.
-///
-/// Represents a table detected and extracted from a document (PDF, image, etc.).
-/// Tables are converted to both structured cell data and Markdown format.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct Table {
-    /// Table cells as a 2D vector (rows × columns)
-    pub cells: Vec<Vec<String>>,
-    /// Markdown representation of the table
-    pub markdown: String,
-    /// Page number where the table was found (1-indexed)
-    pub page_number: usize,
-}
-/// A text chunk with optional embedding and metadata.
-///
-/// Chunks are created when chunking is enabled in `ExtractionConfig`. Each chunk
-/// contains the text content, optional embedding vector (if embedding generation
-/// is configured), and metadata about its position in the document.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct Chunk {
-    /// The text content of this chunk.
-    pub content: String,
-    /// Optional embedding vector for this chunk.
-    ///
-    /// Only populated when `EmbeddingConfig` is provided in chunking configuration.
-    /// The dimensionality depends on the chosen embedding model.
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub embedding: Option<Vec<f32>>,
-    /// Metadata about this chunk's position and properties.
-    pub metadata: ChunkMetadata,
-}
-/// Metadata about a chunk's position in the original document.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct ChunkMetadata {
-    /// Byte offset where this chunk starts in the original text (UTF-8 valid boundary).
-    pub byte_start: usize,
-    /// Byte offset where this chunk ends in the original text (UTF-8 valid boundary).
-    pub byte_end: usize,
-    /// Number of tokens in this chunk (if available).
-    ///
-    /// This is calculated by the embedding model's tokenizer if embeddings are enabled.
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub token_count: Option<usize>,
-    /// Zero-based index of this chunk in the document.
-    pub chunk_index: usize,
-    /// Total number of chunks in the document.
-    pub total_chunks: usize,
-    /// First page number this chunk spans (1-indexed).
-    ///
-    /// Only populated when page tracking is enabled in extraction configuration.
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub first_page: Option<usize>,
-    /// Last page number this chunk spans (1-indexed, equal to first_page for single-page chunks).
-    ///
-    /// Only populated when page tracking is enabled in extraction configuration.
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub last_page: Option<usize>,
-}
-/// Extracted image from a document.
-///
-/// Contains raw image data, metadata, and optional nested OCR results.
-/// Raw bytes allow cross-language compatibility - users can convert to
-/// PIL.Image (Python), Sharp (Node.js), or other formats as needed.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct ExtractedImage {
-    /// Raw image data (PNG, JPEG, WebP, etc. bytes)
-    pub data: Vec<u8>,
-    /// Image format (e.g., "jpeg", "png", "webp")
-    pub format: String,
-    /// Zero-indexed position of this image in the document/page
-    pub image_index: usize,
-    /// Page/slide number where image was found (1-indexed)
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub page_number: Option<usize>,
-    /// Image width in pixels
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub width: Option<u32>,
-    /// Image height in pixels
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub height: Option<u32>,
-    /// Colorspace information (e.g., "RGB", "CMYK", "Gray")
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub colorspace: Option<String>,
-    /// Bits per color component (e.g., 8, 16)
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub bits_per_component: Option<u32>,
-    /// Whether this image is a mask image
-    #[serde(default)]
-    pub is_mask: bool,
-    /// Optional description of the image
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub description: Option<String>,
-    /// Nested OCR extraction result (if image was OCRed)
-    ///
-    /// When OCR is performed on this image, the result is embedded here
-    /// rather than in a separate collection, making the relationship explicit.
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub ocr_result: Option<Box<ExtractionResult>>,
-}
-/// Excel workbook representation.
-///
-/// Contains all sheets from an Excel file (.xlsx, .xls, etc.) with
-/// extracted content and metadata.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct ExcelWorkbook {
-    /// All sheets in the workbook
-    pub sheets: Vec<ExcelSheet>,
-    /// Workbook-level metadata (author, creation date, etc.)
-    pub metadata: HashMap<String, String>,
-}
-/// Single Excel worksheet.
-///
-/// Represents one sheet from an Excel workbook with its content
-/// converted to Markdown format and dimensional statistics.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct ExcelSheet {
-    /// Sheet name as it appears in Excel
-    pub name: String,
-    /// Sheet content converted to Markdown tables
-    pub markdown: String,
-    /// Number of rows
-    pub row_count: usize,
-    /// Number of columns
-    pub col_count: usize,
-    /// Total number of non-empty cells
-    pub cell_count: usize,
-}
-/// XML extraction result.
-///
-/// Contains extracted text content from XML files along with
-/// structural statistics about the XML document.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct XmlExtractionResult {
-    /// Extracted text content (XML structure filtered out)
-    pub content: String,
-    /// Total number of XML elements processed
-    pub element_count: usize,
-    /// List of unique element names found (sorted)
-    pub unique_elements: Vec<String>,
-}
-/// Plain text and Markdown extraction result.
-///
-/// Contains the extracted text along with statistics and,
-/// for Markdown files, structural elements like headers and links.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct TextExtractionResult {
-    /// Extracted text content
-    pub content: String,
-    /// Number of lines
-    pub line_count: usize,
-    /// Number of words
-    pub word_count: usize,
-    /// Number of characters
-    pub character_count: usize,
-    /// Markdown headers (text only, Markdown files only)
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub headers: Option<Vec<String>>,
-    /// Markdown links as (text, URL) tuples (Markdown files only)
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub links: Option<Vec<(String, String)>>,
-    /// Code blocks as (language, code) tuples (Markdown files only)
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub code_blocks: Option<Vec<(String, String)>>,
-}
-/// PowerPoint (PPTX) extraction result.
-///
-/// Contains extracted slide content, metadata, and embedded images/tables.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct PptxExtractionResult {
-    /// Extracted text content from all slides
-    pub content: String,
-    /// Presentation metadata
-    pub metadata: PptxMetadata,
-    /// Total number of slides
-    pub slide_count: usize,
-    /// Total number of embedded images
-    pub image_count: usize,
-    /// Total number of tables
-    pub table_count: usize,
-    /// Extracted images from the presentation
-    pub images: Vec<ExtractedImage>,
-    /// Slide structure with boundaries (when page tracking is enabled)
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub page_structure: Option<PageStructure>,
-    /// Per-slide content (when page tracking is enabled)
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub page_contents: Option<Vec<PageContent>>,
-}
-/// PowerPoint presentation metadata.
-///
-/// Contains PPTX-specific metadata. Common fields like title, author, and description
-/// are now in the base `Metadata` struct.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct PptxMetadata {
-    /// List of fonts used in the presentation
-    #[serde(skip_serializing_if = "Vec::is_empty", default)]
-    pub fonts: Vec<String>,
-}
-/// Email extraction result.
-///
-/// Complete representation of an extracted email message (.eml or .msg)
-/// including headers, body content, and attachments.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct EmailExtractionResult {
-    /// Email subject line
-    pub subject: Option<String>,
-    /// Sender email address
-    pub from_email: Option<String>,
-    /// Primary recipient email addresses
-    pub to_emails: Vec<String>,
-    /// CC recipient email addresses
-    pub cc_emails: Vec<String>,
-    /// BCC recipient email addresses
-    pub bcc_emails: Vec<String>,
-    /// Email date/timestamp
-    pub date: Option<String>,
-    /// Message-ID header value
-    pub message_id: Option<String>,
-    /// Plain text version of the email body
-    pub plain_text: Option<String>,
-    /// HTML version of the email body
-    pub html_content: Option<String>,
-    /// Cleaned/processed text content
-    pub cleaned_text: String,
-    /// List of email attachments
-    pub attachments: Vec<EmailAttachment>,
-    /// Additional email headers and metadata
-    pub metadata: HashMap<String, String>,
-}
-/// Email attachment representation.
-///
-/// Contains metadata and optionally the content of an email attachment.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct EmailAttachment {
-    /// Attachment name (from Content-Disposition header)
-    pub name: Option<String>,
-    /// Filename of the attachment
-    pub filename: Option<String>,
-    /// MIME type of the attachment
-    pub mime_type: Option<String>,
-    /// Size in bytes
-    pub size: Option<usize>,
-    /// Whether this attachment is an image
-    pub is_image: bool,
-    /// Attachment data (if extracted)
-    pub data: Option<Vec<u8>>,
-}
-/// OCR extraction result.
-///
-/// Result of performing OCR on an image or scanned document,
-/// including recognized text and detected tables.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct OcrExtractionResult {
-    /// Recognized text content
-    pub content: String,
-    /// Original MIME type of the processed image
-    pub mime_type: String,
-    /// OCR processing metadata (confidence scores, language, etc.)
-    pub metadata: HashMap<String, serde_json::Value>,
-    /// Tables detected and extracted via OCR
-    pub tables: Vec<OcrTable>,
-}
-/// Table detected via OCR.
-///
-/// Represents a table structure recognized during OCR processing.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct OcrTable {
-    /// Table cells as a 2D vector (rows × columns)
-    pub cells: Vec<Vec<String>>,
-    /// Markdown representation of the table
-    pub markdown: String,
-    /// Page number where the table was found (1-indexed)
-    pub page_number: usize,
-}
-/// Image preprocessing configuration for OCR.
-///
-/// These settings control how images are preprocessed before OCR to improve
-/// text recognition quality. Different preprocessing strategies work better
-/// for different document types.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-#[serde(default)]
-pub struct ImagePreprocessingConfig {
-    /// Target DPI for the image (300 is standard, 600 for small text).
-    pub target_dpi: i32,
-    /// Auto-detect and correct image rotation.
-    pub auto_rotate: bool,
-    /// Correct skew (tilted images).
-    pub deskew: bool,
-    /// Remove noise from the image.
-    pub denoise: bool,
-    /// Enhance contrast for better text visibility.
-    pub contrast_enhance: bool,
-    /// Binarization method: "otsu", "sauvola", "adaptive".
-    pub binarization_method: String,
-    /// Invert colors (white text on black → black on white).
-    pub invert_colors: bool,
-}
-impl Default for ImagePreprocessingConfig {
-    fn default() -> Self {
-        Self {
-            target_dpi: 300,
-            auto_rotate: true,
-            deskew: true,
-            denoise: false,
-            contrast_enhance: false,
-            binarization_method: "otsu".to_string(),
-            invert_colors: false,
-        }
-    }
-}
-/// Tesseract OCR configuration.
-///
-/// Provides fine-grained control over Tesseract OCR engine parameters.
-/// Most users can use the defaults, but these settings allow optimization
-/// for specific document types (invoices, handwriting, etc.).
-#[derive(Debug, Clone, Serialize, Deserialize)]
-#[serde(default)]
-pub struct TesseractConfig {
-    /// Language code (e.g., "eng", "deu", "fra")
-    pub language: String,
-    /// Page Segmentation Mode (0-13).
-    ///
-    /// Common values:
-    /// - 3: Fully automatic page segmentation (default)
-    /// - 6: Assume a single uniform block of text
-    /// - 11: Sparse text with no particular order
-    pub psm: i32,
-    /// Output format ("text" or "markdown")
-    pub output_format: String,
-    /// OCR Engine Mode (0-3).
-    ///
-    /// - 0: Legacy engine only
-    /// - 1: Neural nets (LSTM) only (usually best)
-    /// - 2: Legacy + LSTM
-    /// - 3: Default (based on what's available)
-    pub oem: i32,
-    /// Minimum confidence threshold (0.0-100.0).
-    ///
-    /// Words with confidence below this threshold may be rejected or flagged.
-    pub min_confidence: f64,
-    /// Image preprocessing configuration.
-    ///
-    /// Controls how images are preprocessed before OCR. Can significantly
-    /// improve quality for scanned documents or low-quality images.
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub preprocessing: Option<ImagePreprocessingConfig>,
-    /// Enable automatic table detection and reconstruction
-    pub enable_table_detection: bool,
-    /// Minimum confidence threshold for table detection (0.0-1.0)
-    pub table_min_confidence: f64,
-    /// Column threshold for table detection (pixels)
-    pub table_column_threshold: i32,
-    /// Row threshold ratio for table detection (0.0-1.0)
-    pub table_row_threshold_ratio: f64,
-    /// Enable OCR result caching
-    pub use_cache: bool,
-    /// Use pre-adapted templates for character classification
-    pub classify_use_pre_adapted_templates: bool,
-    /// Enable N-gram language model
-    pub language_model_ngram_on: bool,
-    /// Don't reject good words during block-level processing
-    pub tessedit_dont_blkrej_good_wds: bool,
-    /// Don't reject good words during row-level processing
-    pub tessedit_dont_rowrej_good_wds: bool,
-    /// Enable dictionary correction
-    pub tessedit_enable_dict_correction: bool,
-    /// Whitelist of allowed characters (empty = all allowed)
-    pub tessedit_char_whitelist: String,
-    /// Blacklist of forbidden characters (empty = none forbidden)
-    pub tessedit_char_blacklist: String,
-    /// Use primary language params model
-    pub tessedit_use_primary_params_model: bool,
-    /// Variable-width space detection
-    pub textord_space_size_is_variable: bool,
-    /// Use adaptive thresholding method
-    pub thresholding_method: bool,
-}
-impl Default for TesseractConfig {
-    fn default() -> Self {
-        Self {
-            language: "eng".to_string(),
-            psm: 3,
-            output_format: "markdown".to_string(),
-            oem: 3,
-            min_confidence: 0.0,
-            preprocessing: None,
-            enable_table_detection: true,
-            table_min_confidence: 0.0,
-            table_column_threshold: 50,
-            table_row_threshold_ratio: 0.5,
-            use_cache: true,
-            classify_use_pre_adapted_templates: true,
-            language_model_ngram_on: false,
-            tessedit_dont_blkrej_good_wds: true,
-            tessedit_dont_rowrej_good_wds: true,
-            tessedit_enable_dict_correction: true,
-            tessedit_char_whitelist: String::new(),
-            tessedit_char_blacklist: String::new(),
-            tessedit_use_primary_params_model: true,
-            textord_space_size_is_variable: true,
-            thresholding_method: false,
-        }
-    }
-}
-/// Image preprocessing metadata.
-///
-/// Tracks the transformations applied to an image during OCR preprocessing,
-/// including DPI normalization, resizing, and resampling.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct ImagePreprocessingMetadata {
-    /// Original image dimensions (width, height) in pixels
-    pub original_dimensions: (usize, usize),
-    /// Original image DPI (horizontal, vertical)
-    pub original_dpi: (f64, f64),
-    /// Target DPI from configuration
-    pub target_dpi: i32,
-    /// Scaling factor applied to the image
-    pub scale_factor: f64,
-    /// Whether DPI was auto-adjusted based on content
-    pub auto_adjusted: bool,
-    /// Final DPI after processing
-    pub final_dpi: i32,
-    /// New dimensions after resizing (if resized)
-    pub new_dimensions: Option<(usize, usize)>,
-    /// Resampling algorithm used ("LANCZOS3", "CATMULLROM", etc.)
-    pub resample_method: String,
-    /// Whether dimensions were clamped to max_image_dimension
-    pub dimension_clamped: bool,
-    /// Calculated optimal DPI (if auto_adjust_dpi enabled)
-    pub calculated_dpi: Option<i32>,
-    /// Whether resize was skipped (dimensions already optimal)
-    pub skipped_resize: bool,
-    /// Error message if resize failed
-    pub resize_error: Option<String>,
-}
-/// Image extraction configuration (internal use).
-///
-/// **Note:** This is an internal type used for image preprocessing.
-/// For the main extraction configuration, see [`crate::core::config::ExtractionConfig`].
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct ExtractionConfig {
-    /// Target DPI for image normalization
-    pub target_dpi: i32,
-    /// Maximum image dimension (width or height)
-    pub max_image_dimension: i32,
-    /// Whether to auto-adjust DPI based on content
-    pub auto_adjust_dpi: bool,
-    /// Minimum DPI threshold
-    pub min_dpi: i32,
-    /// Maximum DPI threshold
-    pub max_dpi: i32,
-}
-impl Default for ExtractionConfig {
-    fn default() -> Self {
-        Self {
-            target_dpi: 300,
-            max_image_dimension: 4096,
-            auto_adjust_dpi: true,
-            min_dpi: 72,
-            max_dpi: 600,
-        }
-    }
-}
-/// Cache statistics.
-///
-/// Provides information about the extraction result cache,
-/// including size, file count, and age distribution.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct CacheStats {
-    /// Total number of cached files
-    pub total_files: usize,
-    /// Total cache size in megabytes
-    pub total_size_mb: f64,
-    /// Available disk space in megabytes
-    pub available_space_mb: f64,
-    /// Age of the oldest cached file in days
-    pub oldest_file_age_days: f64,
-    /// Age of the newest cached file in days
-    pub newest_file_age_days: f64,
-}
-/// LibreOffice conversion result.
-///
-/// Result of converting a legacy office document (e.g., .doc, .ppt)
-/// to a modern format using LibreOffice.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct LibreOfficeConversionResult {
-    /// Converted file bytes
-    pub converted_bytes: Vec<u8>,
-    /// Original format identifier
-    pub original_format: String,
-    /// Target format identifier
-    pub target_format: String,
-    /// Target MIME type after conversion
-    pub target_mime: String,
-}
-#[cfg(test)]
-mod tests {
-    use super::*;
-    #[test]
-    fn test_metadata_serialization_with_format() {
-        let mut metadata = Metadata {
-            format: Some(FormatMetadata::Text(TextMetadata {
-                line_count: 1,
-                word_count: 2,
-                character_count: 13,
-                headers: None,
-                links: None,
-                code_blocks: None,
-            })),
-            ..Default::default()
-        };
-        metadata
-            .additional
-            .insert("quality_score".to_string(), serde_json::json!(1.0));
-        let json = serde_json::to_value(&metadata).unwrap();
-        println!("Serialized metadata: {}", serde_json::to_string_pretty(&json).unwrap());
-        assert!(
-            json.get("format_type").is_some(),
-            "format_type should be present in serialized JSON"
-        );
-        assert_eq!(json.get("format_type").unwrap(), "text");
-        assert_eq!(json.get("line_count").unwrap(), 1);
-        assert_eq!(json.get("word_count").unwrap(), 2);
-        assert_eq!(json.get("character_count").unwrap(), 13);
-        assert_eq!(json.get("quality_score").unwrap(), 1.0);
-    }
-}
+use serde::{Deserialize, Serialize};
+use std::collections::HashMap;
+#[cfg(feature = "pdf")]
+use crate::pdf::metadata::PdfMetadata;
+// ============================================================================
+// Core extraction result and metadata types
+// ============================================================================
+/// General extraction result used by the core extraction API.
+///
+/// This is the main result type returned by all extraction functions.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct ExtractionResult {
+    pub content: String,
+    pub mime_type: String,
+    pub metadata: Metadata,
+    pub tables: Vec<Table>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub detected_languages: Option<Vec<String>>,
+    /// Text chunks when chunking is enabled.
+    ///
+    /// When chunking configuration is provided, the content is split into
+    /// overlapping chunks for efficient processing. Each chunk contains the text,
+    /// optional embeddings (if enabled), and metadata about its position.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub chunks: Option<Vec<Chunk>>,
+    /// Extracted images from the document.
+    ///
+    /// When image extraction is enabled via `ImageExtractionConfig`, this field
+    /// contains all images found in the document with their raw data and metadata.
+    /// Each image may optionally contain a nested `ocr_result` if OCR was performed.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub images: Option<Vec<ExtractedImage>>,
+    /// Per-page content when page extraction is enabled.
+    ///
+    /// When page extraction is configured, the document is split into per-page content
+    /// with tables and images mapped to their respective pages.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub pages: Option<Vec<PageContent>>,
+}
+/// Format-specific metadata (discriminated union).
+///
+/// Only one format type can exist per extraction result. This provides
+/// type-safe, clean metadata without nested optionals.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+#[serde(tag = "format_type", rename_all = "snake_case")]
+pub enum FormatMetadata {
+    #[cfg(feature = "pdf")]
+    Pdf(PdfMetadata),
+    Excel(ExcelMetadata),
+    Email(EmailMetadata),
+    Pptx(PptxMetadata),
+    Archive(ArchiveMetadata),
+    Image(ImageMetadata),
+    Xml(XmlMetadata),
+    Text(TextMetadata),
+    Html(Box<HtmlMetadata>),
+    Ocr(OcrMetadata),
+}
+/// Extraction result metadata.
+///
+/// Contains common fields applicable to all formats, format-specific metadata
+/// via a discriminated union, and additional custom fields from postprocessors.
+#[derive(Debug, Clone, Serialize, Deserialize, Default)]
+pub struct Metadata {
+    /// Document title
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub title: Option<String>,
+    /// Document subject or description
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub subject: Option<String>,
+    /// Primary author(s) - always Vec for consistency
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub authors: Option<Vec<String>>,
+    /// Keywords/tags - always Vec for consistency
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub keywords: Option<Vec<String>>,
+    /// Primary language (ISO 639 code)
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub language: Option<String>,
+    /// Creation timestamp (ISO 8601 format)
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub created_at: Option<String>,
+    /// Last modification timestamp (ISO 8601 format)
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub modified_at: Option<String>,
+    /// User who created the document
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub created_by: Option<String>,
+    /// User who last modified the document
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub modified_by: Option<String>,
+    /// Page/slide/sheet structure with boundaries
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub pages: Option<PageStructure>,
+    /// Document date (DEPRECATED - use created_at/modified_at instead)
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub date: Option<String>,
+    /// Format-specific metadata (discriminated union)
+    ///
+    /// Contains detailed metadata specific to the document format.
+    /// Serializes with a `format_type` discriminator field.
+    #[serde(flatten, skip_serializing_if = "Option::is_none")]
+    pub format: Option<FormatMetadata>,
+    /// Image preprocessing metadata (when OCR preprocessing was applied)
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub image_preprocessing: Option<ImagePreprocessingMetadata>,
+    /// JSON schema (for structured data extraction)
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub json_schema: Option<serde_json::Value>,
+    /// Error metadata (for batch operations)
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub error: Option<ErrorMetadata>,
+    /// Additional custom fields from postprocessors.
+    ///
+    /// This flattened HashMap allows Python/TypeScript postprocessors to add
+    /// arbitrary fields (entity extraction, keyword extraction, etc.).
+    /// Fields are merged at the root level during serialization.
+    #[serde(flatten)]
+    pub additional: HashMap<String, serde_json::Value>,
+}
+/// Unified page structure for documents.
+///
+/// Supports different page types (PDF pages, PPTX slides, Excel sheets)
+/// with byte offset boundaries for chunk-to-page mapping.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct PageStructure {
+    /// Total number of pages/slides/sheets
+    pub total_count: usize,
+    /// Type of paginated unit
+    pub unit_type: PageUnitType,
+    /// Byte offset boundaries for each page
+    ///
+    /// Maps byte ranges in the extracted content to page numbers.
+    /// Used for chunk page range calculation.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub boundaries: Option<Vec<PageBoundary>>,
+    /// Detailed per-page metadata (optional, only when needed)
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub pages: Option<Vec<PageInfo>>,
+}
+/// Type of paginated unit in a document.
+///
+/// Distinguishes between different types of "pages" (PDF pages, presentation slides, spreadsheet sheets).
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
+#[serde(rename_all = "snake_case")]
+pub enum PageUnitType {
+    /// Standard document pages (PDF, DOCX, images)
+    Page,
+    /// Presentation slides (PPTX, ODP)
+    Slide,
+    /// Spreadsheet sheets (XLSX, ODS)
+    Sheet,
+}
+/// Byte offset boundary for a page.
+///
+/// Tracks where a specific page's content starts and ends in the main content string,
+/// enabling mapping from byte positions to page numbers. Offsets land on valid UTF-8
+/// character boundaries when the content string is built with standard String methods (push_str, push, etc.).
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct PageBoundary {
+    /// Byte offset where this page starts in the content string (UTF-8 valid boundary, inclusive)
+    pub byte_start: usize,
+    /// Byte offset where this page ends in the content string (UTF-8 valid boundary, exclusive)
+    pub byte_end: usize,
+    /// Page number (1-indexed)
+    pub page_number: usize,
+}
+/// Metadata for individual page/slide/sheet.
+///
+/// Captures per-page information including dimensions, content counts,
+/// and visibility state (for presentations).
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct PageInfo {
+    /// Page number (1-indexed)
+    pub number: usize,
+    /// Page title (usually for presentations)
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub title: Option<String>,
+    /// Dimensions in points (PDF) or pixels (images): (width, height)
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub dimensions: Option<(f64, f64)>,
+    /// Number of images on this page
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub image_count: Option<usize>,
+    /// Number of tables on this page
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub table_count: Option<usize>,
+    /// Whether this page is hidden (e.g., in presentations)
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub hidden: Option<bool>,
+}
+/// Content for a single page/slide.
+///
+/// When page extraction is enabled, documents are split into per-page content
+/// with associated tables and images mapped to each page.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct PageContent {
+    /// Page number (1-indexed)
+    pub page_number: usize,
+    /// Text content for this page
+    pub content: String,
+    /// Tables found on this page
+    #[serde(skip_serializing_if = "Vec::is_empty", default)]
+    pub tables: Vec<Table>,
+    /// Images found on this page
+    #[serde(skip_serializing_if = "Vec::is_empty", default)]
+    pub images: Vec<ExtractedImage>,
+}
+/// Excel/spreadsheet metadata.
+///
+/// Contains information about sheets in Excel, LibreOffice Calc, and other
+/// spreadsheet formats (.xlsx, .xls, .ods, etc.).
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct ExcelMetadata {
+    /// Total number of sheets in the workbook
+    pub sheet_count: usize,
+    /// Names of all sheets in order
+    pub sheet_names: Vec<String>,
+}
+/// Email metadata extracted from .eml and .msg files.
+///
+/// Includes sender/recipient information, message ID, and attachment list.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct EmailMetadata {
+    /// Sender's email address
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub from_email: Option<String>,
+    /// Sender's display name
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub from_name: Option<String>,
+    /// Primary recipients
+    pub to_emails: Vec<String>,
+    /// CC recipients
+    pub cc_emails: Vec<String>,
+    /// BCC recipients
+    pub bcc_emails: Vec<String>,
+    /// Message-ID header value
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub message_id: Option<String>,
+    /// List of attachment filenames
+    pub attachments: Vec<String>,
+}
+/// Archive (ZIP/TAR/7Z) metadata.
+///
+/// Extracted from compressed archive files; contains the file list and size information.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct ArchiveMetadata {
+    /// Archive format ("ZIP", "TAR", "7Z", etc.)
+    pub format: String,
+    /// Total number of files in the archive
+    pub file_count: usize,
+    /// List of file paths within the archive
+    pub file_list: Vec<String>,
+    /// Total uncompressed size in bytes
+    pub total_size: usize,
+    /// Compressed size in bytes (if available)
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub compressed_size: Option<usize>,
+}
+/// Image metadata extracted from image files.
+///
+/// Includes dimensions, format, and EXIF data.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct ImageMetadata {
+    /// Image width in pixels
+    pub width: u32,
+    /// Image height in pixels
+    pub height: u32,
+    /// Image format (e.g., "PNG", "JPEG", "TIFF")
+    pub format: String,
+    /// EXIF metadata tags
+    pub exif: HashMap<String, String>,
+}
+/// XML metadata extracted during XML parsing.
+///
+/// Provides statistics about XML document structure.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct XmlMetadata {
+    /// Total number of XML elements processed
+    pub element_count: usize,
+    /// List of unique element tag names (sorted)
+    pub unique_elements: Vec<String>,
+}
+/// Text/Markdown metadata.
+///
+/// Extracted from plain text and Markdown files. Includes word counts and,
+/// for Markdown, structural elements like headers and links.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct TextMetadata {
+    /// Number of lines in the document
+    pub line_count: usize,
+    /// Number of words
+    pub word_count: usize,
+    /// Number of characters
+    pub character_count: usize,
+    /// Markdown headers (heading text only, for Markdown files)
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub headers: Option<Vec<String>>,
+    /// Markdown links as (text, URL) tuples (for Markdown files)
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub links: Option<Vec<(String, String)>>,
+    /// Code blocks as (language, code) tuples (for Markdown files)
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub code_blocks: Option<Vec<(String, String)>>,
+}
+/// HTML metadata extracted from HTML documents.
+///
+/// Includes meta tags, Open Graph data, Twitter Card metadata, and link relations.
+#[derive(Debug, Clone, Serialize, Deserialize, Default)]
+pub struct HtmlMetadata {
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub title: Option<String>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub description: Option<String>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub keywords: Option<String>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub author: Option<String>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub canonical: Option<String>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub base_href: Option<String>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub og_title: Option<String>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub og_description: Option<String>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub og_image: Option<String>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub og_url: Option<String>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub og_type: Option<String>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub og_site_name: Option<String>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub twitter_card: Option<String>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub twitter_title: Option<String>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub twitter_description: Option<String>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub twitter_image: Option<String>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub twitter_site: Option<String>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub twitter_creator: Option<String>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub link_author: Option<String>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub link_license: Option<String>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub link_alternate: Option<String>,
+}
+/// OCR processing metadata.
+///
+/// Captures information about OCR processing configuration and results.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct OcrMetadata {
+    /// OCR language code(s) used
+    pub language: String,
+    /// Tesseract Page Segmentation Mode (PSM)
+    pub psm: i32,
+    /// Output format (e.g., "text", "hocr")
+    pub output_format: String,
+    /// Number of tables detected
+    pub table_count: usize,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub table_rows: Option<usize>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub table_cols: Option<usize>,
+}
+/// Error metadata (for batch operations).
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct ErrorMetadata {
+    pub error_type: String,
+    pub message: String,
+}
+/// Extracted table structure.
+///
+/// Represents a table detected and extracted from a document (PDF, image, etc.).
+/// Tables are converted to both structured cell data and Markdown format.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct Table {
+    /// Table cells as a 2D vector (rows × columns)
+    pub cells: Vec<Vec<String>>,
+    /// Markdown representation of the table
+    pub markdown: String,
+    /// Page number where the table was found (1-indexed)
+    pub page_number: usize,
+}
+/// A text chunk with optional embedding and metadata.
+///
+/// Chunks are created when chunking is enabled in `ExtractionConfig`. Each chunk
+/// contains the text content, optional embedding vector (if embedding generation
+/// is configured), and metadata about its position in the document.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct Chunk {
+    /// The text content of this chunk.
+    pub content: String,
+    /// Optional embedding vector for this chunk.
+    ///
+    /// Only populated when `EmbeddingConfig` is provided in chunking configuration.
+    /// The dimensionality depends on the chosen embedding model.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub embedding: Option<Vec<f32>>,
+    /// Metadata about this chunk's position and properties.
+    pub metadata: ChunkMetadata,
+}
+/// Metadata about a chunk's position in the original document.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct ChunkMetadata {
+    /// Byte offset where this chunk starts in the original text (UTF-8 valid boundary).
+    pub byte_start: usize,
+    /// Byte offset where this chunk ends in the original text (UTF-8 valid boundary).
+    pub byte_end: usize,
+    /// Number of tokens in this chunk (if available).
+    ///
+    /// This is calculated by the embedding model's tokenizer if embeddings are enabled.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub token_count: Option<usize>,
+    /// Zero-based index of this chunk in the document.
+    pub chunk_index: usize,
+    /// Total number of chunks in the document.
+    pub total_chunks: usize,
+    /// First page number this chunk spans (1-indexed).
+    ///
+    /// Only populated when page tracking is enabled in extraction configuration.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub first_page: Option<usize>,
+    /// Last page number this chunk spans (1-indexed, equal to first_page for single-page chunks).
+    ///
+    /// Only populated when page tracking is enabled in extraction configuration.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub last_page: Option<usize>,
+}
+/// Extracted image from a document.
+///
+/// Contains raw image data, metadata, and optional nested OCR results.
+/// Raw bytes allow cross-language compatibility - users can convert to
+/// PIL.Image (Python), Sharp (Node.js), or other formats as needed.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct ExtractedImage {
+    /// Raw image data (PNG, JPEG, WebP, etc. bytes)
+    pub data: Vec<u8>,
+    /// Image format (e.g., "jpeg", "png", "webp")
+    pub format: String,
+    /// Zero-indexed position of this image in the document/page
+    pub image_index: usize,
+    /// Page/slide number where image was found (1-indexed)
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub page_number: Option<usize>,
+    /// Image width in pixels
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub width: Option<u32>,
+    /// Image height in pixels
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub height: Option<u32>,
+    /// Colorspace information (e.g., "RGB", "CMYK", "Gray")
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub colorspace: Option<String>,
+    /// Bits per color component (e.g., 8, 16)
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub bits_per_component: Option<u32>,
+    /// Whether this image is a mask image
+    #[serde(default)]
+    pub is_mask: bool,
+    /// Optional description of the image
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub description: Option<String>,
+    /// Nested OCR extraction result (if image was OCRed)
+    ///
+    /// When OCR is performed on this image, the result is embedded here
+    /// rather than in a separate collection, making the relationship explicit.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub ocr_result: Option<Box<ExtractionResult>>,
+}
+/// Excel workbook representation.
+///
+/// Contains all sheets from an Excel file (.xlsx, .xls, etc.) with
+/// extracted content and metadata.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct ExcelWorkbook {
+    /// All sheets in the workbook
+    pub sheets: Vec<ExcelSheet>,
+    /// Workbook-level metadata (author, creation date, etc.)
+    pub metadata: HashMap<String, String>,
+}
+/// Single Excel worksheet.
+///
+/// Represents one sheet from an Excel workbook with its content
+/// converted to Markdown format and dimensional statistics.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct ExcelSheet {
+    /// Sheet name as it appears in Excel
+    pub name: String,
+    /// Sheet content converted to Markdown tables
+    pub markdown: String,
+    /// Number of rows
+    pub row_count: usize,
+    /// Number of columns
+    pub col_count: usize,
+    /// Total number of non-empty cells
+    pub cell_count: usize,
+}
+/// XML extraction result.
+///
+/// Contains extracted text content from XML files along with
+/// structural statistics about the XML document.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct XmlExtractionResult {
+    /// Extracted text content (XML structure filtered out)
+    pub content: String,
+    /// Total number of XML elements processed
+    pub element_count: usize,
+    /// List of unique element names found (sorted)
+    pub unique_elements: Vec<String>,
+}
+/// Plain text and Markdown extraction result.
+///
+/// Contains the extracted text along with statistics and,
+/// for Markdown files, structural elements like headers and links.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct TextExtractionResult {
+    /// Extracted text content
+    pub content: String,
+    /// Number of lines
+    pub line_count: usize,
+    /// Number of words
+    pub word_count: usize,
+    /// Number of characters
+    pub character_count: usize,
+    /// Markdown headers (text only, Markdown files only)
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub headers: Option<Vec<String>>,
+    /// Markdown links as (text, URL) tuples (Markdown files only)
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub links: Option<Vec<(String, String)>>,
+    /// Code blocks as (language, code) tuples (Markdown files only)
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub code_blocks: Option<Vec<(String, String)>>,
+}
+/// PowerPoint (PPTX) extraction result.
+///
+/// Contains extracted slide content, metadata, and embedded images/tables.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct PptxExtractionResult {
+    /// Extracted text content from all slides
+    pub content: String,
+    /// Presentation metadata
+    pub metadata: PptxMetadata,
+    /// Total number of slides
+    pub slide_count: usize,
+    /// Total number of embedded images
+    pub image_count: usize,
+    /// Total number of tables
+    pub table_count: usize,
+    /// Extracted images from the presentation
+    pub images: Vec<ExtractedImage>,
+    /// Slide structure with boundaries (when page tracking is enabled)
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub page_structure: Option<PageStructure>,
+    /// Per-slide content (when page tracking is enabled)
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub page_contents: Option<Vec<PageContent>>,
+}
+/// PowerPoint presentation metadata.
+///
+/// Contains PPTX-specific metadata. Common fields like title, author, and description
+/// are now in the base `Metadata` struct.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct PptxMetadata {
+    /// List of fonts used in the presentation
+    #[serde(skip_serializing_if = "Vec::is_empty", default)]
+    pub fonts: Vec<String>,
+}
+/// Email extraction result.
+///
+/// Complete representation of an extracted email message (.eml or .msg)
+/// including headers, body content, and attachments.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct EmailExtractionResult {
+    /// Email subject line
+    pub subject: Option<String>,
+    /// Sender email address
+    pub from_email: Option<String>,
+    /// Primary recipient email addresses
+    pub to_emails: Vec<String>,
+    /// CC recipient email addresses
+    pub cc_emails: Vec<String>,
+    /// BCC recipient email addresses
+    pub bcc_emails: Vec<String>,
+    /// Email date/timestamp
+    pub date: Option<String>,
+    /// Message-ID header value
+    pub message_id: Option<String>,
+    /// Plain text version of the email body
+    pub plain_text: Option<String>,
+    /// HTML version of the email body
+    pub html_content: Option<String>,
+    /// Cleaned/processed text content
+    pub cleaned_text: String,
+    /// List of email attachments
+    pub attachments: Vec<EmailAttachment>,
+    /// Additional email headers and metadata
+    pub metadata: HashMap<String, String>,
+}
+/// Email attachment representation.
+///
+/// Contains metadata and optionally the content of an email attachment.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct EmailAttachment {
+    /// Attachment name (from Content-Disposition header)
+    pub name: Option<String>,
+    /// Filename of the attachment
+    pub filename: Option<String>,
+    /// MIME type of the attachment
+    pub mime_type: Option<String>,
+    /// Size in bytes
+    pub size: Option<usize>,
+    /// Whether this attachment is an image
+    pub is_image: bool,
+    /// Attachment data (if extracted)
+    pub data: Option<Vec<u8>>,
+}
+/// OCR extraction result.
+///
+/// Result of performing OCR on an image or scanned document,
+/// including recognized text and detected tables.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct OcrExtractionResult {
+    /// Recognized text content
+    pub content: String,
+    /// Original MIME type of the processed image
+    pub mime_type: String,
+    /// OCR processing metadata (confidence scores, language, etc.)
+    pub metadata: HashMap<String, serde_json::Value>,
+    /// Tables detected and extracted via OCR
+    pub tables: Vec<OcrTable>,
+}
+/// Table detected via OCR.
+///
+/// Represents a table structure recognized during OCR processing.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct OcrTable {
+    /// Table cells as a 2D vector (rows × columns)
+    pub cells: Vec<Vec<String>>,
+    /// Markdown representation of the table
+    pub markdown: String,
+    /// Page number where the table was found (1-indexed)
+    pub page_number: usize,
+}
+/// Image preprocessing configuration for OCR.
+///
+/// These settings control how images are preprocessed before OCR to improve
+/// text recognition quality. Different preprocessing strategies work better
+/// for different document types.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+#[serde(default)]
+pub struct ImagePreprocessingConfig {
+    /// Target DPI for the image (300 is standard, 600 for small text).
+    pub target_dpi: i32,
+    /// Auto-detect and correct image rotation.
+    pub auto_rotate: bool,
+    /// Correct skew (tilted images).
+    pub deskew: bool,
+    /// Remove noise from the image.
+    pub denoise: bool,
+    /// Enhance contrast for better text visibility.
+    pub contrast_enhance: bool,
+    /// Binarization method: "otsu", "sauvola", "adaptive".
+    pub binarization_method: String,
+    /// Invert colors (white text on black → black on white).
+    pub invert_colors: bool,
+}
+impl Default for ImagePreprocessingConfig {
+    fn default() -> Self {
+        Self {
+            target_dpi: 300,
+            auto_rotate: true,
+            deskew: true,
+            denoise: false,
+            contrast_enhance: false,
+            binarization_method: "otsu".to_string(),
+            invert_colors: false,
+        }
+    }
+}
+/// Tesseract OCR configuration.
+///
+/// Provides fine-grained control over Tesseract OCR engine parameters.
+/// Most users can use the defaults, but these settings allow optimization
+/// for specific document types (invoices, handwriting, etc.).
+#[derive(Debug, Clone, Serialize, Deserialize)]
+#[serde(default)]
+pub struct TesseractConfig {
+    /// Language code (e.g., "eng", "deu", "fra")
+    pub language: String,
+    /// Page Segmentation Mode (0-13).
+    ///
+    /// Common values:
+    /// - 3: Fully automatic page segmentation (default)
+    /// - 6: Assume a single uniform block of text
+    /// - 11: Sparse text with no particular order
+    pub psm: i32,
+    /// Output format ("text" or "markdown")
+    pub output_format: String,
+    /// OCR Engine Mode (0-3).
+    ///
+    /// - 0: Legacy engine only
+    /// - 1: Neural nets (LSTM) only (usually best)
+    /// - 2: Legacy + LSTM
+    /// - 3: Default (based on what's available)
+    pub oem: i32,
+    /// Minimum confidence threshold (0.0-100.0).
+    ///
+    /// Words with confidence below this threshold may be rejected or flagged.
+    pub min_confidence: f64,
+    /// Image preprocessing configuration.
+    ///
+    /// Controls how images are preprocessed before OCR. Can significantly
+    /// improve quality for scanned documents or low-quality images.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub preprocessing: Option<ImagePreprocessingConfig>,
+    /// Enable automatic table detection and reconstruction
+    pub enable_table_detection: bool,
+    /// Minimum confidence threshold for table detection (0.0-1.0)
+    pub table_min_confidence: f64,
+    /// Column threshold for table detection (pixels)
+    pub table_column_threshold: i32,
+    /// Row threshold ratio for table detection (0.0-1.0)
+    pub table_row_threshold_ratio: f64,
+    /// Enable OCR result caching
+    pub use_cache: bool,
+    /// Use pre-adapted templates for character classification
+    pub classify_use_pre_adapted_templates: bool,
+    /// Enable N-gram language model
+    pub language_model_ngram_on: bool,
+    /// Don't reject good words during block-level processing
+    pub tessedit_dont_blkrej_good_wds: bool,
+    /// Don't reject good words during row-level processing
+    pub tessedit_dont_rowrej_good_wds: bool,
+    /// Enable dictionary correction
+    pub tessedit_enable_dict_correction: bool,
+    /// Whitelist of allowed characters (empty = all allowed)
+    pub tessedit_char_whitelist: String,
+    /// Blacklist of forbidden characters (empty = none forbidden)
+    pub tessedit_char_blacklist: String,
+    /// Use primary language params model
+    pub tessedit_use_primary_params_model: bool,
+    /// Variable-width space detection
+    pub textord_space_size_is_variable: bool,
+    /// Use adaptive thresholding method
+    pub thresholding_method: bool,
+}
+impl Default for TesseractConfig {
+    fn default() -> Self {
+        Self {
+            language: "eng".to_string(),
+            psm: 3,
+            output_format: "markdown".to_string(),
+            oem: 3,
+            min_confidence: 0.0,
+            preprocessing: None,
+            enable_table_detection: true,
+            table_min_confidence: 0.0,
+            table_column_threshold: 50,
+            table_row_threshold_ratio: 0.5,
+            use_cache: true,
+            classify_use_pre_adapted_templates: true,
+            language_model_ngram_on: false,
+            tessedit_dont_blkrej_good_wds: true,
+            tessedit_dont_rowrej_good_wds: true,
+            tessedit_enable_dict_correction: true,
+            tessedit_char_whitelist: String::new(),
+            tessedit_char_blacklist: String::new(),
+            tessedit_use_primary_params_model: true,
+            textord_space_size_is_variable: true,
+            thresholding_method: false,
+        }
+    }
+}
+/// Image preprocessing metadata.
+///
+/// Tracks the transformations applied to an image during OCR preprocessing,
+/// including DPI normalization, resizing, and resampling.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct ImagePreprocessingMetadata {
+    /// Original image dimensions (width, height) in pixels
+    pub original_dimensions: (usize, usize),
+    /// Original image DPI (horizontal, vertical)
+    pub original_dpi: (f64, f64),
+    /// Target DPI from configuration
+    pub target_dpi: i32,
+    /// Scaling factor applied to the image
+    pub scale_factor: f64,
+    /// Whether DPI was auto-adjusted based on content
+    pub auto_adjusted: bool,
+    /// Final DPI after processing
+    pub final_dpi: i32,
+    /// New dimensions after resizing (if resized)
+    pub new_dimensions: Option<(usize, usize)>,
+    /// Resampling algorithm used ("LANCZOS3", "CATMULLROM", etc.)
+    pub resample_method: String,
+    /// Whether dimensions were clamped to max_image_dimension
+    pub dimension_clamped: bool,
+    /// Calculated optimal DPI (if auto_adjust_dpi enabled)
+    pub calculated_dpi: Option<i32>,
+    /// Whether resize was skipped (dimensions already optimal)
+    pub skipped_resize: bool,
+    /// Error message if resize failed
+    pub resize_error: Option<String>,
+}
+/// Image extraction configuration (internal use).
+///
+/// **Note:** This is an internal type used for image preprocessing.
+/// For the main extraction configuration, see [`crate::core::config::ExtractionConfig`].
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct ExtractionConfig {
+    /// Target DPI for image normalization
+    pub target_dpi: i32,
+    /// Maximum image dimension (width or height)
+    pub max_image_dimension: i32,
+    /// Whether to auto-adjust DPI based on content
+    pub auto_adjust_dpi: bool,
+    /// Minimum DPI threshold
+    pub min_dpi: i32,
+    /// Maximum DPI threshold
+    pub max_dpi: i32,
+}
+impl Default for ExtractionConfig {
+    fn default() -> Self {
+        Self {
+            target_dpi: 300,
+            max_image_dimension: 4096,
+            auto_adjust_dpi: true,
+            min_dpi: 72,
+            max_dpi: 600,
+        }
+    }
+}
+/// Cache statistics.
+///
+/// Provides information about the extraction result cache,
+/// including size, file count, and age distribution.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct CacheStats {
+    /// Total number of cached files
+    pub total_files: usize,
+    /// Total cache size in megabytes
+    pub total_size_mb: f64,
+    /// Available disk space in megabytes
+    pub available_space_mb: f64,
+    /// Age of the oldest cached file in days
+    pub oldest_file_age_days: f64,
+    /// Age of the newest cached file in days
+    pub newest_file_age_days: f64,
+}
+/// LibreOffice conversion result.
+///
+/// Result of converting a legacy office document (e.g., .doc, .ppt)
+/// to a modern format using LibreOffice.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct LibreOfficeConversionResult {
+    /// Converted file bytes
+    pub converted_bytes: Vec<u8>,
+    /// Original format identifier
+    pub original_format: String,
+    /// Target format identifier
+    pub target_format: String,
+    /// Target MIME type after conversion
+    pub target_mime: String,
+}
+#[cfg(test)]
+mod tests {
+    use super::*;
+    #[test]
+    fn test_metadata_serialization_with_format() {
+        let mut metadata = Metadata {
+            format: Some(FormatMetadata::Text(TextMetadata {
+                line_count: 1,
+                word_count: 2,
+                character_count: 13,
+                headers: None,
+                links: None,
+                code_blocks: None,
+            })),
+            ..Default::default()
+        };
+        metadata
+            .additional
+            .insert("quality_score".to_string(), serde_json::json!(1.0));
+        let json = serde_json::to_value(&metadata).unwrap();
+        println!("Serialized metadata: {}", serde_json::to_string_pretty(&json).unwrap());
+        assert!(
+            json.get("format_type").is_some(),
+            "format_type should be present in serialized JSON"
+        );
+        assert_eq!(json.get("format_type").unwrap(), "text");
+        assert_eq!(json.get("line_count").unwrap(), 1);
+        assert_eq!(json.get("word_count").unwrap(), 2);
+        assert_eq!(json.get("character_count").unwrap(), 13);
+        assert_eq!(json.get("quality_score").unwrap(), 1.0);
+    }
+}