kreuzberg 4.0.0.pre.rc.13 → 4.0.0.pre.rc.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +14 -14
- data/.rspec +3 -3
- data/.rubocop.yaml +1 -1
- data/.rubocop.yml +538 -538
- data/Gemfile +8 -8
- data/Gemfile.lock +105 -2
- data/README.md +454 -454
- data/Rakefile +33 -25
- data/Steepfile +47 -47
- data/examples/async_patterns.rb +341 -341
- data/ext/kreuzberg_rb/extconf.rb +45 -45
- data/ext/kreuzberg_rb/native/.cargo/config.toml +2 -2
- data/ext/kreuzberg_rb/native/Cargo.lock +6940 -6941
- data/ext/kreuzberg_rb/native/Cargo.toml +54 -54
- data/ext/kreuzberg_rb/native/README.md +425 -425
- data/ext/kreuzberg_rb/native/build.rs +15 -15
- data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -11
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -14
- data/ext/kreuzberg_rb/native/include/strings.h +20 -20
- data/ext/kreuzberg_rb/native/include/unistd.h +47 -47
- data/ext/kreuzberg_rb/native/src/lib.rs +3158 -3158
- data/extconf.rb +28 -28
- data/kreuzberg.gemspec +214 -214
- data/lib/kreuzberg/api_proxy.rb +142 -142
- data/lib/kreuzberg/cache_api.rb +81 -81
- data/lib/kreuzberg/cli.rb +55 -55
- data/lib/kreuzberg/cli_proxy.rb +127 -127
- data/lib/kreuzberg/config.rb +724 -724
- data/lib/kreuzberg/error_context.rb +80 -80
- data/lib/kreuzberg/errors.rb +118 -118
- data/lib/kreuzberg/extraction_api.rb +340 -340
- data/lib/kreuzberg/mcp_proxy.rb +186 -186
- data/lib/kreuzberg/ocr_backend_protocol.rb +113 -113
- data/lib/kreuzberg/post_processor_protocol.rb +86 -86
- data/lib/kreuzberg/result.rb +279 -279
- data/lib/kreuzberg/setup_lib_path.rb +80 -80
- data/lib/kreuzberg/validator_protocol.rb +89 -89
- data/lib/kreuzberg/version.rb +5 -5
- data/lib/kreuzberg.rb +109 -109
- data/lib/{pdfium.dll → libpdfium.dylib} +0 -0
- data/sig/kreuzberg/internal.rbs +184 -184
- data/sig/kreuzberg.rbs +546 -546
- data/spec/binding/cache_spec.rb +227 -227
- data/spec/binding/cli_proxy_spec.rb +85 -85
- data/spec/binding/cli_spec.rb +55 -55
- data/spec/binding/config_spec.rb +345 -345
- data/spec/binding/config_validation_spec.rb +283 -283
- data/spec/binding/error_handling_spec.rb +213 -213
- data/spec/binding/errors_spec.rb +66 -66
- data/spec/binding/plugins/ocr_backend_spec.rb +307 -307
- data/spec/binding/plugins/postprocessor_spec.rb +269 -269
- data/spec/binding/plugins/validator_spec.rb +274 -274
- data/spec/fixtures/config.toml +39 -39
- data/spec/fixtures/config.yaml +41 -41
- data/spec/fixtures/invalid_config.toml +4 -4
- data/spec/smoke/package_spec.rb +178 -178
- data/spec/spec_helper.rb +42 -42
- data/vendor/Cargo.toml +1 -1
- data/vendor/kreuzberg/Cargo.toml +5 -5
- data/vendor/kreuzberg/README.md +230 -230
- data/vendor/kreuzberg/benches/otel_overhead.rs +48 -48
- data/vendor/kreuzberg/build.rs +843 -843
- data/vendor/kreuzberg/src/api/error.rs +81 -81
- data/vendor/kreuzberg/src/api/handlers.rs +199 -199
- data/vendor/kreuzberg/src/api/mod.rs +79 -79
- data/vendor/kreuzberg/src/api/server.rs +353 -353
- data/vendor/kreuzberg/src/api/types.rs +170 -170
- data/vendor/kreuzberg/src/cache/mod.rs +1167 -1167
- data/vendor/kreuzberg/src/chunking/mod.rs +1877 -1877
- data/vendor/kreuzberg/src/chunking/processor.rs +220 -220
- data/vendor/kreuzberg/src/core/batch_mode.rs +95 -95
- data/vendor/kreuzberg/src/core/config.rs +1080 -1080
- data/vendor/kreuzberg/src/core/extractor.rs +1156 -1156
- data/vendor/kreuzberg/src/core/io.rs +329 -329
- data/vendor/kreuzberg/src/core/mime.rs +605 -605
- data/vendor/kreuzberg/src/core/mod.rs +47 -47
- data/vendor/kreuzberg/src/core/pipeline.rs +1184 -1184
- data/vendor/kreuzberg/src/embeddings.rs +500 -500
- data/vendor/kreuzberg/src/error.rs +431 -431
- data/vendor/kreuzberg/src/extraction/archive.rs +954 -954
- data/vendor/kreuzberg/src/extraction/docx.rs +398 -398
- data/vendor/kreuzberg/src/extraction/email.rs +854 -854
- data/vendor/kreuzberg/src/extraction/excel.rs +688 -688
- data/vendor/kreuzberg/src/extraction/html.rs +601 -601
- data/vendor/kreuzberg/src/extraction/image.rs +491 -491
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +574 -574
- data/vendor/kreuzberg/src/extraction/markdown.rs +213 -213
- data/vendor/kreuzberg/src/extraction/mod.rs +81 -81
- data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -398
- data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -247
- data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -240
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -130
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +284 -284
- data/vendor/kreuzberg/src/extraction/pptx.rs +3100 -3100
- data/vendor/kreuzberg/src/extraction/structured.rs +490 -490
- data/vendor/kreuzberg/src/extraction/table.rs +328 -328
- data/vendor/kreuzberg/src/extraction/text.rs +269 -269
- data/vendor/kreuzberg/src/extraction/xml.rs +333 -333
- data/vendor/kreuzberg/src/extractors/archive.rs +447 -447
- data/vendor/kreuzberg/src/extractors/bibtex.rs +470 -470
- data/vendor/kreuzberg/src/extractors/docbook.rs +504 -504
- data/vendor/kreuzberg/src/extractors/docx.rs +400 -400
- data/vendor/kreuzberg/src/extractors/email.rs +157 -157
- data/vendor/kreuzberg/src/extractors/epub.rs +708 -708
- data/vendor/kreuzberg/src/extractors/excel.rs +345 -345
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +492 -492
- data/vendor/kreuzberg/src/extractors/html.rs +407 -407
- data/vendor/kreuzberg/src/extractors/image.rs +219 -219
- data/vendor/kreuzberg/src/extractors/jats.rs +1054 -1054
- data/vendor/kreuzberg/src/extractors/jupyter.rs +368 -368
- data/vendor/kreuzberg/src/extractors/latex.rs +653 -653
- data/vendor/kreuzberg/src/extractors/markdown.rs +701 -701
- data/vendor/kreuzberg/src/extractors/mod.rs +429 -429
- data/vendor/kreuzberg/src/extractors/odt.rs +628 -628
- data/vendor/kreuzberg/src/extractors/opml.rs +635 -635
- data/vendor/kreuzberg/src/extractors/orgmode.rs +529 -529
- data/vendor/kreuzberg/src/extractors/pdf.rs +749 -749
- data/vendor/kreuzberg/src/extractors/pptx.rs +267 -267
- data/vendor/kreuzberg/src/extractors/rst.rs +577 -577
- data/vendor/kreuzberg/src/extractors/rtf.rs +809 -809
- data/vendor/kreuzberg/src/extractors/security.rs +484 -484
- data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -367
- data/vendor/kreuzberg/src/extractors/structured.rs +142 -142
- data/vendor/kreuzberg/src/extractors/text.rs +265 -265
- data/vendor/kreuzberg/src/extractors/typst.rs +651 -651
- data/vendor/kreuzberg/src/extractors/xml.rs +147 -147
- data/vendor/kreuzberg/src/image/dpi.rs +164 -164
- data/vendor/kreuzberg/src/image/mod.rs +6 -6
- data/vendor/kreuzberg/src/image/preprocessing.rs +417 -417
- data/vendor/kreuzberg/src/image/resize.rs +89 -89
- data/vendor/kreuzberg/src/keywords/config.rs +154 -154
- data/vendor/kreuzberg/src/keywords/mod.rs +237 -237
- data/vendor/kreuzberg/src/keywords/processor.rs +275 -275
- data/vendor/kreuzberg/src/keywords/rake.rs +293 -293
- data/vendor/kreuzberg/src/keywords/types.rs +68 -68
- data/vendor/kreuzberg/src/keywords/yake.rs +163 -163
- data/vendor/kreuzberg/src/language_detection/mod.rs +985 -985
- data/vendor/kreuzberg/src/language_detection/processor.rs +219 -219
- data/vendor/kreuzberg/src/lib.rs +113 -113
- data/vendor/kreuzberg/src/mcp/mod.rs +35 -35
- data/vendor/kreuzberg/src/mcp/server.rs +2076 -2076
- data/vendor/kreuzberg/src/ocr/cache.rs +469 -469
- data/vendor/kreuzberg/src/ocr/error.rs +37 -37
- data/vendor/kreuzberg/src/ocr/hocr.rs +216 -216
- data/vendor/kreuzberg/src/ocr/mod.rs +58 -58
- data/vendor/kreuzberg/src/ocr/processor.rs +863 -863
- data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -4
- data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -144
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +452 -452
- data/vendor/kreuzberg/src/ocr/types.rs +393 -393
- data/vendor/kreuzberg/src/ocr/utils.rs +47 -47
- data/vendor/kreuzberg/src/ocr/validation.rs +206 -206
- data/vendor/kreuzberg/src/panic_context.rs +154 -154
- data/vendor/kreuzberg/src/pdf/bindings.rs +44 -44
- data/vendor/kreuzberg/src/pdf/bundled.rs +346 -346
- data/vendor/kreuzberg/src/pdf/error.rs +130 -130
- data/vendor/kreuzberg/src/pdf/images.rs +139 -139
- data/vendor/kreuzberg/src/pdf/metadata.rs +489 -489
- data/vendor/kreuzberg/src/pdf/mod.rs +68 -68
- data/vendor/kreuzberg/src/pdf/rendering.rs +368 -368
- data/vendor/kreuzberg/src/pdf/table.rs +420 -420
- data/vendor/kreuzberg/src/pdf/text.rs +240 -240
- data/vendor/kreuzberg/src/plugins/extractor.rs +1044 -1044
- data/vendor/kreuzberg/src/plugins/mod.rs +212 -212
- data/vendor/kreuzberg/src/plugins/ocr.rs +639 -639
- data/vendor/kreuzberg/src/plugins/processor.rs +650 -650
- data/vendor/kreuzberg/src/plugins/registry.rs +1339 -1339
- data/vendor/kreuzberg/src/plugins/traits.rs +258 -258
- data/vendor/kreuzberg/src/plugins/validator.rs +967 -967
- data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -1470
- data/vendor/kreuzberg/src/text/mod.rs +25 -25
- data/vendor/kreuzberg/src/text/quality.rs +697 -697
- data/vendor/kreuzberg/src/text/quality_processor.rs +219 -219
- data/vendor/kreuzberg/src/text/string_utils.rs +217 -217
- data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -164
- data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -100
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -796
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -902
- data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -160
- data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -619
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -147
- data/vendor/kreuzberg/src/types.rs +1055 -1055
- data/vendor/kreuzberg/src/utils/mod.rs +17 -17
- data/vendor/kreuzberg/src/utils/quality.rs +959 -959
- data/vendor/kreuzberg/src/utils/string_utils.rs +381 -381
- data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -53
- data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -482
- data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -261
- data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -400
- data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -1205
- data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -280
- data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -425
- data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -172
- data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -622
- data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -1300
- data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -175
- data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -734
- data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -37
- data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -100
- data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -801
- data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -693
- data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -111
- data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -162
- data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -226
- data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -41
- data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -196
- data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -227
- data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -181
- data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -791
- data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -47
- data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -760
- data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -634
- data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -136
- data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -84
- data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -681
- data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -64
- data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -51
- data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -476
- data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -163
- data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -1
- data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -101
- data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -477
- data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -490
- data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -415
- data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -223
- data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -331
- data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -562
- data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -436
- data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -561
- data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -193
- data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -448
- data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -32
- data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -33
- data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -76
- data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -129
- data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -54
- data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -118
- data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -149
- data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -506
- data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -75
- data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -519
- data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -647
- data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -62
- data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -796
- data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -31
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -52
- data/vendor/kreuzberg/tests/api_tests.rs +966 -966
- data/vendor/kreuzberg/tests/archive_integration.rs +545 -545
- data/vendor/kreuzberg/tests/batch_orchestration.rs +556 -556
- data/vendor/kreuzberg/tests/batch_processing.rs +318 -318
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -421
- data/vendor/kreuzberg/tests/concurrency_stress.rs +533 -533
- data/vendor/kreuzberg/tests/config_features.rs +612 -612
- data/vendor/kreuzberg/tests/config_loading_tests.rs +416 -416
- data/vendor/kreuzberg/tests/core_integration.rs +510 -510
- data/vendor/kreuzberg/tests/csv_integration.rs +414 -414
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +500 -500
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -122
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -370
- data/vendor/kreuzberg/tests/email_integration.rs +327 -327
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -275
- data/vendor/kreuzberg/tests/error_handling.rs +402 -402
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -228
- data/vendor/kreuzberg/tests/format_integration.rs +164 -164
- data/vendor/kreuzberg/tests/helpers/mod.rs +142 -142
- data/vendor/kreuzberg/tests/html_table_test.rs +551 -551
- data/vendor/kreuzberg/tests/image_integration.rs +255 -255
- data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -139
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -639
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -704
- data/vendor/kreuzberg/tests/keywords_integration.rs +479 -479
- data/vendor/kreuzberg/tests/keywords_quality.rs +509 -509
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -496
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -490
- data/vendor/kreuzberg/tests/mime_detection.rs +429 -429
- data/vendor/kreuzberg/tests/ocr_configuration.rs +514 -514
- data/vendor/kreuzberg/tests/ocr_errors.rs +698 -698
- data/vendor/kreuzberg/tests/ocr_quality.rs +629 -629
- data/vendor/kreuzberg/tests/ocr_stress.rs +469 -469
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +674 -674
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -616
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -822
- data/vendor/kreuzberg/tests/pdf_integration.rs +45 -45
- data/vendor/kreuzberg/tests/pdfium_linking.rs +374 -374
- data/vendor/kreuzberg/tests/pipeline_integration.rs +1436 -1436
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +776 -776
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +560 -560
- data/vendor/kreuzberg/tests/plugin_system.rs +927 -927
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -783
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +587 -587
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +694 -694
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +775 -775
- data/vendor/kreuzberg/tests/security_validation.rs +416 -416
- data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -888
- data/vendor/kreuzberg/tests/test_fastembed.rs +631 -631
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1260 -1260
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +648 -648
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -87
- data/vendor/kreuzberg-ffi/Cargo.toml +1 -1
- data/vendor/kreuzberg-ffi/README.md +851 -851
- data/vendor/kreuzberg-ffi/build.rs +176 -176
- data/vendor/kreuzberg-ffi/cbindgen.toml +27 -27
- data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +12 -12
- data/vendor/kreuzberg-ffi/kreuzberg.h +1087 -1087
- data/vendor/kreuzberg-ffi/src/lib.rs +3616 -3616
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +247 -247
- data/vendor/kreuzberg-ffi/tests.disabled/README.md +48 -48
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +299 -299
- data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +346 -346
- data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +232 -232
- data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +470 -470
- data/vendor/kreuzberg-tesseract/.commitlintrc.json +13 -13
- data/vendor/kreuzberg-tesseract/.crate-ignore +2 -2
- data/vendor/kreuzberg-tesseract/Cargo.lock +2933 -2933
- data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
- data/vendor/kreuzberg-tesseract/LICENSE +22 -22
- data/vendor/kreuzberg-tesseract/README.md +399 -399
- data/vendor/kreuzberg-tesseract/build.rs +1354 -1354
- data/vendor/kreuzberg-tesseract/patches/README.md +71 -71
- data/vendor/kreuzberg-tesseract/patches/tesseract.diff +199 -199
- data/vendor/kreuzberg-tesseract/src/api.rs +1371 -1371
- data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +77 -77
- data/vendor/kreuzberg-tesseract/src/enums.rs +297 -297
- data/vendor/kreuzberg-tesseract/src/error.rs +81 -81
- data/vendor/kreuzberg-tesseract/src/lib.rs +145 -145
- data/vendor/kreuzberg-tesseract/src/monitor.rs +57 -57
- data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +197 -197
- data/vendor/kreuzberg-tesseract/src/page_iterator.rs +253 -253
- data/vendor/kreuzberg-tesseract/src/result_iterator.rs +286 -286
- data/vendor/kreuzberg-tesseract/src/result_renderer.rs +183 -183
- data/vendor/kreuzberg-tesseract/tests/integration_test.rs +211 -211
- data/vendor/rb-sys/.cargo_vcs_info.json +5 -5
- data/vendor/rb-sys/Cargo.lock +393 -393
- data/vendor/rb-sys/Cargo.toml +70 -70
- data/vendor/rb-sys/Cargo.toml.orig +57 -57
- data/vendor/rb-sys/LICENSE-APACHE +190 -190
- data/vendor/rb-sys/LICENSE-MIT +21 -21
- data/vendor/rb-sys/build/features.rs +111 -111
- data/vendor/rb-sys/build/main.rs +286 -286
- data/vendor/rb-sys/build/stable_api_config.rs +155 -155
- data/vendor/rb-sys/build/version.rs +50 -50
- data/vendor/rb-sys/readme.md +36 -36
- data/vendor/rb-sys/src/bindings.rs +21 -21
- data/vendor/rb-sys/src/hidden.rs +11 -11
- data/vendor/rb-sys/src/lib.rs +35 -35
- data/vendor/rb-sys/src/macros.rs +371 -371
- data/vendor/rb-sys/src/memory.rs +53 -53
- data/vendor/rb-sys/src/ruby_abi_version.rs +38 -38
- data/vendor/rb-sys/src/special_consts.rs +31 -31
- data/vendor/rb-sys/src/stable_api/compiled.c +179 -179
- data/vendor/rb-sys/src/stable_api/compiled.rs +257 -257
- data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +324 -324
- data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +332 -332
- data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +325 -325
- data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +323 -323
- data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +339 -339
- data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +339 -339
- data/vendor/rb-sys/src/stable_api.rs +260 -260
- data/vendor/rb-sys/src/symbol.rs +31 -31
- data/vendor/rb-sys/src/tracking_allocator.rs +330 -330
- data/vendor/rb-sys/src/utils.rs +89 -89
- data/vendor/rb-sys/src/value_type.rs +7 -7
- metadata +73 -4
- data/vendor/kreuzberg-ffi/kreuzberg-ffi-install.pc +0 -12
|
@@ -1,284 +1,284 @@
|
|
|
1
|
-
//! ODT (OpenDocument) metadata extraction from meta.xml
|
|
2
|
-
//!
|
|
3
|
-
//! Extracts metadata from OpenDocument Text files following the OASIS OpenDocument standard.
|
|
4
|
-
|
|
5
|
-
use crate::error::{KreuzbergError, Result};
|
|
6
|
-
use std::io::Read;
|
|
7
|
-
use zip::ZipArchive;
|
|
8
|
-
|
|
9
|
-
/// OpenDocument metadata from meta.xml
|
|
10
|
-
///
|
|
11
|
-
/// Contains metadata fields defined by the OASIS OpenDocument Format standard.
|
|
12
|
-
/// Uses Dublin Core elements (dc:) and OpenDocument meta elements (meta:).
|
|
13
|
-
#[derive(Debug, Clone, Default, PartialEq)]
|
|
14
|
-
pub struct OdtProperties {
|
|
15
|
-
/// Document title (dc:title)
|
|
16
|
-
pub title: Option<String>,
|
|
17
|
-
/// Document subject/topic (dc:subject)
|
|
18
|
-
pub subject: Option<String>,
|
|
19
|
-
/// Current document creator/author (dc:creator)
|
|
20
|
-
pub creator: Option<String>,
|
|
21
|
-
/// Initial creator of the document (meta:initial-creator)
|
|
22
|
-
pub initial_creator: Option<String>,
|
|
23
|
-
/// Keywords or tags (meta:keyword)
|
|
24
|
-
pub keywords: Option<String>,
|
|
25
|
-
/// Document description (dc:description)
|
|
26
|
-
pub description: Option<String>,
|
|
27
|
-
/// Current modification date (dc:date)
|
|
28
|
-
pub date: Option<String>,
|
|
29
|
-
/// Initial creation date (meta:creation-date)
|
|
30
|
-
pub creation_date: Option<String>,
|
|
31
|
-
/// Document language (dc:language)
|
|
32
|
-
pub language: Option<String>,
|
|
33
|
-
/// Generator/application that created the document (meta:generator)
|
|
34
|
-
pub generator: Option<String>,
|
|
35
|
-
/// Editing duration in ISO 8601 format (meta:editing-duration)
|
|
36
|
-
pub editing_duration: Option<String>,
|
|
37
|
-
/// Number of edits/revisions (meta:editing-cycles)
|
|
38
|
-
pub editing_cycles: Option<String>,
|
|
39
|
-
/// Document statistics - page count (meta:page-count)
|
|
40
|
-
pub page_count: Option<i32>,
|
|
41
|
-
/// Document statistics - word count (meta:word-count)
|
|
42
|
-
pub word_count: Option<i32>,
|
|
43
|
-
/// Document statistics - character count (meta:character-count)
|
|
44
|
-
pub character_count: Option<i32>,
|
|
45
|
-
/// Document statistics - paragraph count (meta:paragraph-count)
|
|
46
|
-
pub paragraph_count: Option<i32>,
|
|
47
|
-
/// Document statistics - table count (meta:table-count)
|
|
48
|
-
pub table_count: Option<i32>,
|
|
49
|
-
/// Document statistics - image count (meta:image-count)
|
|
50
|
-
pub image_count: Option<i32>,
|
|
51
|
-
}
|
|
52
|
-
|
|
53
|
-
/// Extract ODT metadata from an OpenDocument file
|
|
54
|
-
///
|
|
55
|
-
/// Parses `meta.xml` from the ZIP archive and extracts OpenDocument metadata.
|
|
56
|
-
///
|
|
57
|
-
/// # Arguments
|
|
58
|
-
///
|
|
59
|
-
/// * `archive` - ZIP archive containing the OpenDocument file
|
|
60
|
-
///
|
|
61
|
-
/// # Returns
|
|
62
|
-
///
|
|
63
|
-
/// Returns `OdtProperties` with extracted metadata. Fields that are not present
|
|
64
|
-
/// in the document will be `None`.
|
|
65
|
-
///
|
|
66
|
-
/// # Errors
|
|
67
|
-
///
|
|
68
|
-
/// Returns an error if:
|
|
69
|
-
/// - The ZIP archive cannot be read
|
|
70
|
-
/// - The meta.xml file is malformed
|
|
71
|
-
/// - XML parsing fails
|
|
72
|
-
///
|
|
73
|
-
/// # Example
|
|
74
|
-
///
|
|
75
|
-
/// ```no_run
|
|
76
|
-
/// use kreuzberg::extraction::office_metadata::extract_odt_properties;
|
|
77
|
-
/// use std::fs::File;
|
|
78
|
-
/// use zip::ZipArchive;
|
|
79
|
-
///
|
|
80
|
-
/// let file = File::open("document.odt")?;
|
|
81
|
-
/// let mut archive = ZipArchive::new(file)?;
|
|
82
|
-
/// let props = extract_odt_properties(&mut archive)?;
|
|
83
|
-
///
|
|
84
|
-
/// println!("Title: {:?}", props.title);
|
|
85
|
-
/// println!("Creator: {:?}", props.creator);
|
|
86
|
-
/// println!("Created: {:?}", props.creation_date);
|
|
87
|
-
/// # Ok::<(), Box<dyn std::error::Error>>(())
|
|
88
|
-
/// ```
|
|
89
|
-
pub fn extract_odt_properties<R: Read + std::io::Seek>(archive: &mut ZipArchive<R>) -> Result<OdtProperties> {
|
|
90
|
-
let mut xml_content = String::new();
|
|
91
|
-
|
|
92
|
-
match archive.by_name("meta.xml") {
|
|
93
|
-
Ok(mut file) => {
|
|
94
|
-
file.read_to_string(&mut xml_content)
|
|
95
|
-
.map_err(|e| KreuzbergError::parsing(format!("Failed to read meta.xml: {}", e)))?;
|
|
96
|
-
}
|
|
97
|
-
Err(_) => {
|
|
98
|
-
return Ok(OdtProperties::default());
|
|
99
|
-
}
|
|
100
|
-
}
|
|
101
|
-
|
|
102
|
-
let doc = roxmltree::Document::parse(&xml_content)
|
|
103
|
-
.map_err(|e| KreuzbergError::parsing(format!("Failed to parse meta.xml: {}", e)))?;
|
|
104
|
-
|
|
105
|
-
let root = doc.root_element();
|
|
106
|
-
|
|
107
|
-
let title = super::parse_xml_text(root, "title");
|
|
108
|
-
let subject = super::parse_xml_text(root, "subject");
|
|
109
|
-
let creator = super::parse_xml_text(root, "creator");
|
|
110
|
-
let description = super::parse_xml_text(root, "description");
|
|
111
|
-
let language = super::parse_xml_text(root, "language");
|
|
112
|
-
let date = super::parse_xml_text(root, "date");
|
|
113
|
-
|
|
114
|
-
let initial_creator = super::parse_xml_text(root, "initial-creator");
|
|
115
|
-
let keywords = super::parse_xml_text(root, "keyword");
|
|
116
|
-
let creation_date = super::parse_xml_text(root, "creation-date");
|
|
117
|
-
let generator = super::parse_xml_text(root, "generator");
|
|
118
|
-
let editing_duration = super::parse_xml_text(root, "editing-duration");
|
|
119
|
-
let editing_cycles = super::parse_xml_text(root, "editing-cycles");
|
|
120
|
-
|
|
121
|
-
let page_count = super::parse_xml_int(root, "page-count");
|
|
122
|
-
let word_count = super::parse_xml_int(root, "word-count");
|
|
123
|
-
let character_count = super::parse_xml_int(root, "character-count");
|
|
124
|
-
let paragraph_count = super::parse_xml_int(root, "paragraph-count");
|
|
125
|
-
let table_count = super::parse_xml_int(root, "table-count");
|
|
126
|
-
let image_count = super::parse_xml_int(root, "image-count");
|
|
127
|
-
|
|
128
|
-
Ok(OdtProperties {
|
|
129
|
-
title,
|
|
130
|
-
subject,
|
|
131
|
-
creator,
|
|
132
|
-
initial_creator,
|
|
133
|
-
keywords,
|
|
134
|
-
description,
|
|
135
|
-
date,
|
|
136
|
-
creation_date,
|
|
137
|
-
language,
|
|
138
|
-
generator,
|
|
139
|
-
editing_duration,
|
|
140
|
-
editing_cycles,
|
|
141
|
-
page_count,
|
|
142
|
-
word_count,
|
|
143
|
-
character_count,
|
|
144
|
-
paragraph_count,
|
|
145
|
-
table_count,
|
|
146
|
-
image_count,
|
|
147
|
-
})
|
|
148
|
-
}
|
|
149
|
-
|
|
150
|
-
#[cfg(test)]
|
|
151
|
-
mod tests {
|
|
152
|
-
use super::*;
|
|
153
|
-
use std::io::{Cursor, Write};
|
|
154
|
-
|
|
155
|
-
fn create_test_zip_with_meta_xml(meta_xml: &str) -> ZipArchive<Cursor<Vec<u8>>> {
|
|
156
|
-
let buffer = Vec::new();
|
|
157
|
-
let cursor = Cursor::new(buffer);
|
|
158
|
-
let mut zip = zip::ZipWriter::new(cursor);
|
|
159
|
-
|
|
160
|
-
let options = zip::write::FileOptions::<()>::default().compression_method(zip::CompressionMethod::Stored);
|
|
161
|
-
|
|
162
|
-
zip.start_file("meta.xml", options).unwrap();
|
|
163
|
-
zip.write_all(meta_xml.as_bytes()).unwrap();
|
|
164
|
-
|
|
165
|
-
let cursor = zip.finish().unwrap();
|
|
166
|
-
ZipArchive::new(cursor).unwrap()
|
|
167
|
-
}
|
|
168
|
-
|
|
169
|
-
#[test]
|
|
170
|
-
fn test_extract_odt_properties_full() {
|
|
171
|
-
let meta_xml = r#"<?xml version="1.0" encoding="UTF-8"?>
|
|
172
|
-
<office:document-meta xmlns:office="urn:oasis:names:tc:opendocument:xmlns:office:1.0"
|
|
173
|
-
xmlns:dc="http://purl.org/dc/elements/1.1/"
|
|
174
|
-
xmlns:meta="urn:oasis:names:tc:opendocument:xmlns:meta:1.0"
|
|
175
|
-
office:version="1.3">
|
|
176
|
-
<office:meta>
|
|
177
|
-
<dc:title>Test Document</dc:title>
|
|
178
|
-
<dc:subject>Testing</dc:subject>
|
|
179
|
-
<dc:creator>John Doe</dc:creator>
|
|
180
|
-
<meta:initial-creator>Jane Smith</meta:initial-creator>
|
|
181
|
-
<dc:description>A test document for ODT metadata</dc:description>
|
|
182
|
-
<meta:keyword>test, metadata, odt</meta:keyword>
|
|
183
|
-
<dc:language>en-US</dc:language>
|
|
184
|
-
<meta:creation-date>2024-01-01T10:00:00Z</meta:creation-date>
|
|
185
|
-
<dc:date>2024-01-02T15:30:00Z</dc:date>
|
|
186
|
-
<meta:generator>LibreOffice/24.2</meta:generator>
|
|
187
|
-
<meta:editing-duration>PT2H30M</meta:editing-duration>
|
|
188
|
-
<meta:editing-cycles>5</meta:editing-cycles>
|
|
189
|
-
<meta:page-count>10</meta:page-count>
|
|
190
|
-
<meta:word-count>1500</meta:word-count>
|
|
191
|
-
<meta:character-count>9000</meta:character-count>
|
|
192
|
-
<meta:paragraph-count>45</meta:paragraph-count>
|
|
193
|
-
<meta:table-count>3</meta:table-count>
|
|
194
|
-
<meta:image-count>7</meta:image-count>
|
|
195
|
-
</office:meta>
|
|
196
|
-
</office:document-meta>"#;
|
|
197
|
-
|
|
198
|
-
let mut archive = create_test_zip_with_meta_xml(meta_xml);
|
|
199
|
-
let props = extract_odt_properties(&mut archive).unwrap();
|
|
200
|
-
|
|
201
|
-
assert_eq!(props.title, Some("Test Document".to_string()));
|
|
202
|
-
assert_eq!(props.subject, Some("Testing".to_string()));
|
|
203
|
-
assert_eq!(props.creator, Some("John Doe".to_string()));
|
|
204
|
-
assert_eq!(props.initial_creator, Some("Jane Smith".to_string()));
|
|
205
|
-
assert_eq!(props.keywords, Some("test, metadata, odt".to_string()));
|
|
206
|
-
assert_eq!(props.description, Some("A test document for ODT metadata".to_string()));
|
|
207
|
-
assert_eq!(props.language, Some("en-US".to_string()));
|
|
208
|
-
assert_eq!(props.creation_date, Some("2024-01-01T10:00:00Z".to_string()));
|
|
209
|
-
assert_eq!(props.date, Some("2024-01-02T15:30:00Z".to_string()));
|
|
210
|
-
assert_eq!(props.generator, Some("LibreOffice/24.2".to_string()));
|
|
211
|
-
assert_eq!(props.editing_duration, Some("PT2H30M".to_string()));
|
|
212
|
-
assert_eq!(props.editing_cycles, Some("5".to_string()));
|
|
213
|
-
assert_eq!(props.page_count, Some(10));
|
|
214
|
-
assert_eq!(props.word_count, Some(1500));
|
|
215
|
-
assert_eq!(props.character_count, Some(9000));
|
|
216
|
-
assert_eq!(props.paragraph_count, Some(45));
|
|
217
|
-
assert_eq!(props.table_count, Some(3));
|
|
218
|
-
assert_eq!(props.image_count, Some(7));
|
|
219
|
-
}
|
|
220
|
-
|
|
221
|
-
#[test]
fn test_extract_odt_properties_minimal() {
    // Only a creator and a creation date are present; everything else must stay unset.
    let meta_xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<office:document-meta xmlns:office="urn:oasis:names:tc:opendocument:xmlns:office:1.0"
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:meta="urn:oasis:names:tc:opendocument:xmlns:meta:1.0"
office:version="1.3">
<office:meta>
<dc:creator>Alice</dc:creator>
<meta:creation-date>2024-01-01T10:00:00Z</meta:creation-date>
</office:meta>
</office:document-meta>"#;

    let mut odt = create_test_zip_with_meta_xml(meta_xml);
    let extracted = extract_odt_properties(&mut odt).unwrap();

    assert_eq!(extracted.creator.as_deref(), Some("Alice"));
    assert_eq!(extracted.creation_date.as_deref(), Some("2024-01-01T10:00:00Z"));
    assert!(extracted.title.is_none());
    assert!(extracted.keywords.is_none());
    assert!(extracted.word_count.is_none());
}
|
|
243
|
-
|
|
244
|
-
#[test]
fn test_extract_odt_properties_empty_elements() {
    // An element that is present but has no text (`<dc:title></dc:title>`) is expected to be `None`.
    let meta_xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<office:document-meta xmlns:office="urn:oasis:names:tc:opendocument:xmlns:office:1.0"
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:meta="urn:oasis:names:tc:opendocument:xmlns:meta:1.0"
office:version="1.3">
<office:meta>
<dc:title></dc:title>
<dc:creator>Bob</dc:creator>
</office:meta>
</office:document-meta>"#;

    let mut odt = create_test_zip_with_meta_xml(meta_xml);
    let extracted = extract_odt_properties(&mut odt).unwrap();

    assert!(extracted.title.is_none());
    assert_eq!(extracted.creator.as_deref(), Some("Bob"));
}
|
|
263
|
-
|
|
264
|
-
#[test]
fn test_extract_odt_properties_missing_file() {
    // A valid but empty ZIP has no `meta.xml`; extraction must succeed with default properties.
    let empty_zip = zip::ZipWriter::new(Cursor::new(Vec::new())).finish().unwrap();
    let mut odt = ZipArchive::new(empty_zip).unwrap();

    let extracted = extract_odt_properties(&mut odt).unwrap();
    assert_eq!(extracted, OdtProperties::default());
}
|
|
275
|
-
|
|
276
|
-
#[test]
fn test_extract_odt_properties_malformed_xml() {
    // `meta.xml` exists but is not well-formed XML, so extraction must report an error.
    let mut odt = create_test_zip_with_meta_xml("not valid xml <");

    assert!(extract_odt_properties(&mut odt).is_err());
}
|
|
284
|
-
}
|
|
1
|
+
//! ODT (OpenDocument) metadata extraction from meta.xml
|
|
2
|
+
//!
|
|
3
|
+
//! Extracts metadata from OpenDocument Text files following the OASIS OpenDocument standard.
|
|
4
|
+
|
|
5
|
+
use crate::error::{KreuzbergError, Result};
|
|
6
|
+
use std::io::Read;
|
|
7
|
+
use zip::ZipArchive;
|
|
8
|
+
|
|
9
|
+
/// OpenDocument metadata from meta.xml
///
/// Contains metadata fields defined by the OASIS OpenDocument Format standard.
/// Uses Dublin Core elements (dc:) and OpenDocument meta elements (meta:).
///
/// Every field is optional; `OdtProperties::default()` has all fields set to `None`.
/// All fields are `Option<String>` or `Option<i32>`, so the type is `Eq` as well as
/// `PartialEq`.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct OdtProperties {
    /// Document title (dc:title)
    pub title: Option<String>,
    /// Document subject/topic (dc:subject)
    pub subject: Option<String>,
    /// Current document creator/author (dc:creator)
    pub creator: Option<String>,
    /// Initial creator of the document (meta:initial-creator)
    pub initial_creator: Option<String>,
    /// Keywords or tags (meta:keyword)
    pub keywords: Option<String>,
    /// Document description (dc:description)
    pub description: Option<String>,
    /// Current modification date (dc:date)
    pub date: Option<String>,
    /// Initial creation date (meta:creation-date)
    pub creation_date: Option<String>,
    /// Document language (dc:language)
    pub language: Option<String>,
    /// Generator/application that created the document (meta:generator)
    pub generator: Option<String>,
    /// Editing duration in ISO 8601 format (meta:editing-duration)
    pub editing_duration: Option<String>,
    /// Number of edits/revisions (meta:editing-cycles), kept as the raw text value
    pub editing_cycles: Option<String>,
    /// Document statistics - page count (meta:page-count)
    pub page_count: Option<i32>,
    /// Document statistics - word count (meta:word-count)
    pub word_count: Option<i32>,
    /// Document statistics - character count (meta:character-count)
    pub character_count: Option<i32>,
    /// Document statistics - paragraph count (meta:paragraph-count)
    pub paragraph_count: Option<i32>,
    /// Document statistics - table count (meta:table-count)
    pub table_count: Option<i32>,
    /// Document statistics - image count (meta:image-count)
    pub image_count: Option<i32>,
}
|
|
52
|
+
|
|
53
|
+
/// Extract ODT metadata from an OpenDocument file
|
|
54
|
+
///
|
|
55
|
+
/// Parses `meta.xml` from the ZIP archive and extracts OpenDocument metadata.
|
|
56
|
+
///
|
|
57
|
+
/// # Arguments
|
|
58
|
+
///
|
|
59
|
+
/// * `archive` - ZIP archive containing the OpenDocument file
|
|
60
|
+
///
|
|
61
|
+
/// # Returns
|
|
62
|
+
///
|
|
63
|
+
/// Returns `OdtProperties` with extracted metadata. Fields that are not present
|
|
64
|
+
/// in the document will be `None`.
|
|
65
|
+
///
|
|
66
|
+
/// # Errors
|
|
67
|
+
///
|
|
68
|
+
/// Returns an error if:
|
|
69
|
+
/// - The ZIP archive cannot be read
|
|
70
|
+
/// - The meta.xml file is malformed
|
|
71
|
+
/// - XML parsing fails
|
|
72
|
+
///
|
|
73
|
+
/// # Example
|
|
74
|
+
///
|
|
75
|
+
/// ```no_run
|
|
76
|
+
/// use kreuzberg::extraction::office_metadata::extract_odt_properties;
|
|
77
|
+
/// use std::fs::File;
|
|
78
|
+
/// use zip::ZipArchive;
|
|
79
|
+
///
|
|
80
|
+
/// let file = File::open("document.odt")?;
|
|
81
|
+
/// let mut archive = ZipArchive::new(file)?;
|
|
82
|
+
/// let props = extract_odt_properties(&mut archive)?;
|
|
83
|
+
///
|
|
84
|
+
/// println!("Title: {:?}", props.title);
|
|
85
|
+
/// println!("Creator: {:?}", props.creator);
|
|
86
|
+
/// println!("Created: {:?}", props.creation_date);
|
|
87
|
+
/// # Ok::<(), Box<dyn std::error::Error>>(())
|
|
88
|
+
/// ```
|
|
89
|
+
pub fn extract_odt_properties<R: Read + std::io::Seek>(archive: &mut ZipArchive<R>) -> Result<OdtProperties> {
|
|
90
|
+
let mut xml_content = String::new();
|
|
91
|
+
|
|
92
|
+
match archive.by_name("meta.xml") {
|
|
93
|
+
Ok(mut file) => {
|
|
94
|
+
file.read_to_string(&mut xml_content)
|
|
95
|
+
.map_err(|e| KreuzbergError::parsing(format!("Failed to read meta.xml: {}", e)))?;
|
|
96
|
+
}
|
|
97
|
+
Err(_) => {
|
|
98
|
+
return Ok(OdtProperties::default());
|
|
99
|
+
}
|
|
100
|
+
}
|
|
101
|
+
|
|
102
|
+
let doc = roxmltree::Document::parse(&xml_content)
|
|
103
|
+
.map_err(|e| KreuzbergError::parsing(format!("Failed to parse meta.xml: {}", e)))?;
|
|
104
|
+
|
|
105
|
+
let root = doc.root_element();
|
|
106
|
+
|
|
107
|
+
let title = super::parse_xml_text(root, "title");
|
|
108
|
+
let subject = super::parse_xml_text(root, "subject");
|
|
109
|
+
let creator = super::parse_xml_text(root, "creator");
|
|
110
|
+
let description = super::parse_xml_text(root, "description");
|
|
111
|
+
let language = super::parse_xml_text(root, "language");
|
|
112
|
+
let date = super::parse_xml_text(root, "date");
|
|
113
|
+
|
|
114
|
+
let initial_creator = super::parse_xml_text(root, "initial-creator");
|
|
115
|
+
let keywords = super::parse_xml_text(root, "keyword");
|
|
116
|
+
let creation_date = super::parse_xml_text(root, "creation-date");
|
|
117
|
+
let generator = super::parse_xml_text(root, "generator");
|
|
118
|
+
let editing_duration = super::parse_xml_text(root, "editing-duration");
|
|
119
|
+
let editing_cycles = super::parse_xml_text(root, "editing-cycles");
|
|
120
|
+
|
|
121
|
+
let page_count = super::parse_xml_int(root, "page-count");
|
|
122
|
+
let word_count = super::parse_xml_int(root, "word-count");
|
|
123
|
+
let character_count = super::parse_xml_int(root, "character-count");
|
|
124
|
+
let paragraph_count = super::parse_xml_int(root, "paragraph-count");
|
|
125
|
+
let table_count = super::parse_xml_int(root, "table-count");
|
|
126
|
+
let image_count = super::parse_xml_int(root, "image-count");
|
|
127
|
+
|
|
128
|
+
Ok(OdtProperties {
|
|
129
|
+
title,
|
|
130
|
+
subject,
|
|
131
|
+
creator,
|
|
132
|
+
initial_creator,
|
|
133
|
+
keywords,
|
|
134
|
+
description,
|
|
135
|
+
date,
|
|
136
|
+
creation_date,
|
|
137
|
+
language,
|
|
138
|
+
generator,
|
|
139
|
+
editing_duration,
|
|
140
|
+
editing_cycles,
|
|
141
|
+
page_count,
|
|
142
|
+
word_count,
|
|
143
|
+
character_count,
|
|
144
|
+
paragraph_count,
|
|
145
|
+
table_count,
|
|
146
|
+
image_count,
|
|
147
|
+
})
|
|
148
|
+
}
|
|
149
|
+
|
|
150
|
+
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::{Cursor, Write};

    /// Builds an in-memory ZIP archive whose only entry is an uncompressed `meta.xml`
    /// holding the given text.
    fn create_test_zip_with_meta_xml(meta_xml: &str) -> ZipArchive<Cursor<Vec<u8>>> {
        let mut writer = zip::ZipWriter::new(Cursor::new(Vec::new()));
        let stored = zip::write::FileOptions::<()>::default().compression_method(zip::CompressionMethod::Stored);

        writer.start_file("meta.xml", stored).unwrap();
        writer.write_all(meta_xml.as_bytes()).unwrap();

        ZipArchive::new(writer.finish().unwrap()).unwrap()
    }

    /// Every supported element is present and must be extracted.
    #[test]
    fn test_extract_odt_properties_full() {
        let meta_xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<office:document-meta xmlns:office="urn:oasis:names:tc:opendocument:xmlns:office:1.0"
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:meta="urn:oasis:names:tc:opendocument:xmlns:meta:1.0"
office:version="1.3">
<office:meta>
<dc:title>Test Document</dc:title>
<dc:subject>Testing</dc:subject>
<dc:creator>John Doe</dc:creator>
<meta:initial-creator>Jane Smith</meta:initial-creator>
<dc:description>A test document for ODT metadata</dc:description>
<meta:keyword>test, metadata, odt</meta:keyword>
<dc:language>en-US</dc:language>
<meta:creation-date>2024-01-01T10:00:00Z</meta:creation-date>
<dc:date>2024-01-02T15:30:00Z</dc:date>
<meta:generator>LibreOffice/24.2</meta:generator>
<meta:editing-duration>PT2H30M</meta:editing-duration>
<meta:editing-cycles>5</meta:editing-cycles>
<meta:page-count>10</meta:page-count>
<meta:word-count>1500</meta:word-count>
<meta:character-count>9000</meta:character-count>
<meta:paragraph-count>45</meta:paragraph-count>
<meta:table-count>3</meta:table-count>
<meta:image-count>7</meta:image-count>
</office:meta>
</office:document-meta>"#;

        let mut odt = create_test_zip_with_meta_xml(meta_xml);
        let extracted = extract_odt_properties(&mut odt).unwrap();

        assert_eq!(extracted.title.as_deref(), Some("Test Document"));
        assert_eq!(extracted.subject.as_deref(), Some("Testing"));
        assert_eq!(extracted.creator.as_deref(), Some("John Doe"));
        assert_eq!(extracted.initial_creator.as_deref(), Some("Jane Smith"));
        assert_eq!(extracted.keywords.as_deref(), Some("test, metadata, odt"));
        assert_eq!(extracted.description.as_deref(), Some("A test document for ODT metadata"));
        assert_eq!(extracted.language.as_deref(), Some("en-US"));
        assert_eq!(extracted.creation_date.as_deref(), Some("2024-01-01T10:00:00Z"));
        assert_eq!(extracted.date.as_deref(), Some("2024-01-02T15:30:00Z"));
        assert_eq!(extracted.generator.as_deref(), Some("LibreOffice/24.2"));
        assert_eq!(extracted.editing_duration.as_deref(), Some("PT2H30M"));
        assert_eq!(extracted.editing_cycles.as_deref(), Some("5"));
        assert_eq!(extracted.page_count, Some(10));
        assert_eq!(extracted.word_count, Some(1500));
        assert_eq!(extracted.character_count, Some(9000));
        assert_eq!(extracted.paragraph_count, Some(45));
        assert_eq!(extracted.table_count, Some(3));
        assert_eq!(extracted.image_count, Some(7));
    }

    /// Only a creator and a creation date are present; everything else must stay unset.
    #[test]
    fn test_extract_odt_properties_minimal() {
        let meta_xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<office:document-meta xmlns:office="urn:oasis:names:tc:opendocument:xmlns:office:1.0"
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:meta="urn:oasis:names:tc:opendocument:xmlns:meta:1.0"
office:version="1.3">
<office:meta>
<dc:creator>Alice</dc:creator>
<meta:creation-date>2024-01-01T10:00:00Z</meta:creation-date>
</office:meta>
</office:document-meta>"#;

        let mut odt = create_test_zip_with_meta_xml(meta_xml);
        let extracted = extract_odt_properties(&mut odt).unwrap();

        assert_eq!(extracted.creator.as_deref(), Some("Alice"));
        assert_eq!(extracted.creation_date.as_deref(), Some("2024-01-01T10:00:00Z"));
        assert!(extracted.title.is_none());
        assert!(extracted.keywords.is_none());
        assert!(extracted.word_count.is_none());
    }

    /// An element that is present but has no text is expected to be `None`.
    #[test]
    fn test_extract_odt_properties_empty_elements() {
        let meta_xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<office:document-meta xmlns:office="urn:oasis:names:tc:opendocument:xmlns:office:1.0"
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:meta="urn:oasis:names:tc:opendocument:xmlns:meta:1.0"
office:version="1.3">
<office:meta>
<dc:title></dc:title>
<dc:creator>Bob</dc:creator>
</office:meta>
</office:document-meta>"#;

        let mut odt = create_test_zip_with_meta_xml(meta_xml);
        let extracted = extract_odt_properties(&mut odt).unwrap();

        assert!(extracted.title.is_none());
        assert_eq!(extracted.creator.as_deref(), Some("Bob"));
    }

    /// A valid but empty ZIP has no `meta.xml`; extraction must succeed with defaults.
    #[test]
    fn test_extract_odt_properties_missing_file() {
        let empty_zip = zip::ZipWriter::new(Cursor::new(Vec::new())).finish().unwrap();
        let mut odt = ZipArchive::new(empty_zip).unwrap();

        let extracted = extract_odt_properties(&mut odt).unwrap();
        assert_eq!(extracted, OdtProperties::default());
    }

    /// `meta.xml` exists but is not well-formed XML, so extraction must report an error.
    #[test]
    fn test_extract_odt_properties_malformed_xml() {
        let mut odt = create_test_zip_with_meta_xml("not valid xml <");

        assert!(extract_odt_properties(&mut odt).is_err());
    }
}
|