kreuzberg 4.0.0.pre.rc.11 → 4.0.0.pre.rc.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +14 -14
- data/.rspec +3 -3
- data/.rubocop.yaml +1 -1
- data/.rubocop.yml +538 -538
- data/Gemfile +8 -8
- data/Gemfile.lock +2 -105
- data/README.md +454 -454
- data/Rakefile +25 -25
- data/Steepfile +47 -47
- data/examples/async_patterns.rb +341 -341
- data/ext/kreuzberg_rb/extconf.rb +45 -45
- data/ext/kreuzberg_rb/native/.cargo/config.toml +2 -2
- data/ext/kreuzberg_rb/native/Cargo.lock +6941 -6941
- data/ext/kreuzberg_rb/native/Cargo.toml +54 -54
- data/ext/kreuzberg_rb/native/README.md +425 -425
- data/ext/kreuzberg_rb/native/build.rs +15 -15
- data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -11
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -14
- data/ext/kreuzberg_rb/native/include/strings.h +20 -20
- data/ext/kreuzberg_rb/native/include/unistd.h +47 -47
- data/ext/kreuzberg_rb/native/src/lib.rs +3158 -3158
- data/extconf.rb +28 -28
- data/kreuzberg.gemspec +214 -214
- data/lib/kreuzberg/api_proxy.rb +142 -142
- data/lib/kreuzberg/cache_api.rb +81 -81
- data/lib/kreuzberg/cli.rb +55 -55
- data/lib/kreuzberg/cli_proxy.rb +127 -127
- data/lib/kreuzberg/config.rb +724 -724
- data/lib/kreuzberg/error_context.rb +80 -80
- data/lib/kreuzberg/errors.rb +118 -118
- data/lib/kreuzberg/extraction_api.rb +340 -340
- data/lib/kreuzberg/mcp_proxy.rb +186 -186
- data/lib/kreuzberg/ocr_backend_protocol.rb +113 -113
- data/lib/kreuzberg/post_processor_protocol.rb +86 -86
- data/lib/kreuzberg/result.rb +279 -279
- data/lib/kreuzberg/setup_lib_path.rb +80 -80
- data/lib/kreuzberg/validator_protocol.rb +89 -89
- data/lib/kreuzberg/version.rb +5 -5
- data/lib/kreuzberg.rb +109 -109
- data/lib/{libpdfium.dylib → pdfium.dll} +0 -0
- data/sig/kreuzberg/internal.rbs +184 -184
- data/sig/kreuzberg.rbs +546 -546
- data/spec/binding/cache_spec.rb +227 -227
- data/spec/binding/cli_proxy_spec.rb +85 -85
- data/spec/binding/cli_spec.rb +55 -55
- data/spec/binding/config_spec.rb +345 -345
- data/spec/binding/config_validation_spec.rb +283 -283
- data/spec/binding/error_handling_spec.rb +213 -213
- data/spec/binding/errors_spec.rb +66 -66
- data/spec/binding/plugins/ocr_backend_spec.rb +307 -307
- data/spec/binding/plugins/postprocessor_spec.rb +269 -269
- data/spec/binding/plugins/validator_spec.rb +274 -274
- data/spec/fixtures/config.toml +39 -39
- data/spec/fixtures/config.yaml +41 -41
- data/spec/fixtures/invalid_config.toml +4 -4
- data/spec/smoke/package_spec.rb +178 -178
- data/spec/spec_helper.rb +42 -42
- data/vendor/Cargo.toml +2 -1
- data/vendor/kreuzberg/Cargo.toml +2 -2
- data/vendor/kreuzberg/README.md +230 -230
- data/vendor/kreuzberg/benches/otel_overhead.rs +48 -48
- data/vendor/kreuzberg/build.rs +843 -843
- data/vendor/kreuzberg/src/api/error.rs +81 -81
- data/vendor/kreuzberg/src/api/handlers.rs +199 -199
- data/vendor/kreuzberg/src/api/mod.rs +79 -79
- data/vendor/kreuzberg/src/api/server.rs +353 -353
- data/vendor/kreuzberg/src/api/types.rs +170 -170
- data/vendor/kreuzberg/src/cache/mod.rs +1167 -1167
- data/vendor/kreuzberg/src/chunking/mod.rs +1877 -1877
- data/vendor/kreuzberg/src/chunking/processor.rs +220 -220
- data/vendor/kreuzberg/src/core/batch_mode.rs +95 -95
- data/vendor/kreuzberg/src/core/config.rs +1080 -1080
- data/vendor/kreuzberg/src/core/extractor.rs +1156 -1156
- data/vendor/kreuzberg/src/core/io.rs +329 -329
- data/vendor/kreuzberg/src/core/mime.rs +605 -605
- data/vendor/kreuzberg/src/core/mod.rs +47 -47
- data/vendor/kreuzberg/src/core/pipeline.rs +1184 -1184
- data/vendor/kreuzberg/src/embeddings.rs +500 -500
- data/vendor/kreuzberg/src/error.rs +431 -431
- data/vendor/kreuzberg/src/extraction/archive.rs +954 -954
- data/vendor/kreuzberg/src/extraction/docx.rs +398 -398
- data/vendor/kreuzberg/src/extraction/email.rs +854 -854
- data/vendor/kreuzberg/src/extraction/excel.rs +688 -688
- data/vendor/kreuzberg/src/extraction/html.rs +601 -601
- data/vendor/kreuzberg/src/extraction/image.rs +491 -491
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +574 -562
- data/vendor/kreuzberg/src/extraction/markdown.rs +213 -213
- data/vendor/kreuzberg/src/extraction/mod.rs +81 -81
- data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -398
- data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -247
- data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -240
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -130
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +284 -284
- data/vendor/kreuzberg/src/extraction/pptx.rs +3100 -3100
- data/vendor/kreuzberg/src/extraction/structured.rs +490 -490
- data/vendor/kreuzberg/src/extraction/table.rs +328 -328
- data/vendor/kreuzberg/src/extraction/text.rs +269 -269
- data/vendor/kreuzberg/src/extraction/xml.rs +333 -333
- data/vendor/kreuzberg/src/extractors/archive.rs +447 -447
- data/vendor/kreuzberg/src/extractors/bibtex.rs +470 -470
- data/vendor/kreuzberg/src/extractors/docbook.rs +504 -504
- data/vendor/kreuzberg/src/extractors/docx.rs +400 -400
- data/vendor/kreuzberg/src/extractors/email.rs +157 -157
- data/vendor/kreuzberg/src/extractors/epub.rs +708 -708
- data/vendor/kreuzberg/src/extractors/excel.rs +345 -345
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +492 -492
- data/vendor/kreuzberg/src/extractors/html.rs +407 -407
- data/vendor/kreuzberg/src/extractors/image.rs +219 -219
- data/vendor/kreuzberg/src/extractors/jats.rs +1054 -1054
- data/vendor/kreuzberg/src/extractors/jupyter.rs +368 -368
- data/vendor/kreuzberg/src/extractors/latex.rs +653 -653
- data/vendor/kreuzberg/src/extractors/markdown.rs +701 -701
- data/vendor/kreuzberg/src/extractors/mod.rs +429 -429
- data/vendor/kreuzberg/src/extractors/odt.rs +628 -628
- data/vendor/kreuzberg/src/extractors/opml.rs +635 -635
- data/vendor/kreuzberg/src/extractors/orgmode.rs +529 -529
- data/vendor/kreuzberg/src/extractors/pdf.rs +749 -722
- data/vendor/kreuzberg/src/extractors/pptx.rs +267 -267
- data/vendor/kreuzberg/src/extractors/rst.rs +577 -577
- data/vendor/kreuzberg/src/extractors/rtf.rs +809 -809
- data/vendor/kreuzberg/src/extractors/security.rs +484 -484
- data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -367
- data/vendor/kreuzberg/src/extractors/structured.rs +142 -142
- data/vendor/kreuzberg/src/extractors/text.rs +265 -265
- data/vendor/kreuzberg/src/extractors/typst.rs +651 -651
- data/vendor/kreuzberg/src/extractors/xml.rs +147 -147
- data/vendor/kreuzberg/src/image/dpi.rs +164 -164
- data/vendor/kreuzberg/src/image/mod.rs +6 -6
- data/vendor/kreuzberg/src/image/preprocessing.rs +417 -417
- data/vendor/kreuzberg/src/image/resize.rs +89 -89
- data/vendor/kreuzberg/src/keywords/config.rs +154 -154
- data/vendor/kreuzberg/src/keywords/mod.rs +237 -237
- data/vendor/kreuzberg/src/keywords/processor.rs +275 -275
- data/vendor/kreuzberg/src/keywords/rake.rs +293 -293
- data/vendor/kreuzberg/src/keywords/types.rs +68 -68
- data/vendor/kreuzberg/src/keywords/yake.rs +163 -163
- data/vendor/kreuzberg/src/language_detection/mod.rs +985 -985
- data/vendor/kreuzberg/src/language_detection/processor.rs +219 -219
- data/vendor/kreuzberg/src/lib.rs +113 -113
- data/vendor/kreuzberg/src/mcp/mod.rs +35 -35
- data/vendor/kreuzberg/src/mcp/server.rs +2076 -2076
- data/vendor/kreuzberg/src/ocr/cache.rs +469 -469
- data/vendor/kreuzberg/src/ocr/error.rs +37 -37
- data/vendor/kreuzberg/src/ocr/hocr.rs +216 -216
- data/vendor/kreuzberg/src/ocr/mod.rs +58 -58
- data/vendor/kreuzberg/src/ocr/processor.rs +863 -863
- data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -4
- data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -144
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +452 -452
- data/vendor/kreuzberg/src/ocr/types.rs +393 -393
- data/vendor/kreuzberg/src/ocr/utils.rs +47 -47
- data/vendor/kreuzberg/src/ocr/validation.rs +206 -206
- data/vendor/kreuzberg/src/panic_context.rs +154 -154
- data/vendor/kreuzberg/src/pdf/bindings.rs +44 -44
- data/vendor/kreuzberg/src/pdf/bundled.rs +346 -346
- data/vendor/kreuzberg/src/pdf/error.rs +130 -130
- data/vendor/kreuzberg/src/pdf/images.rs +139 -139
- data/vendor/kreuzberg/src/pdf/metadata.rs +489 -489
- data/vendor/kreuzberg/src/pdf/mod.rs +68 -68
- data/vendor/kreuzberg/src/pdf/rendering.rs +368 -368
- data/vendor/kreuzberg/src/pdf/table.rs +420 -420
- data/vendor/kreuzberg/src/pdf/text.rs +240 -240
- data/vendor/kreuzberg/src/plugins/extractor.rs +1044 -1044
- data/vendor/kreuzberg/src/plugins/mod.rs +212 -212
- data/vendor/kreuzberg/src/plugins/ocr.rs +639 -639
- data/vendor/kreuzberg/src/plugins/processor.rs +650 -650
- data/vendor/kreuzberg/src/plugins/registry.rs +1339 -1339
- data/vendor/kreuzberg/src/plugins/traits.rs +258 -258
- data/vendor/kreuzberg/src/plugins/validator.rs +967 -967
- data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -1470
- data/vendor/kreuzberg/src/text/mod.rs +25 -25
- data/vendor/kreuzberg/src/text/quality.rs +697 -697
- data/vendor/kreuzberg/src/text/quality_processor.rs +219 -219
- data/vendor/kreuzberg/src/text/string_utils.rs +217 -217
- data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -164
- data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -100
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -796
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -902
- data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -160
- data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -619
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -147
- data/vendor/kreuzberg/src/types.rs +1055 -1055
- data/vendor/kreuzberg/src/utils/mod.rs +17 -17
- data/vendor/kreuzberg/src/utils/quality.rs +959 -959
- data/vendor/kreuzberg/src/utils/string_utils.rs +381 -381
- data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -53
- data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -482
- data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -261
- data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -400
- data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -1205
- data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -280
- data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -425
- data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -172
- data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -622
- data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -1300
- data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -175
- data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -734
- data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -37
- data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -100
- data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -801
- data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -693
- data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -111
- data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -162
- data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -226
- data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -41
- data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -196
- data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -227
- data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -181
- data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -791
- data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -47
- data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -760
- data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -634
- data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -136
- data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -84
- data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -681
- data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -64
- data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -51
- data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -476
- data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -163
- data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -1
- data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -101
- data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -477
- data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -490
- data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -415
- data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -223
- data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -331
- data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -562
- data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -436
- data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -561
- data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -193
- data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -448
- data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -32
- data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -33
- data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -76
- data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -129
- data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -54
- data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -118
- data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -149
- data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -506
- data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -75
- data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -519
- data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -647
- data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -62
- data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -796
- data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -31
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -52
- data/vendor/kreuzberg/tests/api_tests.rs +966 -966
- data/vendor/kreuzberg/tests/archive_integration.rs +545 -545
- data/vendor/kreuzberg/tests/batch_orchestration.rs +556 -556
- data/vendor/kreuzberg/tests/batch_processing.rs +318 -318
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -421
- data/vendor/kreuzberg/tests/concurrency_stress.rs +533 -533
- data/vendor/kreuzberg/tests/config_features.rs +612 -612
- data/vendor/kreuzberg/tests/config_loading_tests.rs +416 -416
- data/vendor/kreuzberg/tests/core_integration.rs +510 -510
- data/vendor/kreuzberg/tests/csv_integration.rs +414 -414
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +500 -500
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -122
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -370
- data/vendor/kreuzberg/tests/email_integration.rs +327 -327
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -275
- data/vendor/kreuzberg/tests/error_handling.rs +402 -402
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -228
- data/vendor/kreuzberg/tests/format_integration.rs +164 -164
- data/vendor/kreuzberg/tests/helpers/mod.rs +142 -142
- data/vendor/kreuzberg/tests/html_table_test.rs +551 -551
- data/vendor/kreuzberg/tests/image_integration.rs +255 -255
- data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -139
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -639
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -704
- data/vendor/kreuzberg/tests/keywords_integration.rs +479 -479
- data/vendor/kreuzberg/tests/keywords_quality.rs +509 -509
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -496
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -490
- data/vendor/kreuzberg/tests/mime_detection.rs +429 -429
- data/vendor/kreuzberg/tests/ocr_configuration.rs +514 -514
- data/vendor/kreuzberg/tests/ocr_errors.rs +698 -698
- data/vendor/kreuzberg/tests/ocr_quality.rs +629 -629
- data/vendor/kreuzberg/tests/ocr_stress.rs +469 -469
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +674 -674
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -616
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -822
- data/vendor/kreuzberg/tests/pdf_integration.rs +45 -45
- data/vendor/kreuzberg/tests/pdfium_linking.rs +374 -374
- data/vendor/kreuzberg/tests/pipeline_integration.rs +1436 -1436
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +776 -776
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +560 -560
- data/vendor/kreuzberg/tests/plugin_system.rs +927 -927
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -783
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +587 -587
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +694 -694
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +775 -775
- data/vendor/kreuzberg/tests/security_validation.rs +416 -416
- data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -888
- data/vendor/kreuzberg/tests/test_fastembed.rs +631 -631
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1260 -1260
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +648 -648
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -87
- data/vendor/kreuzberg-ffi/Cargo.toml +3 -3
- data/vendor/kreuzberg-ffi/README.md +851 -851
- data/vendor/kreuzberg-ffi/build.rs +176 -176
- data/vendor/kreuzberg-ffi/cbindgen.toml +27 -27
- data/vendor/kreuzberg-ffi/kreuzberg-ffi-install.pc +12 -12
- data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +12 -12
- data/vendor/kreuzberg-ffi/kreuzberg.h +1087 -1087
- data/vendor/kreuzberg-ffi/src/lib.rs +3616 -3616
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +247 -247
- data/vendor/kreuzberg-ffi/tests.disabled/README.md +48 -48
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +299 -299
- data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +346 -346
- data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +232 -232
- data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +470 -470
- data/vendor/kreuzberg-tesseract/.commitlintrc.json +13 -13
- data/vendor/kreuzberg-tesseract/.crate-ignore +2 -2
- data/vendor/kreuzberg-tesseract/Cargo.lock +2933 -2933
- data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
- data/vendor/kreuzberg-tesseract/LICENSE +22 -22
- data/vendor/kreuzberg-tesseract/README.md +399 -399
- data/vendor/kreuzberg-tesseract/build.rs +1354 -1354
- data/vendor/kreuzberg-tesseract/patches/README.md +71 -71
- data/vendor/kreuzberg-tesseract/patches/tesseract.diff +199 -199
- data/vendor/kreuzberg-tesseract/src/api.rs +1371 -1371
- data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +77 -77
- data/vendor/kreuzberg-tesseract/src/enums.rs +297 -297
- data/vendor/kreuzberg-tesseract/src/error.rs +81 -81
- data/vendor/kreuzberg-tesseract/src/lib.rs +145 -145
- data/vendor/kreuzberg-tesseract/src/monitor.rs +57 -57
- data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +197 -197
- data/vendor/kreuzberg-tesseract/src/page_iterator.rs +253 -253
- data/vendor/kreuzberg-tesseract/src/result_iterator.rs +286 -286
- data/vendor/kreuzberg-tesseract/src/result_renderer.rs +183 -183
- data/vendor/kreuzberg-tesseract/tests/integration_test.rs +211 -211
- data/vendor/rb-sys/.cargo_vcs_info.json +5 -5
- data/vendor/rb-sys/Cargo.lock +393 -393
- data/vendor/rb-sys/Cargo.toml +70 -70
- data/vendor/rb-sys/Cargo.toml.orig +57 -57
- data/vendor/rb-sys/LICENSE-APACHE +190 -190
- data/vendor/rb-sys/LICENSE-MIT +21 -21
- data/vendor/rb-sys/build/features.rs +111 -111
- data/vendor/rb-sys/build/main.rs +286 -286
- data/vendor/rb-sys/build/stable_api_config.rs +155 -155
- data/vendor/rb-sys/build/version.rs +50 -50
- data/vendor/rb-sys/readme.md +36 -36
- data/vendor/rb-sys/src/bindings.rs +21 -21
- data/vendor/rb-sys/src/hidden.rs +11 -11
- data/vendor/rb-sys/src/lib.rs +35 -35
- data/vendor/rb-sys/src/macros.rs +371 -371
- data/vendor/rb-sys/src/memory.rs +53 -53
- data/vendor/rb-sys/src/ruby_abi_version.rs +38 -38
- data/vendor/rb-sys/src/special_consts.rs +31 -31
- data/vendor/rb-sys/src/stable_api/compiled.c +179 -179
- data/vendor/rb-sys/src/stable_api/compiled.rs +257 -257
- data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +324 -324
- data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +332 -332
- data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +325 -325
- data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +323 -323
- data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +339 -339
- data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +339 -339
- data/vendor/rb-sys/src/stable_api.rs +260 -260
- data/vendor/rb-sys/src/symbol.rs +31 -31
- data/vendor/rb-sys/src/tracking_allocator.rs +330 -330
- data/vendor/rb-sys/src/utils.rs +89 -89
- data/vendor/rb-sys/src/value_type.rs +7 -7
- metadata +7 -80
|
@@ -1,562 +1,574 @@
|
|
|
1
|
-
//! LibreOffice document conversion utilities.
|
|
2
|
-
//!
|
|
3
|
-
//! This module provides functions for converting legacy Microsoft Office formats
|
|
4
|
-
//! (.doc, .ppt) to modern formats using LibreOffice's headless conversion mode.
|
|
5
|
-
//!
|
|
6
|
-
//! # Features
|
|
7
|
-
//!
|
|
8
|
-
//! - **Headless conversion**: Uses `soffice --headless` for server-side conversions
|
|
9
|
-
//! - **Timeout protection**: Configurable timeout to prevent hanging conversions
|
|
10
|
-
//! - **Format detection**: Automatic output format based on input file type
|
|
11
|
-
//! - **Error handling**: Distinguishes between missing dependencies and conversion failures
|
|
12
|
-
//!
|
|
13
|
-
//! # Supported Conversions
|
|
14
|
-
//!
|
|
15
|
-
//! - `.doc` → `.docx` (Word documents)
|
|
16
|
-
//! - `.ppt` → `.pptx` (PowerPoint presentations)
|
|
17
|
-
//! - `.xls` → `.xlsx` (Excel spreadsheets) - future support
|
|
18
|
-
//!
|
|
19
|
-
//! # System Requirement
|
|
20
|
-
//!
|
|
21
|
-
//! LibreOffice must be installed and `soffice` must be in PATH:
|
|
22
|
-
//! - **macOS**: `brew install --cask libreoffice`
|
|
23
|
-
//! - **Linux**: `apt install libreoffice` or `dnf install libreoffice`
|
|
24
|
-
//! - **Windows**: `winget install LibreOffice.LibreOffice`
|
|
25
|
-
//!
|
|
26
|
-
//! # Example
|
|
27
|
-
//!
|
|
28
|
-
//! ```rust,no_run
|
|
29
|
-
//! use kreuzberg::extraction::libreoffice::{convert_office_doc, check_libreoffice_available};
|
|
30
|
-
//! use std::path::Path;
|
|
31
|
-
//!
|
|
32
|
-
//! # async fn example() -> kreuzberg::Result<()> {
|
|
33
|
-
//! // Check if LibreOffice is available
|
|
34
|
-
//! let _soffice_path = check_libreoffice_available().await?;
|
|
35
|
-
//!
|
|
36
|
-
//! // Convert .doc to .docx
|
|
37
|
-
//! let input = Path::new("legacy.doc");
|
|
38
|
-
//! let output_dir = Path::new("/tmp");
|
|
39
|
-
//! let converted = convert_office_doc(input, output_dir, "docx", 300).await?;
|
|
40
|
-
//!
|
|
41
|
-
//! println!("Converted {} bytes", converted.len());
|
|
42
|
-
//! # Ok(())
|
|
43
|
-
//! # }
|
|
44
|
-
//! ```
|
|
45
|
-
|
|
46
|
-
use crate::error::{KreuzbergError, Result};
|
|
47
|
-
use crate::types::LibreOfficeConversionResult;
|
|
48
|
-
use std::collections::HashSet;
|
|
49
|
-
use std::env;
|
|
50
|
-
use std::fs as std_fs;
|
|
51
|
-
use std::path::{Path, PathBuf};
|
|
52
|
-
use tokio::fs;
|
|
53
|
-
use tokio::process::Command;
|
|
54
|
-
use tokio::time::{Duration, timeout};
|
|
55
|
-
|
|
56
|
-
/// RAII guard for automatic temporary directory cleanup
|
|
57
|
-
struct TempDir {
|
|
58
|
-
path: PathBuf,
|
|
59
|
-
}
|
|
60
|
-
|
|
61
|
-
impl TempDir {
|
|
62
|
-
async fn new(path: PathBuf) -> Result<Self> {
|
|
63
|
-
fs::create_dir_all(&path).await?;
|
|
64
|
-
Ok(Self { path })
|
|
65
|
-
}
|
|
66
|
-
}
|
|
67
|
-
|
|
68
|
-
impl Drop for TempDir {
|
|
69
|
-
fn drop(&mut self) {
|
|
70
|
-
let path = self.path.clone();
|
|
71
|
-
tokio::spawn(async move {
|
|
72
|
-
let _ = fs::remove_dir_all(&path).await;
|
|
73
|
-
});
|
|
74
|
-
}
|
|
75
|
-
}
|
|
76
|
-
|
|
77
|
-
/// Default time limit for a LibreOffice conversion, in seconds (five minutes).
pub const DEFAULT_CONVERSION_TIMEOUT: u64 = 5 * 60;
|
|
79
|
-
|
|
80
|
-
/// Builds the help text shown when no usable LibreOffice binary is found.
///
/// The text lists per-platform install commands and mentions the
/// `KREUZBERG_LIBREOFFICE_PATH` override for custom install locations.
fn libreoffice_install_message() -> String {
    String::from(concat!(
        "LibreOffice (soffice/libreoffice) is required for legacy MS Office format support (.doc, .ppt). ",
        "Install: macOS: 'brew install --cask libreoffice', ",
        "Linux: 'apt install libreoffice', ",
        "Windows: 'winget install LibreOffice.LibreOffice'. ",
        "If LibreOffice is installed in a custom location, set the KREUZBERG_LIBREOFFICE_PATH environment variable to the soffice executable.",
    ))
}
|
|
88
|
-
|
|
89
|
-
/// Converts a filesystem path into a `file://` URI.
///
/// The path is canonicalized when possible; if canonicalization fails (for
/// example because the path does not exist) the path is used as given.
/// Non-UTF-8 path components are converted lossily.
///
/// Every byte that is not allowed in a URI path is percent-encoded, so paths
/// containing spaces, `#`, `%` or non-ASCII characters still form a valid URI.
/// Paths made only of URI-safe characters produce the same output as before.
///
/// On Windows, backslashes become forward slashes and the extended-length
/// prefix (`\\?\` or `\\?\UNC\`) that `canonicalize` adds is removed, so a
/// drive path yields `file:///C:/...` rather than `file:////?/C:/...`.
fn path_to_file_uri(path: &Path) -> String {
    const HEX_DIGITS: &[u8; 16] = b"0123456789ABCDEF";

    let canonical = std_fs::canonicalize(path).unwrap_or_else(|_| path.to_path_buf());
    let raw = canonical.to_string_lossy();

    #[cfg(windows)]
    let normalized: String = {
        let forward = raw.replace('\\', "/");
        if let Some(unc) = forward.strip_prefix("//?/UNC/") {
            // Verbatim UNC path: keep the same `//server/share` shape a plain UNC path has.
            format!("//{}", unc)
        } else {
            let local = forward.strip_prefix("//?/").unwrap_or(&forward);
            if local.starts_with('/') {
                local.to_string()
            } else {
                format!("/{}", local)
            }
        }
    };

    #[cfg(not(windows))]
    let normalized: String = raw.into_owned();

    let mut uri = String::with_capacity("file://".len() + normalized.len());
    uri.push_str("file://");
    for byte in normalized.bytes() {
        match byte {
            // Unreserved characters, sub-delimiters, ':' and '@', plus the '/' separator.
            b'A'..=b'Z'
            | b'a'..=b'z'
            | b'0'..=b'9'
            | b'-'
            | b'.'
            | b'_'
            | b'~'
            | b'/'
            | b':'
            | b'@'
            | b'!'
            | b'$'
            | b'&'
            | b'\''
            | b'('
            | b')'
            | b'*'
            | b'+'
            | b','
            | b';'
            | b'=' => uri.push(byte as char),
            other => {
                uri.push('%');
                uri.push(HEX_DIGITS[usize::from(other >> 4)] as char);
                uri.push(HEX_DIGITS[usize::from(other & 0x0F)] as char);
            }
        }
    }
    uri
}
|
|
106
|
-
|
|
107
|
-
/// Lists the paths where a LibreOffice executable may live, in priority order.
///
/// Order of candidates:
/// 1. The `KREUZBERG_LIBREOFFICE_PATH`, `SOFFICE_PATH` and `LIBREOFFICE_PATH`
///    environment variables (empty values are ignored).
/// 2. The standard application bundle paths on macOS.
/// 3. The standard `Program Files` paths on Windows.
/// 4. `<HOMEBREW_PREFIX>/bin/...` when `HOMEBREW_PREFIX` is set and non-empty.
/// 5. Every directory in `PATH`.
///
/// Duplicates are dropped, keeping the first occurrence. The function only
/// builds the list; it does not check whether any candidate exists.
fn soffice_candidates() -> Vec<PathBuf> {
    const BINARY_NAMES: [&str; 4] = ["soffice", "libreoffice", "soffice.exe", "libreoffice.exe"];

    let mut seen = HashSet::new();
    let mut candidates = Vec::new();

    let mut push_candidate = |path: PathBuf| {
        if seen.insert(path.clone()) {
            candidates.push(path);
        }
    };

    for var in ["KREUZBERG_LIBREOFFICE_PATH", "SOFFICE_PATH", "LIBREOFFICE_PATH"] {
        if let Some(value) = env::var_os(var).filter(|v| !v.is_empty()) {
            push_candidate(PathBuf::from(value));
        }
    }

    if cfg!(target_os = "macos") {
        for bundled in [
            "/Applications/LibreOffice.app/Contents/MacOS/soffice",
            "/Applications/LibreOffice.app/Contents/MacOS/libreoffice",
            "/Applications/LibreOffice.app/Contents/MacOS/soffice.bin",
        ] {
            push_candidate(PathBuf::from(bundled));
        }
    }

    if cfg!(target_os = "windows") {
        for installed in [
            "C:\\Program Files\\LibreOffice\\program\\soffice.exe",
            "C:\\Program Files\\LibreOffice\\program\\libreoffice.exe",
        ] {
            push_candidate(PathBuf::from(installed));
        }
    }

    // An empty HOMEBREW_PREFIX would yield relative `bin/...` candidates that
    // resolve against the current directory, so it is skipped like the
    // override variables above.
    if let Some(prefix) = env::var_os("HOMEBREW_PREFIX").filter(|v| !v.is_empty()) {
        let prefix_path = PathBuf::from(prefix);
        for name in BINARY_NAMES {
            push_candidate(prefix_path.join(format!("bin/{}", name)));
        }
    }

    if let Some(path_env) = env::var_os("PATH") {
        for dir in env::split_paths(&path_env) {
            for name in BINARY_NAMES {
                push_candidate(dir.join(name));
            }
        }
    }

    candidates
}
|
|
159
|
-
|
|
160
|
-
fn locate_soffice_binary() -> Result<PathBuf> {
|
|
161
|
-
for candidate in soffice_candidates() {
|
|
162
|
-
if candidate.exists()
|
|
163
|
-
&& let Ok(metadata) = std_fs::metadata(&candidate)
|
|
164
|
-
&& metadata.is_file()
|
|
165
|
-
{
|
|
166
|
-
return Ok(candidate);
|
|
167
|
-
}
|
|
168
|
-
}
|
|
169
|
-
|
|
170
|
-
Err(KreuzbergError::MissingDependency(libreoffice_install_message()))
|
|
171
|
-
}
|
|
172
|
-
|
|
173
|
-
/// Check if LibreOffice (soffice/libreoffice) is available and working
|
|
174
|
-
pub async fn check_libreoffice_available() -> Result<PathBuf> {
|
|
175
|
-
let soffice_path = locate_soffice_binary()?;
|
|
176
|
-
|
|
177
|
-
let result = Command::new(&soffice_path).arg("--version").output().await;
|
|
178
|
-
|
|
179
|
-
match result {
|
|
180
|
-
Ok(output) if output.status.success() => Ok(soffice_path),
|
|
181
|
-
Ok(_) => Err(KreuzbergError::MissingDependency(format!(
|
|
182
|
-
"LibreOffice executable '{}' responded with a failure when checking '--version'. \
|
|
183
|
-
Please reinstall LibreOffice.",
|
|
184
|
-
soffice_path.display()
|
|
185
|
-
))),
|
|
186
|
-
Err(err) => Err(KreuzbergError::MissingDependency(format!(
|
|
187
|
-
"LibreOffice executable '{}' could not be executed: {}. {help}",
|
|
188
|
-
soffice_path.display(),
|
|
189
|
-
err,
|
|
190
|
-
help = libreoffice_install_message()
|
|
191
|
-
))),
|
|
192
|
-
}
|
|
193
|
-
}
|
|
194
|
-
|
|
195
|
-
/// Convert an Office document to a target format using LibreOffice
|
|
196
|
-
pub async fn convert_office_doc(
|
|
197
|
-
input_path: &Path,
|
|
198
|
-
output_dir: &Path,
|
|
199
|
-
target_format: &str,
|
|
200
|
-
timeout_seconds: u64,
|
|
201
|
-
) -> Result<Vec<u8>> {
|
|
202
|
-
let soffice_path = check_libreoffice_available().await?;
|
|
203
|
-
|
|
204
|
-
let profile_dir = std::env::temp_dir().join(format!("kreuzberg_lo_profile_{}", uuid::Uuid::new_v4()));
|
|
205
|
-
let _profile_guard = TempDir::new(profile_dir.clone()).await?;
|
|
206
|
-
let user_install_arg = format!("-env:UserInstallation={}", path_to_file_uri(&profile_dir));
|
|
207
|
-
|
|
208
|
-
fs::create_dir_all(output_dir).await?;
|
|
209
|
-
|
|
210
|
-
let mut command = Command::new(&soffice_path);
|
|
211
|
-
command
|
|
212
|
-
.arg("--headless")
|
|
213
|
-
.arg("--nologo")
|
|
214
|
-
.arg("--norestore")
|
|
215
|
-
.arg("--nolockcheck")
|
|
216
|
-
.arg(user_install_arg)
|
|
217
|
-
.arg("--convert-to")
|
|
218
|
-
.arg(target_format)
|
|
219
|
-
.arg("--outdir")
|
|
220
|
-
.arg(output_dir)
|
|
221
|
-
.arg(input_path);
|
|
222
|
-
|
|
223
|
-
let child = command
|
|
224
|
-
.stdout(std::process::Stdio::piped())
|
|
225
|
-
.stderr(std::process::Stdio::piped())
|
|
226
|
-
.spawn()
|
|
227
|
-
.map_err(|e| {
|
|
228
|
-
KreuzbergError::parsing(format!(
|
|
229
|
-
"Failed to execute LibreOffice at '{}': {}",
|
|
230
|
-
soffice_path.display(),
|
|
231
|
-
e
|
|
232
|
-
))
|
|
233
|
-
})?;
|
|
234
|
-
|
|
235
|
-
let child_id = child.id();
|
|
236
|
-
|
|
237
|
-
let output = match timeout(Duration::from_secs(timeout_seconds), child.wait_with_output()).await {
|
|
238
|
-
Ok(Ok(output)) => output,
|
|
239
|
-
Ok(Err(e)) => {
|
|
240
|
-
return Err(KreuzbergError::parsing(format!(
|
|
241
|
-
"Failed to wait for LibreOffice: {}",
|
|
242
|
-
e
|
|
243
|
-
)));
|
|
244
|
-
}
|
|
245
|
-
Err(_) => {
|
|
246
|
-
// Timeout occurred - wait_with_output was cancelled, child is dropped and killed automatically ~keep
|
|
247
|
-
return Err(KreuzbergError::parsing(format!(
|
|
248
|
-
"LibreOffice conversion timed out after {} seconds (PID: {:?})",
|
|
249
|
-
timeout_seconds, child_id
|
|
250
|
-
)));
|
|
251
|
-
}
|
|
252
|
-
};
|
|
253
|
-
|
|
254
|
-
if !output.status.success() {
|
|
255
|
-
let stderr = String::from_utf8_lossy(&output.stderr);
|
|
256
|
-
let stdout = String::from_utf8_lossy(&output.stdout);
|
|
257
|
-
|
|
258
|
-
let mut error_details = format!(
|
|
259
|
-
"LibreOffice process failed with return code {}",
|
|
260
|
-
output.status.code().unwrap_or(-1)
|
|
261
|
-
);
|
|
262
|
-
|
|
263
|
-
if !stderr.is_empty() {
|
|
264
|
-
error_details.push_str(&format!("\nSTDERR: {}", stderr.trim()));
|
|
265
|
-
}
|
|
266
|
-
if !stdout.is_empty() {
|
|
267
|
-
error_details.push_str(&format!("\nSTDOUT: {}", stdout.trim()));
|
|
268
|
-
}
|
|
269
|
-
if stderr.is_empty() && stdout.is_empty() {
|
|
270
|
-
error_details.push_str("\n(no output from LibreOffice process)");
|
|
271
|
-
}
|
|
272
|
-
|
|
273
|
-
// Subprocess error analysis - wrap only if format/parsing error detected ~keep
|
|
274
|
-
let stderr_lower = stderr.to_lowercase();
|
|
275
|
-
let stdout_lower = stdout.to_lowercase();
|
|
276
|
-
let keywords = ["format", "unsupported", "error:", "failed"];
|
|
277
|
-
|
|
278
|
-
if keywords
|
|
279
|
-
.iter()
|
|
280
|
-
.any(|k| stderr_lower.contains(k) || stdout_lower.contains(k))
|
|
281
|
-
{
|
|
282
|
-
return Err(KreuzbergError::parsing(error_details));
|
|
283
|
-
}
|
|
284
|
-
|
|
285
|
-
// True system error - bubble up for user reporting ~keep
|
|
286
|
-
return Err(KreuzbergError::Io(std::io::Error::other(error_details)));
|
|
287
|
-
}
|
|
288
|
-
|
|
289
|
-
let input_stem = input_path
|
|
290
|
-
.file_stem()
|
|
291
|
-
.ok_or_else(|| KreuzbergError::parsing("Invalid input file name".to_string()))?;
|
|
292
|
-
|
|
293
|
-
let expected_output = output_dir.join(format!("{}.{}", input_stem.to_string_lossy(), target_format));
|
|
294
|
-
|
|
295
|
-
let converted_bytes = fs::read(&expected_output).await.map_err(|e| {
|
|
296
|
-
KreuzbergError::parsing(format!(
|
|
297
|
-
"LibreOffice conversion completed but output file not found: {}",
|
|
298
|
-
e
|
|
299
|
-
))
|
|
300
|
-
})?;
|
|
301
|
-
|
|
302
|
-
if converted_bytes.is_empty() {
|
|
303
|
-
return Err(KreuzbergError::parsing(
|
|
304
|
-
"LibreOffice conversion produced empty file".to_string(),
|
|
305
|
-
));
|
|
306
|
-
}
|
|
307
|
-
|
|
308
|
-
Ok(converted_bytes)
|
|
309
|
-
}
|
|
310
|
-
|
|
311
|
-
/// Convert .doc to .docx using LibreOffice
|
|
312
|
-
pub async fn convert_doc_to_docx(doc_bytes: &[u8]) -> Result<LibreOfficeConversionResult> {
|
|
313
|
-
let temp_dir = std::env::temp_dir();
|
|
314
|
-
let unique_id = uuid::Uuid::new_v4();
|
|
315
|
-
let input_dir_path = temp_dir.join(format!("kreuzberg_doc_{}", unique_id));
|
|
316
|
-
let output_dir_path = temp_dir.join(format!("kreuzberg_doc_{}_out", unique_id));
|
|
317
|
-
|
|
318
|
-
// RAII guards ensure cleanup on all paths including panic ~keep
|
|
319
|
-
let _input_guard = TempDir::new(input_dir_path.clone()).await?;
|
|
320
|
-
let _output_guard = TempDir::new(output_dir_path.clone()).await?;
|
|
321
|
-
|
|
322
|
-
let input_path = input_dir_path.join("input.doc");
|
|
323
|
-
fs::write(&input_path, doc_bytes).await?;
|
|
324
|
-
|
|
325
|
-
let converted_bytes = convert_office_doc(&input_path, &output_dir_path, "docx", DEFAULT_CONVERSION_TIMEOUT).await?;
|
|
326
|
-
|
|
327
|
-
Ok(LibreOfficeConversionResult {
|
|
328
|
-
converted_bytes,
|
|
329
|
-
original_format: "doc".to_string(),
|
|
330
|
-
target_format: "docx".to_string(),
|
|
331
|
-
target_mime: crate::core::mime::DOCX_MIME_TYPE.to_string(),
|
|
332
|
-
})
|
|
333
|
-
}
|
|
334
|
-
|
|
335
|
-
/// Convert .ppt to .pptx using LibreOffice
|
|
336
|
-
pub async fn convert_ppt_to_pptx(ppt_bytes: &[u8]) -> Result<LibreOfficeConversionResult> {
|
|
337
|
-
let temp_dir = std::env::temp_dir();
|
|
338
|
-
let unique_id = uuid::Uuid::new_v4();
|
|
339
|
-
let input_dir_path = temp_dir.join(format!("kreuzberg_ppt_{}", unique_id));
|
|
340
|
-
let output_dir_path = temp_dir.join(format!("kreuzberg_ppt_{}_out", unique_id));
|
|
341
|
-
|
|
342
|
-
// RAII guards ensure cleanup on all paths including panic ~keep
|
|
343
|
-
let _input_guard = TempDir::new(input_dir_path.clone()).await?;
|
|
344
|
-
let _output_guard = TempDir::new(output_dir_path.clone()).await?;
|
|
345
|
-
|
|
346
|
-
let input_path = input_dir_path.join("input.ppt");
|
|
347
|
-
fs::write(&input_path, ppt_bytes).await?;
|
|
348
|
-
|
|
349
|
-
let converted_bytes = convert_office_doc(&input_path, &output_dir_path, "pptx", DEFAULT_CONVERSION_TIMEOUT).await?;
|
|
350
|
-
|
|
351
|
-
Ok(LibreOfficeConversionResult {
|
|
352
|
-
converted_bytes,
|
|
353
|
-
original_format: "ppt".to_string(),
|
|
354
|
-
target_format: "pptx".to_string(),
|
|
355
|
-
target_mime: crate::core::mime::POWER_POINT_MIME_TYPE.to_string(),
|
|
356
|
-
})
|
|
357
|
-
}
|
|
358
|
-
|
|
359
|
-
#[cfg(test)]
|
|
360
|
-
mod tests {
|
|
361
|
-
use super::*;
|
|
362
|
-
|
|
363
|
-
#[tokio::test]
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
let
|
|
381
|
-
|
|
382
|
-
let
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
let
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
|
|
455
|
-
let
|
|
456
|
-
let
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
let
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
let
|
|
492
|
-
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
|
|
504
|
-
|
|
505
|
-
|
|
506
|
-
|
|
507
|
-
|
|
508
|
-
|
|
509
|
-
|
|
510
|
-
|
|
511
|
-
|
|
512
|
-
|
|
513
|
-
|
|
514
|
-
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
|
|
518
|
-
|
|
519
|
-
#[
|
|
520
|
-
async fn
|
|
521
|
-
if check_libreoffice_available().await.is_err() {
|
|
522
|
-
return;
|
|
523
|
-
}
|
|
524
|
-
|
|
525
|
-
let
|
|
526
|
-
let _result =
|
|
527
|
-
}
|
|
528
|
-
|
|
529
|
-
#[tokio::test]
|
|
530
|
-
|
|
531
|
-
|
|
532
|
-
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
let
|
|
537
|
-
let
|
|
538
|
-
|
|
539
|
-
|
|
540
|
-
|
|
541
|
-
|
|
542
|
-
|
|
543
|
-
|
|
544
|
-
|
|
545
|
-
|
|
546
|
-
|
|
547
|
-
|
|
548
|
-
|
|
549
|
-
|
|
550
|
-
|
|
551
|
-
|
|
552
|
-
|
|
553
|
-
|
|
554
|
-
|
|
555
|
-
|
|
556
|
-
|
|
557
|
-
|
|
558
|
-
|
|
559
|
-
|
|
560
|
-
|
|
561
|
-
|
|
562
|
-
|
|
1
|
+
//! LibreOffice document conversion utilities.
|
|
2
|
+
//!
|
|
3
|
+
//! This module provides functions for converting legacy Microsoft Office formats
|
|
4
|
+
//! (.doc, .ppt) to modern formats using LibreOffice's headless conversion mode.
|
|
5
|
+
//!
|
|
6
|
+
//! # Features
|
|
7
|
+
//!
|
|
8
|
+
//! - **Headless conversion**: Uses `soffice --headless` for server-side conversions
|
|
9
|
+
//! - **Timeout protection**: Configurable timeout to prevent hanging conversions
|
|
10
|
+
//! - **Explicit target format**: The caller chooses the output format that is passed to `--convert-to`
|
|
11
|
+
//! - **Error handling**: Distinguishes between missing dependencies and conversion failures
|
|
12
|
+
//!
|
|
13
|
+
//! # Supported Conversions
|
|
14
|
+
//!
|
|
15
|
+
//! - `.doc` → `.docx` (Word documents)
|
|
16
|
+
//! - `.ppt` → `.pptx` (PowerPoint presentations)
|
|
17
|
+
//! - `.xls` → `.xlsx` (Excel spreadsheets) - future support
|
|
18
|
+
//!
|
|
19
|
+
//! # System Requirement
|
|
20
|
+
//!
|
|
21
|
+
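//! LibreOffice must be installed. The `soffice`/`libreoffice` executable is looked up via the
//! `KREUZBERG_LIBREOFFICE_PATH`, `SOFFICE_PATH` and `LIBREOFFICE_PATH` environment variables,
//! well-known install locations, `HOMEBREW_PREFIX`, and finally `PATH`: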
|
|
22
|
+
//! - **macOS**: `brew install --cask libreoffice`
|
|
23
|
+
//! - **Linux**: `apt install libreoffice` or `dnf install libreoffice`
|
|
24
|
+
//! - **Windows**: `winget install LibreOffice.LibreOffice`
|
|
25
|
+
//!
|
|
26
|
+
//! # Example
|
|
27
|
+
//!
|
|
28
|
+
//! ```rust,no_run
|
|
29
|
+
//! use kreuzberg::extraction::libreoffice::{convert_office_doc, check_libreoffice_available};
|
|
30
|
+
//! use std::path::Path;
|
|
31
|
+
//!
|
|
32
|
+
//! # async fn example() -> kreuzberg::Result<()> {
|
|
33
|
+
//! // Check if LibreOffice is available
|
|
34
|
+
//! let _soffice_path = check_libreoffice_available().await?;
|
|
35
|
+
//!
|
|
36
|
+
//! // Convert .doc to .docx
|
|
37
|
+
//! let input = Path::new("legacy.doc");
|
|
38
|
+
//! let output_dir = Path::new("/tmp");
|
|
39
|
+
//! let converted = convert_office_doc(input, output_dir, "docx", 300).await?;
|
|
40
|
+
//!
|
|
41
|
+
//! println!("Converted {} bytes", converted.len());
|
|
42
|
+
//! # Ok(())
|
|
43
|
+
//! # }
|
|
44
|
+
//! ```
|
|
45
|
+
|
|
46
|
+
use crate::error::{KreuzbergError, Result};
|
|
47
|
+
use crate::types::LibreOfficeConversionResult;
|
|
48
|
+
use std::collections::HashSet;
|
|
49
|
+
use std::env;
|
|
50
|
+
use std::fs as std_fs;
|
|
51
|
+
use std::path::{Path, PathBuf};
|
|
52
|
+
use tokio::fs;
|
|
53
|
+
use tokio::process::Command;
|
|
54
|
+
use tokio::time::{Duration, timeout};
|
|
55
|
+
|
|
56
|
+
/// RAII guard for automatic temporary directory cleanup
|
|
57
|
+
struct TempDir {
|
|
58
|
+
path: PathBuf,
|
|
59
|
+
}
|
|
60
|
+
|
|
61
|
+
impl TempDir {
|
|
62
|
+
async fn new(path: PathBuf) -> Result<Self> {
|
|
63
|
+
fs::create_dir_all(&path).await?;
|
|
64
|
+
Ok(Self { path })
|
|
65
|
+
}
|
|
66
|
+
}
|
|
67
|
+
|
|
68
|
+
impl Drop for TempDir {
|
|
69
|
+
fn drop(&mut self) {
|
|
70
|
+
let path = self.path.clone();
|
|
71
|
+
tokio::spawn(async move {
|
|
72
|
+
let _ = fs::remove_dir_all(&path).await;
|
|
73
|
+
});
|
|
74
|
+
}
|
|
75
|
+
}
|
|
76
|
+
|
|
77
|
+
/// Default timeout, in seconds, for a single LibreOffice conversion (300 s = 5 minutes).
|
|
78
|
+
pub const DEFAULT_CONVERSION_TIMEOUT: u64 = 300;
|
|
79
|
+
|
|
80
|
+
/// Builds the help text reported when no usable LibreOffice executable is available.
///
/// The message lists per-platform install commands and mentions the
/// `KREUZBERG_LIBREOFFICE_PATH` override for custom install locations.
fn libreoffice_install_message() -> String {
    // Sentences are joined with a single space to form one line of text.
    [
        "LibreOffice (soffice/libreoffice) is required for legacy MS Office format support (.doc, .ppt).",
        "Install: macOS: 'brew install --cask libreoffice',",
        "Linux: 'apt install libreoffice',",
        "Windows: 'winget install LibreOffice.LibreOffice'.",
        "If LibreOffice is installed in a custom location, set the KREUZBERG_LIBREOFFICE_PATH environment variable to the soffice executable.",
    ]
    .join(" ")
}
|
|
88
|
+
|
|
89
|
+
/// Renders `path` as a `file://` URI string.
///
/// The path is canonicalized first; if canonicalization fails (for example
/// because the path does not exist) the path is used as given. No
/// percent-encoding is applied.
fn path_to_file_uri(path: &Path) -> String {
    let canonical = std_fs::canonicalize(path).unwrap_or_else(|_| path.to_path_buf());

    #[cfg(windows)]
    {
        let normalized = canonical.to_string_lossy().replace('\\', "/");

        // `canonicalize` on Windows yields extended-length ("verbatim") paths
        // such as `\\?\C:\dir` or `\\?\UNC\server\share`. The `//?/` marker is
        // not valid inside a file URI, so strip it before building the URI.
        if let Some(unc) = normalized.strip_prefix("//?/UNC/") {
            // \\?\UNC\server\share\dir -> file://server/share/dir
            format!("file://{}", unc)
        } else {
            let plain = normalized.strip_prefix("//?/").unwrap_or(&normalized);
            if plain.starts_with('/') {
                format!("file://{}", plain)
            } else {
                // Drive-letter paths need the empty-authority form: file:///C:/dir
                format!("file:///{}", plain)
            }
        }
    }

    #[cfg(not(windows))]
    {
        format!("file://{}", canonical.to_string_lossy())
    }
}
|
|
106
|
+
|
|
107
|
+
/// Lists the filesystem locations where a LibreOffice executable may live.
///
/// Candidates are produced in priority order: explicit environment overrides,
/// platform-specific default install locations, the Homebrew prefix, and
/// finally every directory on `PATH`. Duplicates are dropped, keeping the first
/// occurrence. The paths are not checked for existence here.
fn soffice_candidates() -> Vec<PathBuf> {
    const ENV_OVERRIDES: [&str; 3] = ["KREUZBERG_LIBREOFFICE_PATH", "SOFFICE_PATH", "LIBREOFFICE_PATH"];
    const MACOS_LOCATIONS: [&str; 3] = [
        "/Applications/LibreOffice.app/Contents/MacOS/soffice",
        "/Applications/LibreOffice.app/Contents/MacOS/libreoffice",
        "/Applications/LibreOffice.app/Contents/MacOS/soffice.bin",
    ];
    const WINDOWS_LOCATIONS: [&str; 2] = [
        "C:\\Program Files\\LibreOffice\\program\\soffice.exe",
        "C:\\Program Files\\LibreOffice\\program\\libreoffice.exe",
    ];
    const BINARY_NAMES: [&str; 4] = ["soffice", "libreoffice", "soffice.exe", "libreoffice.exe"];

    let mut ordered: Vec<PathBuf> = Vec::new();

    // Explicit overrides win; empty values are treated as unset.
    for name in ENV_OVERRIDES {
        match env::var_os(name) {
            Some(value) if !value.is_empty() => ordered.push(PathBuf::from(value)),
            _ => {}
        }
    }

    if cfg!(target_os = "macos") {
        ordered.extend(MACOS_LOCATIONS.iter().copied().map(PathBuf::from));
    }

    if cfg!(target_os = "windows") {
        ordered.extend(WINDOWS_LOCATIONS.iter().copied().map(PathBuf::from));
    }

    if let Some(prefix) = env::var_os("HOMEBREW_PREFIX") {
        let prefix_path = PathBuf::from(prefix);
        ordered.extend(
            BINARY_NAMES
                .iter()
                .map(|binary| prefix_path.join(format!("bin/{}", binary))),
        );
    }

    if let Some(path_env) = env::var_os("PATH") {
        for dir in env::split_paths(&path_env) {
            ordered.extend(BINARY_NAMES.iter().map(|binary| dir.join(binary)));
        }
    }

    // Order-preserving de-duplication: `insert` returns false for repeats.
    let mut seen = HashSet::new();
    ordered.retain(|candidate| seen.insert(candidate.clone()));
    ordered
}
|
|
159
|
+
|
|
160
|
+
fn locate_soffice_binary() -> Result<PathBuf> {
|
|
161
|
+
for candidate in soffice_candidates() {
|
|
162
|
+
if candidate.exists()
|
|
163
|
+
&& let Ok(metadata) = std_fs::metadata(&candidate)
|
|
164
|
+
&& metadata.is_file()
|
|
165
|
+
{
|
|
166
|
+
return Ok(candidate);
|
|
167
|
+
}
|
|
168
|
+
}
|
|
169
|
+
|
|
170
|
+
Err(KreuzbergError::MissingDependency(libreoffice_install_message()))
|
|
171
|
+
}
|
|
172
|
+
|
|
173
|
+
/// Check if LibreOffice (soffice/libreoffice) is available and working
|
|
174
|
+
pub async fn check_libreoffice_available() -> Result<PathBuf> {
|
|
175
|
+
let soffice_path = locate_soffice_binary()?;
|
|
176
|
+
|
|
177
|
+
let result = Command::new(&soffice_path).arg("--version").output().await;
|
|
178
|
+
|
|
179
|
+
match result {
|
|
180
|
+
Ok(output) if output.status.success() => Ok(soffice_path),
|
|
181
|
+
Ok(_) => Err(KreuzbergError::MissingDependency(format!(
|
|
182
|
+
"LibreOffice executable '{}' responded with a failure when checking '--version'. \
|
|
183
|
+
Please reinstall LibreOffice.",
|
|
184
|
+
soffice_path.display()
|
|
185
|
+
))),
|
|
186
|
+
Err(err) => Err(KreuzbergError::MissingDependency(format!(
|
|
187
|
+
"LibreOffice executable '{}' could not be executed: {}. {help}",
|
|
188
|
+
soffice_path.display(),
|
|
189
|
+
err,
|
|
190
|
+
help = libreoffice_install_message()
|
|
191
|
+
))),
|
|
192
|
+
}
|
|
193
|
+
}
|
|
194
|
+
|
|
195
|
+
/// Convert an Office document to a target format using LibreOffice
|
|
196
|
+
pub async fn convert_office_doc(
|
|
197
|
+
input_path: &Path,
|
|
198
|
+
output_dir: &Path,
|
|
199
|
+
target_format: &str,
|
|
200
|
+
timeout_seconds: u64,
|
|
201
|
+
) -> Result<Vec<u8>> {
|
|
202
|
+
let soffice_path = check_libreoffice_available().await?;
|
|
203
|
+
|
|
204
|
+
let profile_dir = std::env::temp_dir().join(format!("kreuzberg_lo_profile_{}", uuid::Uuid::new_v4()));
|
|
205
|
+
let _profile_guard = TempDir::new(profile_dir.clone()).await?;
|
|
206
|
+
let user_install_arg = format!("-env:UserInstallation={}", path_to_file_uri(&profile_dir));
|
|
207
|
+
|
|
208
|
+
fs::create_dir_all(output_dir).await?;
|
|
209
|
+
|
|
210
|
+
let mut command = Command::new(&soffice_path);
|
|
211
|
+
command
|
|
212
|
+
.arg("--headless")
|
|
213
|
+
.arg("--nologo")
|
|
214
|
+
.arg("--norestore")
|
|
215
|
+
.arg("--nolockcheck")
|
|
216
|
+
.arg(user_install_arg)
|
|
217
|
+
.arg("--convert-to")
|
|
218
|
+
.arg(target_format)
|
|
219
|
+
.arg("--outdir")
|
|
220
|
+
.arg(output_dir)
|
|
221
|
+
.arg(input_path);
|
|
222
|
+
|
|
223
|
+
let child = command
|
|
224
|
+
.stdout(std::process::Stdio::piped())
|
|
225
|
+
.stderr(std::process::Stdio::piped())
|
|
226
|
+
.spawn()
|
|
227
|
+
.map_err(|e| {
|
|
228
|
+
KreuzbergError::parsing(format!(
|
|
229
|
+
"Failed to execute LibreOffice at '{}': {}",
|
|
230
|
+
soffice_path.display(),
|
|
231
|
+
e
|
|
232
|
+
))
|
|
233
|
+
})?;
|
|
234
|
+
|
|
235
|
+
let child_id = child.id();
|
|
236
|
+
|
|
237
|
+
let output = match timeout(Duration::from_secs(timeout_seconds), child.wait_with_output()).await {
|
|
238
|
+
Ok(Ok(output)) => output,
|
|
239
|
+
Ok(Err(e)) => {
|
|
240
|
+
return Err(KreuzbergError::parsing(format!(
|
|
241
|
+
"Failed to wait for LibreOffice: {}",
|
|
242
|
+
e
|
|
243
|
+
)));
|
|
244
|
+
}
|
|
245
|
+
Err(_) => {
|
|
246
|
+
// Timeout occurred - wait_with_output was cancelled, child is dropped and killed automatically ~keep
|
|
247
|
+
return Err(KreuzbergError::parsing(format!(
|
|
248
|
+
"LibreOffice conversion timed out after {} seconds (PID: {:?})",
|
|
249
|
+
timeout_seconds, child_id
|
|
250
|
+
)));
|
|
251
|
+
}
|
|
252
|
+
};
|
|
253
|
+
|
|
254
|
+
if !output.status.success() {
|
|
255
|
+
let stderr = String::from_utf8_lossy(&output.stderr);
|
|
256
|
+
let stdout = String::from_utf8_lossy(&output.stdout);
|
|
257
|
+
|
|
258
|
+
let mut error_details = format!(
|
|
259
|
+
"LibreOffice process failed with return code {}",
|
|
260
|
+
output.status.code().unwrap_or(-1)
|
|
261
|
+
);
|
|
262
|
+
|
|
263
|
+
if !stderr.is_empty() {
|
|
264
|
+
error_details.push_str(&format!("\nSTDERR: {}", stderr.trim()));
|
|
265
|
+
}
|
|
266
|
+
if !stdout.is_empty() {
|
|
267
|
+
error_details.push_str(&format!("\nSTDOUT: {}", stdout.trim()));
|
|
268
|
+
}
|
|
269
|
+
if stderr.is_empty() && stdout.is_empty() {
|
|
270
|
+
error_details.push_str("\n(no output from LibreOffice process)");
|
|
271
|
+
}
|
|
272
|
+
|
|
273
|
+
// Subprocess error analysis - wrap only if format/parsing error detected ~keep
|
|
274
|
+
let stderr_lower = stderr.to_lowercase();
|
|
275
|
+
let stdout_lower = stdout.to_lowercase();
|
|
276
|
+
let keywords = ["format", "unsupported", "error:", "failed"];
|
|
277
|
+
|
|
278
|
+
if keywords
|
|
279
|
+
.iter()
|
|
280
|
+
.any(|k| stderr_lower.contains(k) || stdout_lower.contains(k))
|
|
281
|
+
{
|
|
282
|
+
return Err(KreuzbergError::parsing(error_details));
|
|
283
|
+
}
|
|
284
|
+
|
|
285
|
+
// True system error - bubble up for user reporting ~keep
|
|
286
|
+
return Err(KreuzbergError::Io(std::io::Error::other(error_details)));
|
|
287
|
+
}
|
|
288
|
+
|
|
289
|
+
let input_stem = input_path
|
|
290
|
+
.file_stem()
|
|
291
|
+
.ok_or_else(|| KreuzbergError::parsing("Invalid input file name".to_string()))?;
|
|
292
|
+
|
|
293
|
+
let expected_output = output_dir.join(format!("{}.{}", input_stem.to_string_lossy(), target_format));
|
|
294
|
+
|
|
295
|
+
let converted_bytes = fs::read(&expected_output).await.map_err(|e| {
|
|
296
|
+
KreuzbergError::parsing(format!(
|
|
297
|
+
"LibreOffice conversion completed but output file not found: {}",
|
|
298
|
+
e
|
|
299
|
+
))
|
|
300
|
+
})?;
|
|
301
|
+
|
|
302
|
+
if converted_bytes.is_empty() {
|
|
303
|
+
return Err(KreuzbergError::parsing(
|
|
304
|
+
"LibreOffice conversion produced empty file".to_string(),
|
|
305
|
+
));
|
|
306
|
+
}
|
|
307
|
+
|
|
308
|
+
Ok(converted_bytes)
|
|
309
|
+
}
|
|
310
|
+
|
|
311
|
+
/// Convert .doc to .docx using LibreOffice
|
|
312
|
+
pub async fn convert_doc_to_docx(doc_bytes: &[u8]) -> Result<LibreOfficeConversionResult> {
|
|
313
|
+
let temp_dir = std::env::temp_dir();
|
|
314
|
+
let unique_id = uuid::Uuid::new_v4();
|
|
315
|
+
let input_dir_path = temp_dir.join(format!("kreuzberg_doc_{}", unique_id));
|
|
316
|
+
let output_dir_path = temp_dir.join(format!("kreuzberg_doc_{}_out", unique_id));
|
|
317
|
+
|
|
318
|
+
// RAII guards ensure cleanup on all paths including panic ~keep
|
|
319
|
+
let _input_guard = TempDir::new(input_dir_path.clone()).await?;
|
|
320
|
+
let _output_guard = TempDir::new(output_dir_path.clone()).await?;
|
|
321
|
+
|
|
322
|
+
let input_path = input_dir_path.join("input.doc");
|
|
323
|
+
fs::write(&input_path, doc_bytes).await?;
|
|
324
|
+
|
|
325
|
+
let converted_bytes = convert_office_doc(&input_path, &output_dir_path, "docx", DEFAULT_CONVERSION_TIMEOUT).await?;
|
|
326
|
+
|
|
327
|
+
Ok(LibreOfficeConversionResult {
|
|
328
|
+
converted_bytes,
|
|
329
|
+
original_format: "doc".to_string(),
|
|
330
|
+
target_format: "docx".to_string(),
|
|
331
|
+
target_mime: crate::core::mime::DOCX_MIME_TYPE.to_string(),
|
|
332
|
+
})
|
|
333
|
+
}
|
|
334
|
+
|
|
335
|
+
/// Convert .ppt to .pptx using LibreOffice
|
|
336
|
+
pub async fn convert_ppt_to_pptx(ppt_bytes: &[u8]) -> Result<LibreOfficeConversionResult> {
|
|
337
|
+
let temp_dir = std::env::temp_dir();
|
|
338
|
+
let unique_id = uuid::Uuid::new_v4();
|
|
339
|
+
let input_dir_path = temp_dir.join(format!("kreuzberg_ppt_{}", unique_id));
|
|
340
|
+
let output_dir_path = temp_dir.join(format!("kreuzberg_ppt_{}_out", unique_id));
|
|
341
|
+
|
|
342
|
+
// RAII guards ensure cleanup on all paths including panic ~keep
|
|
343
|
+
let _input_guard = TempDir::new(input_dir_path.clone()).await?;
|
|
344
|
+
let _output_guard = TempDir::new(output_dir_path.clone()).await?;
|
|
345
|
+
|
|
346
|
+
let input_path = input_dir_path.join("input.ppt");
|
|
347
|
+
fs::write(&input_path, ppt_bytes).await?;
|
|
348
|
+
|
|
349
|
+
let converted_bytes = convert_office_doc(&input_path, &output_dir_path, "pptx", DEFAULT_CONVERSION_TIMEOUT).await?;
|
|
350
|
+
|
|
351
|
+
Ok(LibreOfficeConversionResult {
|
|
352
|
+
converted_bytes,
|
|
353
|
+
original_format: "ppt".to_string(),
|
|
354
|
+
target_format: "pptx".to_string(),
|
|
355
|
+
target_mime: crate::core::mime::POWER_POINT_MIME_TYPE.to_string(),
|
|
356
|
+
})
|
|
357
|
+
}
|
|
358
|
+
|
|
359
|
+
#[cfg(test)]
mod tests {
    //! Tests for the LibreOffice helpers.
    //!
    //! Tests that need a real LibreOffice installation return early when none
    //! is available, so the suite passes on machines without it.

    use super::*;

    /// Returns a fresh path under the system temp directory. A UUID suffix
    /// keeps fixtures from colliding when tests run in parallel or repeatedly.
    fn unique_temp_path(label: &str) -> PathBuf {
        std::env::temp_dir().join(format!("kreuzberg_test_{}_{}", label, uuid::Uuid::new_v4()))
    }

    /// True when LibreOffice cannot be used on this machine.
    async fn libreoffice_missing() -> bool {
        check_libreoffice_available().await.is_err()
    }

    #[tokio::test]
    #[cfg(not(target_os = "windows"))]
    async fn test_check_libreoffice_available() {
        // When LibreOffice is found, the reported path must be a real file.
        if let Ok(path) = check_libreoffice_available().await {
            assert!(path.is_file());
        }
    }

    #[tokio::test]
    #[cfg(not(target_os = "windows"))]
    async fn test_convert_office_doc_missing_file() {
        if libreoffice_missing().await {
            return;
        }

        let output_dir = unique_temp_path("missing_file_out");
        // The parent directory is never created, so the input cannot exist.
        let non_existent = unique_temp_path("missing_file_in").join("nonexistent.doc");

        let result = convert_office_doc(&non_existent, &output_dir, "docx", 10).await;

        let _ = fs::remove_dir_all(&output_dir).await;
        assert!(result.is_err());
    }

    #[test]
    fn test_default_conversion_timeout_value() {
        assert_eq!(DEFAULT_CONVERSION_TIMEOUT, 300);
    }

    #[tokio::test]
    #[cfg(not(target_os = "windows"))]
    async fn test_convert_doc_to_docx_empty_bytes() {
        if libreoffice_missing().await {
            return;
        }

        // Smoke test: the outcome depends on the LibreOffice version, the call
        // only has to return without panicking.
        let _ = convert_doc_to_docx(b"").await;
    }

    #[tokio::test]
    #[cfg(not(target_os = "windows"))]
    async fn test_convert_ppt_to_pptx_empty_bytes() {
        if libreoffice_missing().await {
            return;
        }

        // Smoke test, see `test_convert_doc_to_docx_empty_bytes`.
        let _ = convert_ppt_to_pptx(b"").await;
    }

    #[tokio::test]
    #[cfg(not(target_os = "windows"))]
    async fn test_convert_doc_to_docx_invalid_doc() {
        if libreoffice_missing().await {
            return;
        }

        // Smoke test: the result is intentionally not asserted.
        let _ = convert_doc_to_docx(b"This is not a valid .doc file").await;
    }

    #[tokio::test]
    #[cfg(not(target_os = "windows"))]
    async fn test_convert_ppt_to_pptx_invalid_ppt() {
        if libreoffice_missing().await {
            return;
        }

        let result = convert_ppt_to_pptx(b"This is not a valid .ppt file").await;

        assert!(result.is_err());
    }

    #[tokio::test]
    #[cfg(not(target_os = "windows"))]
    async fn test_convert_office_doc_invalid_target_format() {
        if libreoffice_missing().await {
            return;
        }

        let input_path = unique_temp_path("invalid_format_in").with_extension("txt");
        let output_dir = unique_temp_path("invalid_format_out");

        fs::write(&input_path, b"test content").await.unwrap();

        // Smoke test: only checks that an unknown format does not panic.
        let _ = convert_office_doc(&input_path, &output_dir, "invalid_format", 10).await;

        let _ = fs::remove_file(&input_path).await;
        let _ = fs::remove_dir_all(&output_dir).await;
    }

    #[tokio::test]
    #[cfg(not(target_os = "windows"))]
    async fn test_check_libreoffice_missing_dependency_error() {
        // Only meaningful when LibreOffice is absent: the error must then be a
        // MissingDependency that names the dependency.
        if let Err(err) = check_libreoffice_available().await {
            match err {
                KreuzbergError::MissingDependency(msg) => {
                    assert!(msg.contains("LibreOffice") || msg.contains("soffice"));
                }
                _ => panic!("Expected MissingDependency error"),
            }
        }
    }

    #[tokio::test]
    #[cfg(not(target_os = "windows"))]
    async fn test_convert_office_doc_creates_output_dir() {
        if libreoffice_missing().await {
            return;
        }

        let output_dir = unique_temp_path("create_output_out");
        assert!(!output_dir.exists());

        let input_path = unique_temp_path("create_output_in").with_extension("txt");
        fs::write(&input_path, b"test").await.unwrap();

        let _ = convert_office_doc(&input_path, &output_dir, "pdf", 10).await;

        // The directory is created before LibreOffice is launched, so it must
        // exist regardless of how the conversion itself went.
        let created = output_dir.exists();

        let _ = fs::remove_file(&input_path).await;
        let _ = fs::remove_dir_all(&output_dir).await;

        assert!(created);
    }

    #[test]
    fn test_conversion_result_structure() {
        let result = LibreOfficeConversionResult {
            converted_bytes: vec![1, 2, 3],
            original_format: "doc".to_string(),
            target_format: "docx".to_string(),
            target_mime: crate::core::mime::DOCX_MIME_TYPE.to_string(),
        };

        assert_eq!(result.original_format, "doc");
        assert_eq!(result.target_format, "docx");
        assert_eq!(result.converted_bytes.len(), 3);
    }

    #[tokio::test]
    #[cfg(not(target_os = "windows"))]
    async fn test_convert_doc_to_docx_temp_cleanup() {
        if libreoffice_missing().await {
            return;
        }

        // Exercises the temp-directory guards on a (likely) failing conversion.
        let _ = convert_doc_to_docx(b"invalid doc content").await;
    }

    #[tokio::test]
    #[cfg(not(target_os = "windows"))]
    async fn test_convert_ppt_to_pptx_temp_cleanup() {
        if libreoffice_missing().await {
            return;
        }

        // Exercises the temp-directory guards on a (likely) failing conversion.
        let _ = convert_ppt_to_pptx(b"invalid ppt content").await;
    }

    #[tokio::test]
    #[cfg(not(target_os = "windows"))]
    async fn test_convert_office_doc_timeout_kills_process() {
        if libreoffice_missing().await {
            return;
        }

        let input_path = unique_temp_path("timeout_in").with_extension("txt");
        let output_dir = unique_temp_path("timeout_out");

        fs::write(&input_path, b"test content").await.unwrap();

        // A zero-second budget forces the timeout branch.
        let result = convert_office_doc(&input_path, &output_dir, "pdf", 0).await;

        let _ = fs::remove_file(&input_path).await;
        let _ = fs::remove_dir_all(&output_dir).await;

        assert!(result.is_err());
    }

    #[tokio::test]
    async fn test_tempdir_raii_cleanup_on_error() {
        let temp_path = unique_temp_path("raii");

        {
            let _guard = TempDir::new(temp_path.clone()).await.unwrap();
            assert!(temp_path.exists());
        }

        // Removal may be scheduled as a background task: poll with a bounded
        // budget (about one second) instead of relying on one fixed sleep.
        for _ in 0..50 {
            if !temp_path.exists() {
                break;
            }
            tokio::time::sleep(Duration::from_millis(20)).await;
        }

        assert!(!temp_path.exists());
    }
}
|