kreuzberg 4.0.0.rc2 → 4.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +14 -14
- data/.rspec +3 -3
- data/.rubocop.yaml +1 -1
- data/.rubocop.yml +543 -538
- data/Gemfile +8 -8
- data/Gemfile.lock +194 -6
- data/README.md +396 -426
- data/Rakefile +34 -25
- data/Steepfile +51 -47
- data/examples/async_patterns.rb +283 -341
- data/ext/kreuzberg_rb/extconf.rb +65 -45
- data/ext/kreuzberg_rb/native/.cargo/config.toml +23 -0
- data/ext/kreuzberg_rb/native/Cargo.lock +7619 -6535
- data/ext/kreuzberg_rb/native/Cargo.toml +75 -44
- data/ext/kreuzberg_rb/native/README.md +425 -425
- data/ext/kreuzberg_rb/native/build.rs +15 -15
- data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -11
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -14
- data/ext/kreuzberg_rb/native/include/strings.h +20 -20
- data/ext/kreuzberg_rb/native/include/unistd.h +47 -47
- data/ext/kreuzberg_rb/native/src/lib.rs +3802 -2998
- data/extconf.rb +60 -28
- data/kreuzberg.gemspec +199 -148
- data/lib/kreuzberg/api_proxy.rb +126 -142
- data/lib/kreuzberg/cache_api.rb +67 -46
- data/lib/kreuzberg/cli.rb +47 -55
- data/lib/kreuzberg/cli_proxy.rb +117 -127
- data/lib/kreuzberg/config.rb +936 -691
- data/lib/kreuzberg/error_context.rb +136 -32
- data/lib/kreuzberg/errors.rb +116 -118
- data/lib/kreuzberg/extraction_api.rb +313 -85
- data/lib/kreuzberg/mcp_proxy.rb +177 -186
- data/lib/kreuzberg/ocr_backend_protocol.rb +40 -113
- data/lib/kreuzberg/post_processor_protocol.rb +15 -86
- data/lib/kreuzberg/result.rb +334 -216
- data/lib/kreuzberg/setup_lib_path.rb +99 -80
- data/lib/kreuzberg/types.rb +170 -0
- data/lib/kreuzberg/validator_protocol.rb +16 -89
- data/lib/kreuzberg/version.rb +5 -5
- data/lib/kreuzberg.rb +96 -103
- data/lib/libpdfium.so +0 -0
- data/sig/kreuzberg/internal.rbs +184 -184
- data/sig/kreuzberg.rbs +561 -520
- data/spec/binding/async_operations_spec.rb +473 -0
- data/spec/binding/batch_operations_spec.rb +595 -0
- data/spec/binding/batch_spec.rb +359 -0
- data/spec/binding/cache_spec.rb +227 -227
- data/spec/binding/cli_proxy_spec.rb +85 -85
- data/spec/binding/cli_spec.rb +55 -55
- data/spec/binding/config_result_spec.rb +377 -0
- data/spec/binding/config_spec.rb +419 -345
- data/spec/binding/config_validation_spec.rb +377 -283
- data/spec/binding/embeddings_spec.rb +816 -0
- data/spec/binding/error_handling_spec.rb +399 -213
- data/spec/binding/error_recovery_spec.rb +488 -0
- data/spec/binding/errors_spec.rb +66 -66
- data/spec/binding/font_config_spec.rb +220 -0
- data/spec/binding/images_spec.rb +738 -0
- data/spec/binding/keywords_extraction_spec.rb +600 -0
- data/spec/binding/metadata_types_spec.rb +1228 -0
- data/spec/binding/pages_extraction_spec.rb +471 -0
- data/spec/binding/plugins/ocr_backend_spec.rb +307 -307
- data/spec/binding/plugins/postprocessor_spec.rb +269 -269
- data/spec/binding/plugins/validator_spec.rb +273 -274
- data/spec/binding/tables_spec.rb +641 -0
- data/spec/fixtures/config.toml +38 -39
- data/spec/fixtures/config.yaml +41 -41
- data/spec/fixtures/invalid_config.toml +3 -4
- data/spec/smoke/package_spec.rb +177 -178
- data/spec/spec_helper.rb +40 -42
- data/spec/unit/config/chunking_config_spec.rb +213 -0
- data/spec/unit/config/embedding_config_spec.rb +343 -0
- data/spec/unit/config/extraction_config_spec.rb +438 -0
- data/spec/unit/config/font_config_spec.rb +285 -0
- data/spec/unit/config/hierarchy_config_spec.rb +314 -0
- data/spec/unit/config/image_extraction_config_spec.rb +209 -0
- data/spec/unit/config/image_preprocessing_config_spec.rb +249 -0
- data/spec/unit/config/keyword_config_spec.rb +229 -0
- data/spec/unit/config/language_detection_config_spec.rb +258 -0
- data/spec/unit/config/ocr_config_spec.rb +171 -0
- data/spec/unit/config/page_config_spec.rb +221 -0
- data/spec/unit/config/pdf_config_spec.rb +267 -0
- data/spec/unit/config/postprocessor_config_spec.rb +290 -0
- data/spec/unit/config/tesseract_config_spec.rb +181 -0
- data/spec/unit/config/token_reduction_config_spec.rb +251 -0
- data/test/metadata_types_test.rb +959 -0
- data/vendor/Cargo.toml +61 -0
- data/vendor/kreuzberg/Cargo.toml +259 -204
- data/vendor/kreuzberg/README.md +263 -175
- data/vendor/kreuzberg/build.rs +782 -474
- data/vendor/kreuzberg/examples/bench_fixes.rs +71 -0
- data/vendor/kreuzberg/examples/test_pdfium_fork.rs +62 -0
- data/vendor/kreuzberg/src/api/error.rs +81 -81
- data/vendor/kreuzberg/src/api/handlers.rs +320 -199
- data/vendor/kreuzberg/src/api/mod.rs +94 -79
- data/vendor/kreuzberg/src/api/server.rs +518 -353
- data/vendor/kreuzberg/src/api/types.rs +206 -170
- data/vendor/kreuzberg/src/cache/mod.rs +1167 -1167
- data/vendor/kreuzberg/src/chunking/mod.rs +2303 -677
- data/vendor/kreuzberg/src/chunking/processor.rs +219 -0
- data/vendor/kreuzberg/src/core/batch_mode.rs +95 -95
- data/vendor/kreuzberg/src/core/batch_optimizations.rs +385 -0
- data/vendor/kreuzberg/src/core/config.rs +1914 -1032
- data/vendor/kreuzberg/src/core/config_validation.rs +949 -0
- data/vendor/kreuzberg/src/core/extractor.rs +1200 -1024
- data/vendor/kreuzberg/src/core/formats.rs +235 -0
- data/vendor/kreuzberg/src/core/io.rs +329 -329
- data/vendor/kreuzberg/src/core/mime.rs +605 -605
- data/vendor/kreuzberg/src/core/mod.rs +61 -45
- data/vendor/kreuzberg/src/core/pipeline.rs +1223 -984
- data/vendor/kreuzberg/src/core/server_config.rs +1220 -0
- data/vendor/kreuzberg/src/embeddings.rs +471 -432
- data/vendor/kreuzberg/src/error.rs +431 -431
- data/vendor/kreuzberg/src/extraction/archive.rs +959 -954
- data/vendor/kreuzberg/src/extraction/capacity.rs +263 -0
- data/vendor/kreuzberg/src/extraction/docx.rs +404 -40
- data/vendor/kreuzberg/src/extraction/email.rs +855 -854
- data/vendor/kreuzberg/src/extraction/excel.rs +697 -688
- data/vendor/kreuzberg/src/extraction/html.rs +1830 -553
- data/vendor/kreuzberg/src/extraction/image.rs +492 -368
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +574 -563
- data/vendor/kreuzberg/src/extraction/markdown.rs +216 -213
- data/vendor/kreuzberg/src/extraction/mod.rs +93 -81
- data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -398
- data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -247
- data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -240
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -130
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +284 -287
- data/vendor/kreuzberg/src/extraction/pptx.rs +3102 -3000
- data/vendor/kreuzberg/src/extraction/structured.rs +491 -490
- data/vendor/kreuzberg/src/extraction/table.rs +329 -328
- data/vendor/kreuzberg/src/extraction/text.rs +277 -269
- data/vendor/kreuzberg/src/extraction/xml.rs +333 -333
- data/vendor/kreuzberg/src/extractors/archive.rs +447 -446
- data/vendor/kreuzberg/src/extractors/bibtex.rs +470 -469
- data/vendor/kreuzberg/src/extractors/docbook.rs +504 -502
- data/vendor/kreuzberg/src/extractors/docx.rs +400 -367
- data/vendor/kreuzberg/src/extractors/email.rs +157 -143
- data/vendor/kreuzberg/src/extractors/epub.rs +696 -707
- data/vendor/kreuzberg/src/extractors/excel.rs +385 -343
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +492 -491
- data/vendor/kreuzberg/src/extractors/html.rs +419 -393
- data/vendor/kreuzberg/src/extractors/image.rs +219 -198
- data/vendor/kreuzberg/src/extractors/jats.rs +1054 -1051
- data/vendor/kreuzberg/src/extractors/jupyter.rs +368 -367
- data/vendor/kreuzberg/src/extractors/latex.rs +653 -652
- data/vendor/kreuzberg/src/extractors/markdown.rs +701 -700
- data/vendor/kreuzberg/src/extractors/mod.rs +429 -365
- data/vendor/kreuzberg/src/extractors/odt.rs +628 -628
- data/vendor/kreuzberg/src/extractors/opml.rs +635 -634
- data/vendor/kreuzberg/src/extractors/orgmode.rs +529 -528
- data/vendor/kreuzberg/src/extractors/pdf.rs +761 -493
- data/vendor/kreuzberg/src/extractors/pptx.rs +279 -248
- data/vendor/kreuzberg/src/extractors/rst.rs +577 -576
- data/vendor/kreuzberg/src/extractors/rtf.rs +809 -810
- data/vendor/kreuzberg/src/extractors/security.rs +484 -484
- data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -367
- data/vendor/kreuzberg/src/extractors/structured.rs +142 -140
- data/vendor/kreuzberg/src/extractors/text.rs +265 -260
- data/vendor/kreuzberg/src/extractors/typst.rs +651 -650
- data/vendor/kreuzberg/src/extractors/xml.rs +147 -135
- data/vendor/kreuzberg/src/image/dpi.rs +164 -164
- data/vendor/kreuzberg/src/image/mod.rs +6 -6
- data/vendor/kreuzberg/src/image/preprocessing.rs +417 -417
- data/vendor/kreuzberg/src/image/resize.rs +89 -89
- data/vendor/kreuzberg/src/keywords/config.rs +154 -154
- data/vendor/kreuzberg/src/keywords/mod.rs +237 -237
- data/vendor/kreuzberg/src/keywords/processor.rs +275 -267
- data/vendor/kreuzberg/src/keywords/rake.rs +293 -293
- data/vendor/kreuzberg/src/keywords/types.rs +68 -68
- data/vendor/kreuzberg/src/keywords/yake.rs +163 -163
- data/vendor/kreuzberg/src/language_detection/mod.rs +985 -942
- data/vendor/kreuzberg/src/language_detection/processor.rs +218 -0
- data/vendor/kreuzberg/src/lib.rs +114 -105
- data/vendor/kreuzberg/src/mcp/mod.rs +35 -32
- data/vendor/kreuzberg/src/mcp/server.rs +2090 -1968
- data/vendor/kreuzberg/src/ocr/cache.rs +469 -469
- data/vendor/kreuzberg/src/ocr/error.rs +37 -37
- data/vendor/kreuzberg/src/ocr/hocr.rs +216 -216
- data/vendor/kreuzberg/src/ocr/language_registry.rs +520 -0
- data/vendor/kreuzberg/src/ocr/mod.rs +60 -58
- data/vendor/kreuzberg/src/ocr/processor.rs +858 -863
- data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -4
- data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -144
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +456 -450
- data/vendor/kreuzberg/src/ocr/types.rs +393 -393
- data/vendor/kreuzberg/src/ocr/utils.rs +47 -47
- data/vendor/kreuzberg/src/ocr/validation.rs +206 -206
- data/vendor/kreuzberg/src/panic_context.rs +154 -154
- data/vendor/kreuzberg/src/pdf/bindings.rs +306 -0
- data/vendor/kreuzberg/src/pdf/bundled.rs +408 -0
- data/vendor/kreuzberg/src/pdf/error.rs +214 -122
- data/vendor/kreuzberg/src/pdf/fonts.rs +358 -0
- data/vendor/kreuzberg/src/pdf/hierarchy.rs +903 -0
- data/vendor/kreuzberg/src/pdf/images.rs +139 -139
- data/vendor/kreuzberg/src/pdf/metadata.rs +509 -346
- data/vendor/kreuzberg/src/pdf/mod.rs +81 -50
- data/vendor/kreuzberg/src/pdf/rendering.rs +369 -369
- data/vendor/kreuzberg/src/pdf/table.rs +417 -393
- data/vendor/kreuzberg/src/pdf/text.rs +553 -158
- data/vendor/kreuzberg/src/plugins/extractor.rs +1042 -1013
- data/vendor/kreuzberg/src/plugins/mod.rs +212 -209
- data/vendor/kreuzberg/src/plugins/ocr.rs +637 -620
- data/vendor/kreuzberg/src/plugins/processor.rs +650 -642
- data/vendor/kreuzberg/src/plugins/registry.rs +1339 -1337
- data/vendor/kreuzberg/src/plugins/traits.rs +258 -258
- data/vendor/kreuzberg/src/plugins/validator.rs +967 -956
- data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -1470
- data/vendor/kreuzberg/src/text/mod.rs +27 -19
- data/vendor/kreuzberg/src/text/quality.rs +710 -697
- data/vendor/kreuzberg/src/text/quality_processor.rs +231 -0
- data/vendor/kreuzberg/src/text/string_utils.rs +229 -217
- data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -164
- data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -100
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +832 -796
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +923 -902
- data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -160
- data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -619
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +148 -147
- data/vendor/kreuzberg/src/text/utf8_validation.rs +193 -0
- data/vendor/kreuzberg/src/types.rs +1713 -903
- data/vendor/kreuzberg/src/utils/mod.rs +31 -17
- data/vendor/kreuzberg/src/utils/pool.rs +503 -0
- data/vendor/kreuzberg/src/utils/pool_sizing.rs +364 -0
- data/vendor/kreuzberg/src/utils/quality.rs +968 -959
- data/vendor/kreuzberg/src/utils/string_pool.rs +761 -0
- data/vendor/kreuzberg/src/utils/string_utils.rs +381 -381
- data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -53
- data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -482
- data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -261
- data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -400
- data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -1205
- data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -280
- data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -425
- data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -172
- data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -622
- data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -1300
- data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -175
- data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -734
- data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -37
- data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -100
- data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -801
- data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -693
- data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -111
- data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -162
- data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -226
- data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -41
- data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -196
- data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -227
- data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -181
- data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -791
- data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -47
- data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -760
- data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -634
- data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -136
- data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -84
- data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -681
- data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -64
- data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -51
- data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -476
- data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -163
- data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -1
- data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -101
- data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -477
- data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -490
- data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -415
- data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -223
- data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -331
- data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -562
- data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -436
- data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -561
- data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -193
- data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -448
- data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -32
- data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -33
- data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -76
- data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -129
- data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -54
- data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -118
- data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -149
- data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -506
- data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -75
- data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -519
- data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -647
- data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -62
- data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -796
- data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -31
- data/vendor/kreuzberg/tests/api_embed.rs +360 -0
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -52
- data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +471 -0
- data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +289 -0
- data/vendor/kreuzberg/tests/api_tests.rs +1472 -966
- data/vendor/kreuzberg/tests/archive_integration.rs +545 -543
- data/vendor/kreuzberg/tests/batch_orchestration.rs +587 -556
- data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +154 -0
- data/vendor/kreuzberg/tests/batch_processing.rs +328 -316
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -421
- data/vendor/kreuzberg/tests/concurrency_stress.rs +541 -525
- data/vendor/kreuzberg/tests/config_features.rs +612 -598
- data/vendor/kreuzberg/tests/config_integration_test.rs +753 -0
- data/vendor/kreuzberg/tests/config_loading_tests.rs +416 -415
- data/vendor/kreuzberg/tests/core_integration.rs +519 -510
- data/vendor/kreuzberg/tests/csv_integration.rs +414 -414
- data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +294 -0
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +500 -498
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -122
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -370
- data/vendor/kreuzberg/tests/email_integration.rs +327 -325
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -275
- data/vendor/kreuzberg/tests/error_handling.rs +402 -393
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -228
- data/vendor/kreuzberg/tests/format_integration.rs +165 -159
- data/vendor/kreuzberg/tests/helpers/mod.rs +202 -142
- data/vendor/kreuzberg/tests/html_table_test.rs +551 -551
- data/vendor/kreuzberg/tests/image_integration.rs +255 -253
- data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -139
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -639
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -704
- data/vendor/kreuzberg/tests/keywords_integration.rs +479 -479
- data/vendor/kreuzberg/tests/keywords_quality.rs +509 -509
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -496
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -490
- data/vendor/kreuzberg/tests/mime_detection.rs +429 -428
- data/vendor/kreuzberg/tests/ocr_configuration.rs +514 -510
- data/vendor/kreuzberg/tests/ocr_errors.rs +698 -676
- data/vendor/kreuzberg/tests/ocr_language_registry.rs +191 -0
- data/vendor/kreuzberg/tests/ocr_quality.rs +629 -627
- data/vendor/kreuzberg/tests/ocr_stress.rs +469 -469
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +674 -695
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -616
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -822
- data/vendor/kreuzberg/tests/page_markers.rs +297 -0
- data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +301 -0
- data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +589 -0
- data/vendor/kreuzberg/tests/pdf_integration.rs +45 -43
- data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +301 -0
- data/vendor/kreuzberg/tests/pdf_text_merging.rs +475 -0
- data/vendor/kreuzberg/tests/pdfium_linking.rs +340 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +1446 -1411
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +776 -771
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +577 -560
- data/vendor/kreuzberg/tests/plugin_system.rs +927 -921
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -783
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +587 -586
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +694 -692
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +775 -776
- data/vendor/kreuzberg/tests/security_validation.rs +416 -415
- data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -888
- data/vendor/kreuzberg/tests/test_fastembed.rs +631 -609
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1260 -1259
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +648 -647
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -87
- data/vendor/kreuzberg-ffi/Cargo.toml +67 -0
- data/vendor/kreuzberg-ffi/README.md +851 -0
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +227 -0
- data/vendor/kreuzberg-ffi/build.rs +168 -0
- data/vendor/kreuzberg-ffi/cbindgen.toml +37 -0
- data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +12 -0
- data/vendor/kreuzberg-ffi/kreuzberg.h +3012 -0
- data/vendor/kreuzberg-ffi/src/batch_streaming.rs +588 -0
- data/vendor/kreuzberg-ffi/src/config.rs +1341 -0
- data/vendor/kreuzberg-ffi/src/error.rs +901 -0
- data/vendor/kreuzberg-ffi/src/extraction.rs +555 -0
- data/vendor/kreuzberg-ffi/src/helpers.rs +879 -0
- data/vendor/kreuzberg-ffi/src/lib.rs +977 -0
- data/vendor/kreuzberg-ffi/src/memory.rs +493 -0
- data/vendor/kreuzberg-ffi/src/mime.rs +329 -0
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +265 -0
- data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +442 -0
- data/vendor/kreuzberg-ffi/src/plugins/mod.rs +14 -0
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +628 -0
- data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +438 -0
- data/vendor/kreuzberg-ffi/src/plugins/validator.rs +329 -0
- data/vendor/kreuzberg-ffi/src/result.rs +510 -0
- data/vendor/kreuzberg-ffi/src/result_pool.rs +639 -0
- data/vendor/kreuzberg-ffi/src/result_view.rs +773 -0
- data/vendor/kreuzberg-ffi/src/string_intern.rs +568 -0
- data/vendor/kreuzberg-ffi/src/types.rs +363 -0
- data/vendor/kreuzberg-ffi/src/util.rs +210 -0
- data/vendor/kreuzberg-ffi/src/validation.rs +848 -0
- data/vendor/kreuzberg-ffi/tests.disabled/README.md +48 -0
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +299 -0
- data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +346 -0
- data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +232 -0
- data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +470 -0
- data/vendor/kreuzberg-tesseract/.commitlintrc.json +13 -0
- data/vendor/kreuzberg-tesseract/.crate-ignore +2 -0
- data/vendor/kreuzberg-tesseract/Cargo.lock +2933 -0
- data/vendor/kreuzberg-tesseract/Cargo.toml +57 -0
- data/vendor/{rb-sys/LICENSE-MIT → kreuzberg-tesseract/LICENSE} +22 -21
- data/vendor/kreuzberg-tesseract/README.md +399 -0
- data/vendor/kreuzberg-tesseract/build.rs +1127 -0
- data/vendor/kreuzberg-tesseract/patches/README.md +71 -0
- data/vendor/kreuzberg-tesseract/patches/tesseract.diff +199 -0
- data/vendor/kreuzberg-tesseract/src/api.rs +1371 -0
- data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +77 -0
- data/vendor/kreuzberg-tesseract/src/enums.rs +297 -0
- data/vendor/kreuzberg-tesseract/src/error.rs +81 -0
- data/vendor/kreuzberg-tesseract/src/lib.rs +145 -0
- data/vendor/kreuzberg-tesseract/src/monitor.rs +57 -0
- data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +197 -0
- data/vendor/kreuzberg-tesseract/src/page_iterator.rs +253 -0
- data/vendor/kreuzberg-tesseract/src/result_iterator.rs +286 -0
- data/vendor/kreuzberg-tesseract/src/result_renderer.rs +183 -0
- data/vendor/kreuzberg-tesseract/tests/integration_test.rs +211 -0
- metadata +196 -45
- data/vendor/kreuzberg/benches/otel_overhead.rs +0 -48
- data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +0 -738
- data/vendor/rb-sys/.cargo-ok +0 -1
- data/vendor/rb-sys/.cargo_vcs_info.json +0 -6
- data/vendor/rb-sys/Cargo.lock +0 -393
- data/vendor/rb-sys/Cargo.toml +0 -70
- data/vendor/rb-sys/Cargo.toml.orig +0 -57
- data/vendor/rb-sys/LICENSE-APACHE +0 -190
- data/vendor/rb-sys/bin/release.sh +0 -21
- data/vendor/rb-sys/build/features.rs +0 -108
- data/vendor/rb-sys/build/main.rs +0 -246
- data/vendor/rb-sys/build/stable_api_config.rs +0 -153
- data/vendor/rb-sys/build/version.rs +0 -48
- data/vendor/rb-sys/readme.md +0 -36
- data/vendor/rb-sys/src/bindings.rs +0 -21
- data/vendor/rb-sys/src/hidden.rs +0 -11
- data/vendor/rb-sys/src/lib.rs +0 -34
- data/vendor/rb-sys/src/macros.rs +0 -371
- data/vendor/rb-sys/src/memory.rs +0 -53
- data/vendor/rb-sys/src/ruby_abi_version.rs +0 -38
- data/vendor/rb-sys/src/special_consts.rs +0 -31
- data/vendor/rb-sys/src/stable_api/compiled.c +0 -179
- data/vendor/rb-sys/src/stable_api/compiled.rs +0 -257
- data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +0 -316
- data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +0 -316
- data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +0 -324
- data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +0 -317
- data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +0 -315
- data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +0 -326
- data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +0 -327
- data/vendor/rb-sys/src/stable_api.rs +0 -261
- data/vendor/rb-sys/src/symbol.rs +0 -31
- data/vendor/rb-sys/src/tracking_allocator.rs +0 -332
- data/vendor/rb-sys/src/utils.rs +0 -89
- data/vendor/rb-sys/src/value_type.rs +0 -7
data/vendor/kreuzberg/build.rs
CHANGED
|
@@ -1,474 +1,782 @@
|
|
|
1
|
-
use std::env;
|
|
2
|
-
use std::fs;
|
|
3
|
-
use std::io;
|
|
4
|
-
use std::path::{Path, PathBuf};
|
|
5
|
-
use std::process::Command;
|
|
6
|
-
use std::thread;
|
|
7
|
-
use std::time::Duration;
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
}
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
}
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
}
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
("
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
.
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
.
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
} else {
|
|
277
|
-
|
|
278
|
-
}
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
}
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
if
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
"
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
);
|
|
411
|
-
}
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
}
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
.
|
|
459
|
-
|
|
460
|
-
.
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
|
|
1
|
+
use std::env;
|
|
2
|
+
use std::fs;
|
|
3
|
+
use std::io;
|
|
4
|
+
use std::path::{Path, PathBuf};
|
|
5
|
+
use std::process::Command;
|
|
6
|
+
use std::thread;
|
|
7
|
+
use std::time::Duration;
|
|
8
|
+
|
|
9
|
+
/// PDFium linking strategy
|
|
10
|
+
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
|
11
|
+
enum PdfiumLinkStrategy {
|
|
12
|
+
/// Download and link statically (static-pdfium feature)
|
|
13
|
+
DownloadStatic,
|
|
14
|
+
/// Download, link dynamically, and embed in binary (bundled-pdfium feature)
|
|
15
|
+
Bundled,
|
|
16
|
+
/// Use system-installed pdfium via pkg-config (system-pdfium feature)
|
|
17
|
+
System,
|
|
18
|
+
}
|
|
19
|
+
|
|
20
|
+
fn main() {
|
|
21
|
+
let target = env::var("TARGET").unwrap();
|
|
22
|
+
let out_dir = PathBuf::from(env::var("OUT_DIR").unwrap());
|
|
23
|
+
|
|
24
|
+
println!("cargo::rustc-check-cfg=cfg(coverage)");
|
|
25
|
+
|
|
26
|
+
if !cfg!(feature = "pdf") {
|
|
27
|
+
tracing::debug!("PDF feature not enabled, skipping pdfium linking");
|
|
28
|
+
return;
|
|
29
|
+
}
|
|
30
|
+
|
|
31
|
+
let strategy = determine_link_strategy(&target);
|
|
32
|
+
|
|
33
|
+
tracing::debug!("Using PDFium linking strategy: {:?}", strategy);
|
|
34
|
+
|
|
35
|
+
match strategy {
|
|
36
|
+
PdfiumLinkStrategy::DownloadStatic => {
|
|
37
|
+
let pdfium_dir = download_or_use_prebuilt(&target, &out_dir);
|
|
38
|
+
link_statically(&pdfium_dir, &target);
|
|
39
|
+
}
|
|
40
|
+
PdfiumLinkStrategy::Bundled => {
|
|
41
|
+
let pdfium_dir = download_or_use_prebuilt(&target, &out_dir);
|
|
42
|
+
link_bundled(&pdfium_dir, &target, &out_dir);
|
|
43
|
+
}
|
|
44
|
+
PdfiumLinkStrategy::System => {
|
|
45
|
+
link_system(&target);
|
|
46
|
+
}
|
|
47
|
+
}
|
|
48
|
+
|
|
49
|
+
link_system_frameworks(&target);
|
|
50
|
+
println!("cargo:rerun-if-changed=build.rs");
|
|
51
|
+
}
|
|
52
|
+
|
|
53
|
+
/// Determine which linking strategy to use based on features and target
|
|
54
|
+
fn determine_link_strategy(target: &str) -> PdfiumLinkStrategy {
|
|
55
|
+
if target.contains("wasm") {
|
|
56
|
+
if let Ok(wasm_lib) = env::var("PDFIUM_WASM_LIB") {
|
|
57
|
+
println!("cargo:rustc-link-search=native={}", wasm_lib);
|
|
58
|
+
println!("cargo:rustc-link-lib=static=pdfium");
|
|
59
|
+
return PdfiumLinkStrategy::DownloadStatic;
|
|
60
|
+
}
|
|
61
|
+
println!("cargo:warning=WASM build using bundled PDFium (set PDFIUM_WASM_LIB to link custom WASM PDFium)");
|
|
62
|
+
return PdfiumLinkStrategy::Bundled;
|
|
63
|
+
}
|
|
64
|
+
|
|
65
|
+
let system_pdfium = cfg!(feature = "system-pdfium");
|
|
66
|
+
let bundled_pdfium = cfg!(feature = "bundled-pdfium");
|
|
67
|
+
let static_pdfium = cfg!(feature = "static-pdfium");
|
|
68
|
+
|
|
69
|
+
let enabled_count = usize::from(system_pdfium) + usize::from(bundled_pdfium) + usize::from(static_pdfium);
|
|
70
|
+
if enabled_count > 1 {
|
|
71
|
+
println!(
|
|
72
|
+
"cargo:warning=Multiple PDFium linking strategies enabled (static-pdfium={}, bundled-pdfium={}, system-pdfium={}); using bundled-pdfium for this build",
|
|
73
|
+
static_pdfium, bundled_pdfium, system_pdfium
|
|
74
|
+
);
|
|
75
|
+
}
|
|
76
|
+
|
|
77
|
+
if bundled_pdfium {
|
|
78
|
+
return PdfiumLinkStrategy::Bundled;
|
|
79
|
+
}
|
|
80
|
+
if system_pdfium {
|
|
81
|
+
return PdfiumLinkStrategy::System;
|
|
82
|
+
}
|
|
83
|
+
if static_pdfium {
|
|
84
|
+
return PdfiumLinkStrategy::DownloadStatic;
|
|
85
|
+
}
|
|
86
|
+
|
|
87
|
+
PdfiumLinkStrategy::Bundled
|
|
88
|
+
}
|
|
89
|
+
|
|
90
|
+
/// Download PDFium or use prebuilt directory
|
|
91
|
+
///
|
|
92
|
+
/// This is the main orchestrator function that:
|
|
93
|
+
/// 1. Checks for `KREUZBERG_PDFIUM_PREBUILT` environment variable
|
|
94
|
+
/// 2. If set and valid, uses prebuilt pdfium directory
|
|
95
|
+
/// 3. If not set, downloads pdfium to out_dir (with caching)
|
|
96
|
+
/// 4. Returns PathBuf to pdfium directory
|
|
97
|
+
///
|
|
98
|
+
/// Reuses all existing helper functions:
|
|
99
|
+
/// - `get_pdfium_url_and_lib()` - determines download URL for target
|
|
100
|
+
/// - `download_and_extract_pdfium()` - downloads with retry logic
|
|
101
|
+
/// - `runtime_library_info()` - platform-specific library names
|
|
102
|
+
/// - `prepare_prebuilt_pdfium()` - handles prebuilt copy
|
|
103
|
+
fn download_or_use_prebuilt(target: &str, out_dir: &Path) -> PathBuf {
|
|
104
|
+
let (download_url, _lib_name) = get_pdfium_url_and_lib(target);
|
|
105
|
+
let pdfium_dir = out_dir.join("pdfium");
|
|
106
|
+
|
|
107
|
+
if let Some(prebuilt) = env::var_os("KREUZBERG_PDFIUM_PREBUILT") {
|
|
108
|
+
let prebuilt_path = PathBuf::from(prebuilt);
|
|
109
|
+
if prebuilt_path.exists() {
|
|
110
|
+
prepare_prebuilt_pdfium(&prebuilt_path, &pdfium_dir)
|
|
111
|
+
.unwrap_or_else(|err| panic!("Failed to copy Pdfium from {}: {}", prebuilt_path.display(), err));
|
|
112
|
+
if target.contains("windows") {
|
|
113
|
+
ensure_windows_import_library(&pdfium_dir);
|
|
114
|
+
}
|
|
115
|
+
return pdfium_dir;
|
|
116
|
+
} else {
|
|
117
|
+
panic!(
|
|
118
|
+
"Environment variable KREUZBERG_PDFIUM_PREBUILT points to '{}' but the directory does not exist",
|
|
119
|
+
prebuilt_path.display()
|
|
120
|
+
);
|
|
121
|
+
}
|
|
122
|
+
}
|
|
123
|
+
|
|
124
|
+
let (runtime_lib_name, runtime_subdir) = runtime_library_info(target);
|
|
125
|
+
let lib_found = find_pdfium_library(&pdfium_dir, &runtime_lib_name, runtime_subdir).is_ok();
|
|
126
|
+
|
|
127
|
+
let import_lib_exists = if target.contains("windows") {
|
|
128
|
+
let lib_dir = pdfium_dir.join("lib");
|
|
129
|
+
lib_dir.join("pdfium.lib").exists() || lib_dir.join("pdfium.dll.lib").exists()
|
|
130
|
+
} else {
|
|
131
|
+
true
|
|
132
|
+
};
|
|
133
|
+
|
|
134
|
+
if !lib_found || !import_lib_exists {
|
|
135
|
+
tracing::debug!("Pdfium library not found, downloading for target: {}", target);
|
|
136
|
+
tracing::debug!("Download URL: {}", download_url);
|
|
137
|
+
download_and_extract_pdfium(&download_url, &pdfium_dir);
|
|
138
|
+
} else {
|
|
139
|
+
tracing::debug!("Pdfium library already cached at {}", pdfium_dir.display());
|
|
140
|
+
}
|
|
141
|
+
|
|
142
|
+
if target.contains("windows") {
|
|
143
|
+
ensure_windows_import_library(&pdfium_dir);
|
|
144
|
+
}
|
|
145
|
+
|
|
146
|
+
pdfium_dir
|
|
147
|
+
}
|
|
148
|
+
|
|
149
|
+
fn ensure_windows_import_library(pdfium_dir: &Path) {
|
|
150
|
+
let lib_dir = pdfium_dir.join("lib");
|
|
151
|
+
let dll_lib = lib_dir.join("pdfium.dll.lib");
|
|
152
|
+
let expected_lib = lib_dir.join("pdfium.lib");
|
|
153
|
+
|
|
154
|
+
if dll_lib.exists() && !expected_lib.exists() {
|
|
155
|
+
tracing::debug!(
|
|
156
|
+
"Ensuring Windows import library at {} (source: {})",
|
|
157
|
+
expected_lib.display(),
|
|
158
|
+
dll_lib.display()
|
|
159
|
+
);
|
|
160
|
+
fs::copy(&dll_lib, &expected_lib).unwrap_or_else(|err| {
|
|
161
|
+
panic!(
|
|
162
|
+
"Failed to copy Windows import library from {} to {}: {}",
|
|
163
|
+
dll_lib.display(),
|
|
164
|
+
expected_lib.display(),
|
|
165
|
+
err
|
|
166
|
+
)
|
|
167
|
+
});
|
|
168
|
+
}
|
|
169
|
+
}
|
|
170
|
+
|
|
171
|
+
/// Fetch the latest release version from a GitHub repository
|
|
172
|
+
///
|
|
173
|
+
/// Uses curl to query the GitHub API and extract the tag_name from the
|
|
174
|
+
/// latest release JSON response. Uses improved JSON parsing with fallback logic.
|
|
175
|
+
///
|
|
176
|
+
/// For WASM (paulocoutinhox/pdfium-lib), falls back to known stable versions.
|
|
177
|
+
/// For non-WASM (bblanchon/pdfium-binaries), uses a different fallback strategy.
|
|
178
|
+
fn get_latest_version(repo: &str) -> String {
|
|
179
|
+
let api_url = format!("https://api.github.com/repos/{}/releases/latest", repo);
|
|
180
|
+
|
|
181
|
+
let output = Command::new("curl").args(["-s", &api_url]).output();
|
|
182
|
+
|
|
183
|
+
if let Ok(output) = output
|
|
184
|
+
&& output.status.success()
|
|
185
|
+
{
|
|
186
|
+
let json = String::from_utf8_lossy(&output.stdout);
|
|
187
|
+
|
|
188
|
+
if let Some(tag) = extract_tag_from_json(&json) {
|
|
189
|
+
return tag;
|
|
190
|
+
}
|
|
191
|
+
}
|
|
192
|
+
|
|
193
|
+
if repo.contains("paulocoutinhox") {
|
|
194
|
+
eprintln!(
|
|
195
|
+
"cargo:warning=Failed to fetch latest PDFium WASM version from GitHub API, using fallback version 7442b"
|
|
196
|
+
);
|
|
197
|
+
"7442b".to_string()
|
|
198
|
+
} else if repo.contains("bblanchon") {
|
|
199
|
+
eprintln!(
|
|
200
|
+
"cargo:warning=Failed to fetch latest PDFium binaries version from GitHub API, using fallback version 7568"
|
|
201
|
+
);
|
|
202
|
+
"7568".to_string()
|
|
203
|
+
} else {
|
|
204
|
+
eprintln!(
|
|
205
|
+
"cargo:warning=Failed to fetch latest PDFium version from GitHub API (unknown repository: {})",
|
|
206
|
+
repo
|
|
207
|
+
);
|
|
208
|
+
String::new()
|
|
209
|
+
}
|
|
210
|
+
}
|
|
211
|
+
|
|
212
|
+
/// Extract `tag_name` from a GitHub API JSON response.
///
/// This is a minimal scanner, not a JSON parser: it locates the first
/// `"tag_name"` key, skips optional whitespace and the `:` separator, and
/// reads the string value up to the next `"` (escaped quotes are not
/// interpreted). Only the part after the last `/` is returned, so
/// `"chromium/7568"` yields `"7568"`.
///
/// Returns `None` when:
/// - the key is missing or not followed by `:`;
/// - the value is not a string (e.g. `null`) — the scanner must not pick up a
///   later key or value by mistake;
/// - the string is unterminated;
/// - the resulting tag is empty.
fn extract_tag_from_json(json: &str) -> Option<String> {
    const KEY: &str = "\"tag_name\"";

    let (_, after_key) = json.split_once(KEY)?;
    let value = after_key.trim_start().strip_prefix(':')?.trim_start();

    // The value must start with a quote right here; searching further ahead
    // would match whatever quoted token comes next in the document.
    let value = value.strip_prefix('"')?;
    let (raw_tag, _) = value.split_once('"')?;

    let tag = raw_tag.rsplit('/').next().unwrap_or(raw_tag);
    if tag.is_empty() {
        None
    } else {
        Some(tag.to_string())
    }
}
|
|
235
|
+
|
|
236
|
+
/// Get the download URL and library name for the target platform
|
|
237
|
+
///
|
|
238
|
+
/// Determines platform/architecture from target triple and constructs
|
|
239
|
+
/// the appropriate GitHub release download URL. Supports:
|
|
240
|
+
/// - WASM: paulocoutinhox/pdfium-lib
|
|
241
|
+
/// - Other platforms: bblanchon/pdfium-binaries
|
|
242
|
+
fn get_pdfium_url_and_lib(target: &str) -> (String, String) {
|
|
243
|
+
if target.contains("wasm") {
|
|
244
|
+
let version = env::var("PDFIUM_WASM_VERSION")
|
|
245
|
+
.ok()
|
|
246
|
+
.filter(|v| !v.is_empty())
|
|
247
|
+
.unwrap_or_else(|| get_latest_version("paulocoutinhox/pdfium-lib"));
|
|
248
|
+
tracing::debug!("Using pdfium-lib version: {}", version);
|
|
249
|
+
|
|
250
|
+
return (
|
|
251
|
+
format!(
|
|
252
|
+
"https://github.com/paulocoutinhox/pdfium-lib/releases/download/{}/wasm.tgz",
|
|
253
|
+
version
|
|
254
|
+
),
|
|
255
|
+
"pdfium".to_string(),
|
|
256
|
+
);
|
|
257
|
+
}
|
|
258
|
+
|
|
259
|
+
let (platform, arch) = if target.contains("darwin") {
|
|
260
|
+
let arch = if target.contains("aarch64") { "arm64" } else { "x64" };
|
|
261
|
+
("mac", arch)
|
|
262
|
+
} else if target.contains("linux") {
|
|
263
|
+
let arch = if target.contains("aarch64") {
|
|
264
|
+
"arm64"
|
|
265
|
+
} else if target.contains("arm") {
|
|
266
|
+
"arm"
|
|
267
|
+
} else {
|
|
268
|
+
"x64"
|
|
269
|
+
};
|
|
270
|
+
("linux", arch)
|
|
271
|
+
} else if target.contains("windows") {
|
|
272
|
+
let arch = if target.contains("aarch64") {
|
|
273
|
+
"arm64"
|
|
274
|
+
} else if target.contains("i686") {
|
|
275
|
+
"x86"
|
|
276
|
+
} else {
|
|
277
|
+
"x64"
|
|
278
|
+
};
|
|
279
|
+
("win", arch)
|
|
280
|
+
} else {
|
|
281
|
+
panic!("Unsupported target platform: {}", target);
|
|
282
|
+
};
|
|
283
|
+
|
|
284
|
+
let version = env::var("PDFIUM_VERSION")
|
|
285
|
+
.ok()
|
|
286
|
+
.filter(|v| !v.is_empty())
|
|
287
|
+
.unwrap_or_else(|| get_latest_version("bblanchon/pdfium-binaries"));
|
|
288
|
+
tracing::debug!("Using pdfium-binaries version: {}", version);
|
|
289
|
+
|
|
290
|
+
let url = format!(
|
|
291
|
+
"https://github.com/bblanchon/pdfium-binaries/releases/download/chromium/{}/pdfium-{}-{}.tgz",
|
|
292
|
+
version, platform, arch
|
|
293
|
+
);
|
|
294
|
+
|
|
295
|
+
(url, "pdfium".to_string())
|
|
296
|
+
}
|
|
297
|
+
|
|
298
|
+
/// Download and extract PDFium archive with retry logic
|
|
299
|
+
///
|
|
300
|
+
/// Features:
|
|
301
|
+
/// - Exponential backoff retry (configurable via env vars)
|
|
302
|
+
/// - File type validation (gzip check)
|
|
303
|
+
/// - Windows-specific import library handling (pdfium.dll.lib -> pdfium.lib)
|
|
304
|
+
/// - Environment variables:
|
|
305
|
+
/// - KREUZBERG_PDFIUM_DOWNLOAD_RETRIES: number of retries (default: 5)
|
|
306
|
+
/// - KREUZBERG_PDFIUM_DOWNLOAD_BACKOFF_SECS: initial backoff in seconds (default: 2)
|
|
307
|
+
fn download_and_extract_pdfium(url: &str, dest_dir: &Path) {
|
|
308
|
+
fs::create_dir_all(dest_dir).expect("Failed to create pdfium directory");
|
|
309
|
+
|
|
310
|
+
let archive_path = dest_dir.join("pdfium.tar.gz");
|
|
311
|
+
let retries = env::var("KREUZBERG_PDFIUM_DOWNLOAD_RETRIES")
|
|
312
|
+
.ok()
|
|
313
|
+
.and_then(|value| value.parse::<u32>().ok())
|
|
314
|
+
.filter(|value| *value > 0)
|
|
315
|
+
.unwrap_or(5);
|
|
316
|
+
let base_delay = env::var("KREUZBERG_PDFIUM_DOWNLOAD_BACKOFF_SECS")
|
|
317
|
+
.ok()
|
|
318
|
+
.and_then(|value| value.parse::<u64>().ok())
|
|
319
|
+
.filter(|value| *value > 0)
|
|
320
|
+
.unwrap_or(2);
|
|
321
|
+
|
|
322
|
+
let archive_path_str = archive_path
|
|
323
|
+
.to_str()
|
|
324
|
+
.unwrap_or_else(|| panic!("Non-UTF8 path for archive: {}", archive_path.display()));
|
|
325
|
+
let mut last_error = String::new();
|
|
326
|
+
|
|
327
|
+
for attempt in 1..=retries {
|
|
328
|
+
let _ = fs::remove_file(&archive_path);
|
|
329
|
+
tracing::debug!(
|
|
330
|
+
"Downloading Pdfium archive from: {} (attempt {}/{})",
|
|
331
|
+
url,
|
|
332
|
+
attempt,
|
|
333
|
+
retries
|
|
334
|
+
);
|
|
335
|
+
|
|
336
|
+
let status = Command::new("curl")
|
|
337
|
+
.args(["-f", "-L", "-o", archive_path_str, url])
|
|
338
|
+
.status();
|
|
339
|
+
|
|
340
|
+
match status {
|
|
341
|
+
Ok(code) if code.success() => {
|
|
342
|
+
last_error.clear();
|
|
343
|
+
break;
|
|
344
|
+
}
|
|
345
|
+
Ok(code) => {
|
|
346
|
+
last_error = format!("curl exited with {:?}", code.code());
|
|
347
|
+
}
|
|
348
|
+
Err(err) => {
|
|
349
|
+
last_error = format!("failed to spawn curl: {err}");
|
|
350
|
+
}
|
|
351
|
+
}
|
|
352
|
+
|
|
353
|
+
if attempt == retries {
|
|
354
|
+
panic!(
|
|
355
|
+
"Failed to download Pdfium from {} after {} attempts. Last error: {}",
|
|
356
|
+
url, retries, last_error
|
|
357
|
+
);
|
|
358
|
+
}
|
|
359
|
+
|
|
360
|
+
let exponent = u32::min(attempt, 5);
|
|
361
|
+
let multiplier = 1u64 << exponent;
|
|
362
|
+
let delay_secs = base_delay.saturating_mul(multiplier).min(30);
|
|
363
|
+
println!(
|
|
364
|
+
"cargo:warning=Pdfium download failed (attempt {}/{}) - {}. Retrying in {}s",
|
|
365
|
+
attempt, retries, last_error, delay_secs
|
|
366
|
+
);
|
|
367
|
+
thread::sleep(Duration::from_secs(delay_secs));
|
|
368
|
+
}
|
|
369
|
+
|
|
370
|
+
// Validate gzip magic bytes (0x1f 0x8b) instead of using external 'file' command
|
|
371
|
+
// This is more portable and works correctly on Windows
|
|
372
|
+
let is_valid_gzip = fs::read(&archive_path)
|
|
373
|
+
.map(|bytes| bytes.len() >= 2 && bytes[0] == 0x1f && bytes[1] == 0x8b)
|
|
374
|
+
.unwrap_or(false);
|
|
375
|
+
|
|
376
|
+
if !is_valid_gzip {
|
|
377
|
+
fs::remove_file(&archive_path).ok();
|
|
378
|
+
panic!(
|
|
379
|
+
"Downloaded file is not a valid gzip archive. URL may be incorrect or version unavailable: {}",
|
|
380
|
+
url
|
|
381
|
+
);
|
|
382
|
+
}
|
|
383
|
+
tracing::debug!("Downloaded file validated as gzip archive");
|
|
384
|
+
|
|
385
|
+
tracing::debug!("Extracting Pdfium archive...");
|
|
386
|
+
let status = Command::new("tar")
|
|
387
|
+
.args(["-xzf", archive_path.to_str().unwrap(), "-C", dest_dir.to_str().unwrap()])
|
|
388
|
+
.status()
|
|
389
|
+
.expect("Failed to execute tar");
|
|
390
|
+
|
|
391
|
+
if !status.success() {
|
|
392
|
+
fs::remove_file(&archive_path).ok();
|
|
393
|
+
panic!("Failed to extract Pdfium archive from {}", url);
|
|
394
|
+
}
|
|
395
|
+
|
|
396
|
+
fs::remove_file(&archive_path).ok();
|
|
397
|
+
|
|
398
|
+
let target = env::var("TARGET").unwrap();
|
|
399
|
+
if target.contains("windows") {
|
|
400
|
+
let lib_dir = dest_dir.join("lib");
|
|
401
|
+
let dll_lib = lib_dir.join("pdfium.dll.lib");
|
|
402
|
+
let expected_lib = lib_dir.join("pdfium.lib");
|
|
403
|
+
|
|
404
|
+
if dll_lib.exists() {
|
|
405
|
+
tracing::debug!("Ensuring Windows import library at {}", expected_lib.display());
|
|
406
|
+
if let Err(err) = fs::copy(&dll_lib, &expected_lib) {
|
|
407
|
+
panic!("Failed to copy pdfium.dll.lib to pdfium.lib: {err}");
|
|
408
|
+
}
|
|
409
|
+
} else {
|
|
410
|
+
tracing::debug!("Warning: Expected {} not found after extraction", dll_lib.display());
|
|
411
|
+
}
|
|
412
|
+
}
|
|
413
|
+
|
|
414
|
+
tracing::debug!("Pdfium downloaded and extracted successfully");
|
|
415
|
+
}
|
|
416
|
+
|
|
417
|
+
/// Prepare prebuilt PDFium by copying to destination directory
|
|
418
|
+
///
|
|
419
|
+
/// Removes existing destination if present, then recursively copies
|
|
420
|
+
/// all files from prebuilt source to destination.
|
|
421
|
+
fn prepare_prebuilt_pdfium(prebuilt_src: &Path, dest_dir: &Path) -> io::Result<()> {
|
|
422
|
+
if dest_dir.exists() {
|
|
423
|
+
fs::remove_dir_all(dest_dir)?;
|
|
424
|
+
}
|
|
425
|
+
copy_dir_all(prebuilt_src, dest_dir)
|
|
426
|
+
}
|
|
427
|
+
|
|
428
|
+
/// Copy the directory tree rooted at `src` into `dst`.
///
/// `dst` (and any missing parents) is created first. Every entry of `src` is
/// then visited: directories are copied recursively, everything else is
/// copied with `fs::copy`. The first I/O error aborts the copy and is returned.
fn copy_dir_all(src: &Path, dst: &Path) -> io::Result<()> {
    fs::create_dir_all(dst)?;

    fs::read_dir(src)?.try_for_each(|item| -> io::Result<()> {
        let item = item?;
        let kind = item.file_type()?;
        let destination = dst.join(item.file_name());
        let source = item.path();

        if kind.is_dir() {
            copy_dir_all(&source, &destination)
        } else {
            fs::copy(&source, &destination).map(|_| ())
        }
    })
}
|
|
446
|
+
|
|
447
|
+
/// Map a target triple to its PDFium library file name and archive subdirectory.
///
/// The first matching substring of `target` decides:
/// - `wasm`: `("libpdfium.a", "release/lib")`
/// - `windows`: `("pdfium.dll", "bin")`
/// - `darwin`: `("libpdfium.dylib", "lib")`
/// - anything else: `("libpdfium.so", "lib")`
fn runtime_library_info(target: &str) -> (String, &'static str) {
    let (file_name, subdir) = match target {
        t if t.contains("wasm") => ("libpdfium.a", "release/lib"),
        t if t.contains("windows") => ("pdfium.dll", "bin"),
        t if t.contains("darwin") => ("libpdfium.dylib", "lib"),
        _ => ("libpdfium.so", "lib"),
    };
    (file_name.to_string(), subdir)
}
|
|
465
|
+
|
|
466
|
+
/// Find PDFium library in archive with flexible directory detection
|
|
467
|
+
///
|
|
468
|
+
/// Attempts to locate the library at multiple possible locations:
|
|
469
|
+
/// - {subdir}/{lib_name} (standard location)
|
|
470
|
+
/// - {lib_name} (root of archive)
|
|
471
|
+
/// - bin/{lib_name} (alternative location)
|
|
472
|
+
/// - lib/{lib_name} (explicit lib directory)
|
|
473
|
+
///
|
|
474
|
+
/// This handles variations in archive structure across different platform builds,
|
|
475
|
+
/// particularly macOS ARM64 where the archive structure may differ.
|
|
476
|
+
///
|
|
477
|
+
/// Returns the full path to the library if found, or an error with available files.
|
|
478
|
+
fn find_pdfium_library(pdfium_dir: &Path, lib_name: &str, expected_subdir: &str) -> Result<PathBuf, String> {
|
|
479
|
+
let candidates = [
|
|
480
|
+
pdfium_dir.join(expected_subdir).join(lib_name),
|
|
481
|
+
pdfium_dir.join(lib_name),
|
|
482
|
+
pdfium_dir.join("bin").join(lib_name),
|
|
483
|
+
pdfium_dir.join("lib").join(lib_name),
|
|
484
|
+
];
|
|
485
|
+
|
|
486
|
+
for candidate in &candidates {
|
|
487
|
+
if candidate.exists() {
|
|
488
|
+
tracing::debug!("Found PDFium library at: {}", candidate.display());
|
|
489
|
+
return Ok(candidate.clone());
|
|
490
|
+
}
|
|
491
|
+
}
|
|
492
|
+
|
|
493
|
+
let mut error_msg = format!(
|
|
494
|
+
"PDFium library not found at expected location: {}/{}\n\n",
|
|
495
|
+
pdfium_dir.display(),
|
|
496
|
+
expected_subdir
|
|
497
|
+
);
|
|
498
|
+
error_msg.push_str("Attempted locations:\n");
|
|
499
|
+
for candidate in &candidates {
|
|
500
|
+
error_msg.push_str(&format!(" - {}\n", candidate.display()));
|
|
501
|
+
}
|
|
502
|
+
|
|
503
|
+
error_msg.push_str("\nActual archive contents:\n");
|
|
504
|
+
if let Ok(entries) = fs::read_dir(pdfium_dir) {
|
|
505
|
+
for entry in entries.flatten() {
|
|
506
|
+
let path = entry.path();
|
|
507
|
+
let file_type = if path.is_dir() { "dir" } else { "file" };
|
|
508
|
+
error_msg.push_str(&format!(" {} ({})\n", path.display(), file_type));
|
|
509
|
+
|
|
510
|
+
if path.is_dir()
|
|
511
|
+
&& let Ok(sub_entries) = fs::read_dir(&path)
|
|
512
|
+
{
|
|
513
|
+
for sub_entry in sub_entries.flatten() {
|
|
514
|
+
let sub_path = sub_entry.path();
|
|
515
|
+
let sub_type = if sub_path.is_dir() { "dir" } else { "file" };
|
|
516
|
+
error_msg.push_str(&format!(" {} ({})\n", sub_path.display(), sub_type));
|
|
517
|
+
}
|
|
518
|
+
}
|
|
519
|
+
}
|
|
520
|
+
}
|
|
521
|
+
|
|
522
|
+
Err(error_msg)
|
|
523
|
+
}
|
|
524
|
+
|
|
525
|
+
/// Link PDFium dynamically (default)
|
|
526
|
+
///
|
|
527
|
+
/// Sets up linker to use PDFium as a dynamic library (.dylib/.so/.dll)
|
|
528
|
+
/// with platform-specific rpath configuration for runtime library discovery.
|
|
529
|
+
/// Supports flexible archive structures by adding multiple possible lib directories.
|
|
530
|
+
fn link_dynamically(pdfium_dir: &Path, target: &str) {
|
|
531
|
+
let (runtime_lib_name, runtime_subdir) = runtime_library_info(target);
|
|
532
|
+
|
|
533
|
+
let lib_path = match find_pdfium_library(pdfium_dir, &runtime_lib_name, runtime_subdir) {
|
|
534
|
+
Ok(path) => path.parent().unwrap_or(pdfium_dir).to_path_buf(),
|
|
535
|
+
Err(err) => panic!("{}", err),
|
|
536
|
+
};
|
|
537
|
+
|
|
538
|
+
println!("cargo:rustc-link-search=native={}", lib_path.display());
|
|
539
|
+
println!("cargo:rustc-link-lib=dylib=pdfium");
|
|
540
|
+
|
|
541
|
+
let std_lib_dir = pdfium_dir.join("lib");
|
|
542
|
+
if std_lib_dir.exists() && std_lib_dir != lib_path {
|
|
543
|
+
println!("cargo:rustc-link-search=native={}", std_lib_dir.display());
|
|
544
|
+
}
|
|
545
|
+
|
|
546
|
+
let bin_dir = pdfium_dir.join("bin");
|
|
547
|
+
if bin_dir.exists() && bin_dir != lib_path {
|
|
548
|
+
println!("cargo:rustc-link-search=native={}", bin_dir.display());
|
|
549
|
+
}
|
|
550
|
+
|
|
551
|
+
if target.contains("darwin") {
|
|
552
|
+
println!("cargo:rustc-link-arg=-Wl,-rpath,@loader_path");
|
|
553
|
+
println!("cargo:rustc-link-arg=-Wl,-rpath,@loader_path/.");
|
|
554
|
+
} else if target.contains("linux") {
|
|
555
|
+
println!("cargo:rustc-link-arg=-Wl,-rpath,$ORIGIN");
|
|
556
|
+
println!("cargo:rustc-link-arg=-Wl,-rpath,$ORIGIN/.");
|
|
557
|
+
}
|
|
558
|
+
}
|
|
559
|
+
|
|
560
|
+
/// Link PDFium statically (static-pdfium feature)
|
|
561
|
+
///
|
|
562
|
+
/// Embeds PDFium into the binary as a static library. Adds system
|
|
563
|
+
/// dependencies required for static linking on Linux.
|
|
564
|
+
/// Supports flexible archive structures by finding library in multiple locations.
|
|
565
|
+
///
|
|
566
|
+
/// Environment Variables:
|
|
567
|
+
/// - `PDFIUM_STATIC_LIB_PATH`: Path to directory containing libpdfium.a (for Docker/musl builds)
|
|
568
|
+
///
|
|
569
|
+
/// Note: bblanchon/pdfium-binaries only provides dynamic libraries.
|
|
570
|
+
/// On macOS, this will fallback to dynamic linking with a warning.
|
|
571
|
+
/// On Linux, you must provide PDFIUM_STATIC_LIB_PATH pointing to a static build.
|
|
572
|
+
fn link_statically(pdfium_dir: &Path, target: &str) {
|
|
573
|
+
let static_lib_name = "libpdfium.a";
|
|
574
|
+
let lib_subdir = if target.contains("wasm") { "release/lib" } else { "lib" };
|
|
575
|
+
|
|
576
|
+
if let Ok(custom_path) = env::var("PDFIUM_STATIC_LIB_PATH") {
|
|
577
|
+
let custom_lib_dir = PathBuf::from(&custom_path);
|
|
578
|
+
|
|
579
|
+
if !custom_lib_dir.exists() {
|
|
580
|
+
panic!(
|
|
581
|
+
"PDFIUM_STATIC_LIB_PATH points to '{}' but the directory does not exist",
|
|
582
|
+
custom_path
|
|
583
|
+
);
|
|
584
|
+
}
|
|
585
|
+
|
|
586
|
+
let custom_lib = custom_lib_dir.join(static_lib_name);
|
|
587
|
+
if !custom_lib.exists() {
|
|
588
|
+
panic!(
|
|
589
|
+
"PDFIUM_STATIC_LIB_PATH points to '{}' but {} not found.\n\
|
|
590
|
+
Expected to find: {}",
|
|
591
|
+
custom_path,
|
|
592
|
+
static_lib_name,
|
|
593
|
+
custom_lib.display()
|
|
594
|
+
);
|
|
595
|
+
}
|
|
596
|
+
|
|
597
|
+
tracing::debug!("Using custom static PDFium from: {}", custom_lib.display());
|
|
598
|
+
println!("cargo:rustc-link-search=native={}", custom_lib_dir.display());
|
|
599
|
+
println!("cargo:rustc-link-lib=static=pdfium");
|
|
600
|
+
|
|
601
|
+
if target.contains("linux") {
|
|
602
|
+
println!("cargo:rustc-link-lib=dylib=pthread");
|
|
603
|
+
println!("cargo:rustc-link-lib=dylib=dl");
|
|
604
|
+
} else if target.contains("windows") {
|
|
605
|
+
println!("cargo:rustc-link-lib=dylib=ws2_32");
|
|
606
|
+
println!("cargo:rustc-link-lib=dylib=userenv");
|
|
607
|
+
}
|
|
608
|
+
|
|
609
|
+
return;
|
|
610
|
+
}
|
|
611
|
+
|
|
612
|
+
let lib_path = match find_pdfium_library(pdfium_dir, static_lib_name, lib_subdir) {
|
|
613
|
+
Ok(path) => path.parent().unwrap_or(pdfium_dir).to_path_buf(),
|
|
614
|
+
Err(_err) => {
|
|
615
|
+
if target.contains("darwin") {
|
|
616
|
+
eprintln!("cargo:warning=Static PDFium library (libpdfium.a) not found for macOS.");
|
|
617
|
+
eprintln!("cargo:warning=bblanchon/pdfium-binaries only provides dynamic libraries.");
|
|
618
|
+
eprintln!("cargo:warning=Falling back to dynamic linking for local development.");
|
|
619
|
+
eprintln!("cargo:warning=Production Linux builds require PDFIUM_STATIC_LIB_PATH.");
|
|
620
|
+
|
|
621
|
+
link_dynamically(pdfium_dir, target);
|
|
622
|
+
return;
|
|
623
|
+
} else {
|
|
624
|
+
panic!(
|
|
625
|
+
"Static PDFium library (libpdfium.a) not found.\n\n\
|
|
626
|
+
bblanchon/pdfium-binaries only provides dynamic libraries.\n\n\
|
|
627
|
+
For static linking (required for Docker with musl), you must:\n\n\
|
|
628
|
+
1. Build static PDFium or obtain from a source that provides it\n\
|
|
629
|
+
- See: https://github.com/ajrcarey/pdfium-render/issues/53\n\
|
|
630
|
+
- Or use: https://github.com/paulocoutinhox/pdfium-lib (provides static builds)\n\n\
|
|
631
|
+
2. Set environment variable pointing to the directory containing libpdfium.a:\n\
|
|
632
|
+
export PDFIUM_STATIC_LIB_PATH=/path/to/pdfium/lib\n\n\
|
|
633
|
+
3. Or use alternative features:\n\
|
|
634
|
+
- 'pdf' (dynamic linking, requires .so at runtime)\n\
|
|
635
|
+
- 'bundled-pdfium' (embeds dynamic library in binary)\n\
|
|
636
|
+
- 'system-pdfium' (use system-installed pdfium)\n\n\
|
|
637
|
+
Example Dockerfile pattern:\n\
|
|
638
|
+
FROM alpine:latest as pdfium-builder\n\
|
|
639
|
+
# Download/build static libpdfium.a\n\
|
|
640
|
+
\n\
|
|
641
|
+
FROM rust:alpine as builder\n\
|
|
642
|
+
ENV PDFIUM_STATIC_LIB_PATH=/pdfium/lib\n\
|
|
643
|
+
COPY --from=pdfium-builder /path/to/libpdfium.a /pdfium/lib/"
|
|
644
|
+
);
|
|
645
|
+
}
|
|
646
|
+
}
|
|
647
|
+
};
|
|
648
|
+
|
|
649
|
+
println!("cargo:rustc-link-search=native={}", lib_path.display());
|
|
650
|
+
println!("cargo:rustc-link-lib=static=pdfium");
|
|
651
|
+
|
|
652
|
+
let std_lib_dir = pdfium_dir.join("lib");
|
|
653
|
+
if std_lib_dir.exists() && std_lib_dir != lib_path {
|
|
654
|
+
println!("cargo:rustc-link-search=native={}", std_lib_dir.display());
|
|
655
|
+
}
|
|
656
|
+
|
|
657
|
+
let bin_dir = pdfium_dir.join("bin");
|
|
658
|
+
if bin_dir.exists() && bin_dir != lib_path {
|
|
659
|
+
println!("cargo:rustc-link-search=native={}", bin_dir.display());
|
|
660
|
+
}
|
|
661
|
+
|
|
662
|
+
if target.contains("linux") {
|
|
663
|
+
println!("cargo:rustc-link-lib=dylib=pthread");
|
|
664
|
+
println!("cargo:rustc-link-lib=dylib=dl");
|
|
665
|
+
} else if target.contains("windows") {
|
|
666
|
+
println!("cargo:rustc-link-lib=dylib=ws2_32");
|
|
667
|
+
println!("cargo:rustc-link-lib=dylib=userenv");
|
|
668
|
+
}
|
|
669
|
+
}
|
|
670
|
+
|
|
671
|
+
/// Link PDFium bundled (bundled-pdfium feature)
|
|
672
|
+
///
|
|
673
|
+
/// Links dynamically but copies library to OUT_DIR for embedding in binary.
|
|
674
|
+
/// Each binary extracts and uses its own copy of the PDFium library.
|
|
675
|
+
/// Supports flexible archive structures by finding library in multiple locations.
|
|
676
|
+
///
|
|
677
|
+
/// For WASM targets, links statically using the bundled static library.
|
|
678
|
+
fn link_bundled(pdfium_dir: &Path, target: &str, out_dir: &Path) {
|
|
679
|
+
let (runtime_lib_name, runtime_subdir) = runtime_library_info(target);
|
|
680
|
+
let src_lib = match find_pdfium_library(pdfium_dir, &runtime_lib_name, runtime_subdir) {
|
|
681
|
+
Ok(path) => path,
|
|
682
|
+
Err(err) => panic!("{}", err),
|
|
683
|
+
};
|
|
684
|
+
let bundled_lib = out_dir.join(&runtime_lib_name);
|
|
685
|
+
|
|
686
|
+
fs::copy(&src_lib, &bundled_lib)
|
|
687
|
+
.unwrap_or_else(|err| panic!("Failed to copy library to OUT_DIR for bundling: {}", err));
|
|
688
|
+
|
|
689
|
+
let bundled_path = bundled_lib
|
|
690
|
+
.to_str()
|
|
691
|
+
.unwrap_or_else(|| panic!("Non-UTF8 path for bundled library: {}", bundled_lib.display()));
|
|
692
|
+
println!("cargo:rustc-env=KREUZBERG_PDFIUM_BUNDLED_PATH={}", bundled_path);
|
|
693
|
+
|
|
694
|
+
if target.contains("wasm") {
|
|
695
|
+
let lib_dir = bundled_lib
|
|
696
|
+
.parent()
|
|
697
|
+
.unwrap_or_else(|| panic!("Invalid bundled library path: {}", bundled_lib.display()));
|
|
698
|
+
println!("cargo:rustc-link-search=native={}", lib_dir.display());
|
|
699
|
+
println!("cargo:rustc-link-lib=static=pdfium");
|
|
700
|
+
tracing::debug!("Bundled PDFium static library linked for WASM at: {}", bundled_path);
|
|
701
|
+
} else {
|
|
702
|
+
tracing::debug!("Bundled PDFium library at: {}", bundled_path);
|
|
703
|
+
}
|
|
704
|
+
}
|
|
705
|
+
|
|
706
|
+
/// Link system-installed PDFium (system-pdfium feature)
|
|
707
|
+
///
|
|
708
|
+
/// Attempts to find PDFium via pkg-config first, then falls back to
|
|
709
|
+
/// environment variables (KREUZBERG_PDFIUM_SYSTEM_PATH, KREUZBERG_PDFIUM_SYSTEM_INCLUDE).
|
|
710
|
+
fn link_system(_target: &str) {
|
|
711
|
+
match pkg_config::Config::new().atleast_version("5.0").probe("pdfium") {
|
|
712
|
+
Ok(library) => {
|
|
713
|
+
tracing::debug!("Found system pdfium via pkg-config");
|
|
714
|
+
for include_path in &library.include_paths {
|
|
715
|
+
println!("cargo:include={}", include_path.display());
|
|
716
|
+
}
|
|
717
|
+
return;
|
|
718
|
+
}
|
|
719
|
+
Err(err) => {
|
|
720
|
+
tracing::debug!("pkg-config probe failed: {}", err);
|
|
721
|
+
}
|
|
722
|
+
}
|
|
723
|
+
|
|
724
|
+
let lib_path = env::var("KREUZBERG_PDFIUM_SYSTEM_PATH").ok();
|
|
725
|
+
let include_path = env::var("KREUZBERG_PDFIUM_SYSTEM_INCLUDE").ok();
|
|
726
|
+
|
|
727
|
+
if let Some(lib_dir) = lib_path {
|
|
728
|
+
let lib_dir_path = PathBuf::from(&lib_dir);
|
|
729
|
+
if !lib_dir_path.exists() {
|
|
730
|
+
panic!(
|
|
731
|
+
"KREUZBERG_PDFIUM_SYSTEM_PATH points to '{}' but the directory does not exist",
|
|
732
|
+
lib_dir
|
|
733
|
+
);
|
|
734
|
+
}
|
|
735
|
+
|
|
736
|
+
println!("cargo:rustc-link-search=native={}", lib_dir);
|
|
737
|
+
println!("cargo:rustc-link-lib=dylib=pdfium");
|
|
738
|
+
|
|
739
|
+
if let Some(inc_dir) = include_path {
|
|
740
|
+
println!("cargo:include={}", inc_dir);
|
|
741
|
+
}
|
|
742
|
+
|
|
743
|
+
tracing::debug!("Using system pdfium from: {}", lib_dir);
|
|
744
|
+
return;
|
|
745
|
+
}
|
|
746
|
+
|
|
747
|
+
panic!(
|
|
748
|
+
"system-pdfium feature enabled but pdfium not found.\n\
|
|
749
|
+
\n\
|
|
750
|
+
Please install pdfium system-wide or provide:\n\
|
|
751
|
+
- KREUZBERG_PDFIUM_SYSTEM_PATH: path to directory containing libpdfium\n\
|
|
752
|
+
- KREUZBERG_PDFIUM_SYSTEM_INCLUDE: path to pdfium headers (optional)\n\
|
|
753
|
+
\n\
|
|
754
|
+
Alternatively, use a different linking strategy:\n\
|
|
755
|
+
- Default (dynamic): cargo build --features pdf\n\
|
|
756
|
+
- Static linking: cargo build --features pdf,static-pdfium\n\
|
|
757
|
+
- Bundled: cargo build --features pdf,bundled-pdfium"
|
|
758
|
+
);
|
|
759
|
+
}
|
|
760
|
+
|
|
761
|
+
/// Emit link directives for the platform libraries used alongside PDFium.
///
/// - darwin: CoreFoundation, CoreGraphics, CoreText, AppKit frameworks and `c++`
/// - linux: `stdc++`, `m`
/// - windows: `gdi32`, `user32`, `advapi32`
///
/// Other targets get no directives.
fn link_system_frameworks(target: &str) {
    let libraries: &[&str] = if target.contains("darwin") {
        &[
            "framework=CoreFoundation",
            "framework=CoreGraphics",
            "framework=CoreText",
            "framework=AppKit",
            "dylib=c++",
        ]
    } else if target.contains("linux") {
        &["dylib=stdc++", "dylib=m"]
    } else if target.contains("windows") {
        &["dylib=gdi32", "dylib=user32", "dylib=advapi32"]
    } else {
        &[]
    };

    for library in libraries {
        println!("cargo:rustc-link-lib={}", library);
    }
}
|