kreuzberg 4.0.0.pre.rc.29 → 4.0.0.rc1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +0 -6
- data/.rubocop.yaml +534 -1
- data/Gemfile +2 -1
- data/Gemfile.lock +28 -116
- data/README.md +269 -629
- data/Rakefile +0 -9
- data/Steepfile +4 -8
- data/examples/async_patterns.rb +58 -1
- data/ext/kreuzberg_rb/extconf.rb +5 -35
- data/ext/kreuzberg_rb/native/Cargo.toml +16 -55
- data/ext/kreuzberg_rb/native/build.rs +14 -12
- data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
- data/ext/kreuzberg_rb/native/include/strings.h +2 -2
- data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
- data/ext/kreuzberg_rb/native/src/lib.rs +34 -897
- data/extconf.rb +6 -38
- data/kreuzberg.gemspec +20 -114
- data/lib/kreuzberg/api_proxy.rb +18 -2
- data/lib/kreuzberg/cache_api.rb +0 -22
- data/lib/kreuzberg/cli.rb +10 -2
- data/lib/kreuzberg/cli_proxy.rb +10 -0
- data/lib/kreuzberg/config.rb +22 -274
- data/lib/kreuzberg/errors.rb +7 -73
- data/lib/kreuzberg/extraction_api.rb +8 -237
- data/lib/kreuzberg/mcp_proxy.rb +11 -2
- data/lib/kreuzberg/ocr_backend_protocol.rb +73 -0
- data/lib/kreuzberg/post_processor_protocol.rb +71 -0
- data/lib/kreuzberg/result.rb +33 -151
- data/lib/kreuzberg/setup_lib_path.rb +2 -22
- data/lib/kreuzberg/validator_protocol.rb +73 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +13 -27
- data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
- data/sig/kreuzberg.rbs +12 -105
- data/spec/binding/cache_spec.rb +22 -22
- data/spec/binding/cli_proxy_spec.rb +4 -2
- data/spec/binding/cli_spec.rb +11 -12
- data/spec/binding/config_spec.rb +0 -74
- data/spec/binding/config_validation_spec.rb +6 -100
- data/spec/binding/error_handling_spec.rb +97 -283
- data/spec/binding/plugins/ocr_backend_spec.rb +8 -8
- data/spec/binding/plugins/postprocessor_spec.rb +11 -11
- data/spec/binding/plugins/validator_spec.rb +13 -12
- data/spec/examples.txt +104 -0
- data/spec/fixtures/config.toml +1 -0
- data/spec/fixtures/config.yaml +1 -0
- data/spec/fixtures/invalid_config.toml +1 -0
- data/spec/smoke/package_spec.rb +3 -2
- data/spec/spec_helper.rb +3 -1
- data/vendor/kreuzberg/Cargo.toml +67 -192
- data/vendor/kreuzberg/README.md +9 -97
- data/vendor/kreuzberg/build.rs +194 -516
- data/vendor/kreuzberg/src/api/handlers.rs +9 -130
- data/vendor/kreuzberg/src/api/mod.rs +3 -18
- data/vendor/kreuzberg/src/api/server.rs +71 -236
- data/vendor/kreuzberg/src/api/types.rs +7 -43
- data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
- data/vendor/kreuzberg/src/cache/mod.rs +3 -27
- data/vendor/kreuzberg/src/chunking/mod.rs +79 -1705
- data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
- data/vendor/kreuzberg/src/core/config.rs +23 -905
- data/vendor/kreuzberg/src/core/extractor.rs +106 -403
- data/vendor/kreuzberg/src/core/io.rs +2 -4
- data/vendor/kreuzberg/src/core/mime.rs +12 -2
- data/vendor/kreuzberg/src/core/mod.rs +3 -22
- data/vendor/kreuzberg/src/core/pipeline.rs +78 -395
- data/vendor/kreuzberg/src/embeddings.rs +21 -169
- data/vendor/kreuzberg/src/error.rs +2 -2
- data/vendor/kreuzberg/src/extraction/archive.rs +31 -36
- data/vendor/kreuzberg/src/extraction/docx.rs +1 -365
- data/vendor/kreuzberg/src/extraction/email.rs +11 -12
- data/vendor/kreuzberg/src/extraction/excel.rs +129 -138
- data/vendor/kreuzberg/src/extraction/html.rs +170 -1447
- data/vendor/kreuzberg/src/extraction/image.rs +14 -138
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +3 -13
- data/vendor/kreuzberg/src/extraction/mod.rs +5 -21
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
- data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
- data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
- data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
- data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
- data/vendor/kreuzberg/src/extraction/pptx.rs +94 -196
- data/vendor/kreuzberg/src/extraction/structured.rs +4 -5
- data/vendor/kreuzberg/src/extraction/table.rs +1 -2
- data/vendor/kreuzberg/src/extraction/text.rs +10 -18
- data/vendor/kreuzberg/src/extractors/archive.rs +0 -22
- data/vendor/kreuzberg/src/extractors/docx.rs +148 -69
- data/vendor/kreuzberg/src/extractors/email.rs +9 -37
- data/vendor/kreuzberg/src/extractors/excel.rs +40 -81
- data/vendor/kreuzberg/src/extractors/html.rs +173 -182
- data/vendor/kreuzberg/src/extractors/image.rs +8 -32
- data/vendor/kreuzberg/src/extractors/mod.rs +10 -171
- data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
- data/vendor/kreuzberg/src/extractors/pdf.rs +64 -329
- data/vendor/kreuzberg/src/extractors/pptx.rs +34 -79
- data/vendor/kreuzberg/src/extractors/structured.rs +0 -16
- data/vendor/kreuzberg/src/extractors/text.rs +7 -30
- data/vendor/kreuzberg/src/extractors/xml.rs +8 -27
- data/vendor/kreuzberg/src/keywords/processor.rs +1 -9
- data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
- data/vendor/kreuzberg/src/language_detection/mod.rs +51 -94
- data/vendor/kreuzberg/src/lib.rs +5 -17
- data/vendor/kreuzberg/src/mcp/mod.rs +1 -4
- data/vendor/kreuzberg/src/mcp/server.rs +21 -145
- data/vendor/kreuzberg/src/ocr/mod.rs +0 -2
- data/vendor/kreuzberg/src/ocr/processor.rs +8 -19
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +0 -2
- data/vendor/kreuzberg/src/pdf/error.rs +1 -93
- data/vendor/kreuzberg/src/pdf/metadata.rs +100 -263
- data/vendor/kreuzberg/src/pdf/mod.rs +2 -33
- data/vendor/kreuzberg/src/pdf/rendering.rs +12 -12
- data/vendor/kreuzberg/src/pdf/table.rs +64 -61
- data/vendor/kreuzberg/src/pdf/text.rs +24 -416
- data/vendor/kreuzberg/src/plugins/extractor.rs +8 -40
- data/vendor/kreuzberg/src/plugins/mod.rs +0 -3
- data/vendor/kreuzberg/src/plugins/ocr.rs +14 -22
- data/vendor/kreuzberg/src/plugins/processor.rs +1 -10
- data/vendor/kreuzberg/src/plugins/registry.rs +0 -15
- data/vendor/kreuzberg/src/plugins/validator.rs +8 -20
- data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
- data/vendor/kreuzberg/src/text/mod.rs +0 -8
- data/vendor/kreuzberg/src/text/quality.rs +15 -28
- data/vendor/kreuzberg/src/text/string_utils.rs +10 -22
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +50 -86
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +16 -37
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +1 -2
- data/vendor/kreuzberg/src/types.rs +67 -907
- data/vendor/kreuzberg/src/utils/mod.rs +0 -14
- data/vendor/kreuzberg/src/utils/quality.rs +3 -12
- data/vendor/kreuzberg/tests/api_tests.rs +0 -506
- data/vendor/kreuzberg/tests/archive_integration.rs +0 -2
- data/vendor/kreuzberg/tests/batch_orchestration.rs +12 -57
- data/vendor/kreuzberg/tests/batch_processing.rs +8 -32
- data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +8 -40
- data/vendor/kreuzberg/tests/config_features.rs +1 -33
- data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -16
- data/vendor/kreuzberg/tests/core_integration.rs +9 -35
- data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
- data/vendor/kreuzberg/tests/email_integration.rs +1 -3
- data/vendor/kreuzberg/tests/error_handling.rs +34 -43
- data/vendor/kreuzberg/tests/format_integration.rs +1 -7
- data/vendor/kreuzberg/tests/helpers/mod.rs +0 -60
- data/vendor/kreuzberg/tests/image_integration.rs +0 -2
- data/vendor/kreuzberg/tests/mime_detection.rs +16 -17
- data/vendor/kreuzberg/tests/ocr_configuration.rs +0 -4
- data/vendor/kreuzberg/tests/ocr_errors.rs +0 -22
- data/vendor/kreuzberg/tests/ocr_quality.rs +0 -2
- data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
- data/vendor/kreuzberg/tests/pdf_integration.rs +0 -2
- data/vendor/kreuzberg/tests/pipeline_integration.rs +2 -36
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +0 -5
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -17
- data/vendor/kreuzberg/tests/plugin_system.rs +0 -6
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -2
- data/vendor/kreuzberg/tests/security_validation.rs +1 -13
- data/vendor/kreuzberg/tests/test_fastembed.rs +23 -45
- metadata +25 -171
- data/.rubocop.yml +0 -543
- data/ext/kreuzberg_rb/native/.cargo/config.toml +0 -23
- data/ext/kreuzberg_rb/native/Cargo.lock +0 -7619
- data/lib/kreuzberg/error_context.rb +0 -136
- data/lib/kreuzberg/types.rb +0 -170
- data/lib/libpdfium.so +0 -0
- data/spec/binding/async_operations_spec.rb +0 -473
- data/spec/binding/batch_operations_spec.rb +0 -595
- data/spec/binding/batch_spec.rb +0 -359
- data/spec/binding/config_result_spec.rb +0 -377
- data/spec/binding/embeddings_spec.rb +0 -816
- data/spec/binding/error_recovery_spec.rb +0 -488
- data/spec/binding/font_config_spec.rb +0 -220
- data/spec/binding/images_spec.rb +0 -738
- data/spec/binding/keywords_extraction_spec.rb +0 -600
- data/spec/binding/metadata_types_spec.rb +0 -1228
- data/spec/binding/pages_extraction_spec.rb +0 -471
- data/spec/binding/tables_spec.rb +0 -641
- data/spec/unit/config/chunking_config_spec.rb +0 -213
- data/spec/unit/config/embedding_config_spec.rb +0 -343
- data/spec/unit/config/extraction_config_spec.rb +0 -438
- data/spec/unit/config/font_config_spec.rb +0 -285
- data/spec/unit/config/hierarchy_config_spec.rb +0 -314
- data/spec/unit/config/image_extraction_config_spec.rb +0 -209
- data/spec/unit/config/image_preprocessing_config_spec.rb +0 -249
- data/spec/unit/config/keyword_config_spec.rb +0 -229
- data/spec/unit/config/language_detection_config_spec.rb +0 -258
- data/spec/unit/config/ocr_config_spec.rb +0 -171
- data/spec/unit/config/page_config_spec.rb +0 -221
- data/spec/unit/config/pdf_config_spec.rb +0 -267
- data/spec/unit/config/postprocessor_config_spec.rb +0 -290
- data/spec/unit/config/tesseract_config_spec.rb +0 -181
- data/spec/unit/config/token_reduction_config_spec.rb +0 -251
- data/test/metadata_types_test.rb +0 -959
- data/vendor/Cargo.toml +0 -61
- data/vendor/kreuzberg/examples/bench_fixes.rs +0 -71
- data/vendor/kreuzberg/examples/test_pdfium_fork.rs +0 -62
- data/vendor/kreuzberg/src/chunking/processor.rs +0 -219
- data/vendor/kreuzberg/src/core/batch_optimizations.rs +0 -385
- data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
- data/vendor/kreuzberg/src/core/formats.rs +0 -235
- data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
- data/vendor/kreuzberg/src/extraction/capacity.rs +0 -263
- data/vendor/kreuzberg/src/extraction/markdown.rs +0 -216
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -284
- data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -470
- data/vendor/kreuzberg/src/extractors/docbook.rs +0 -504
- data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -492
- data/vendor/kreuzberg/src/extractors/jats.rs +0 -1054
- data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -368
- data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
- data/vendor/kreuzberg/src/extractors/markdown.rs +0 -701
- data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
- data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
- data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -529
- data/vendor/kreuzberg/src/extractors/rst.rs +0 -577
- data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
- data/vendor/kreuzberg/src/extractors/security.rs +0 -484
- data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
- data/vendor/kreuzberg/src/extractors/typst.rs +0 -651
- data/vendor/kreuzberg/src/language_detection/processor.rs +0 -218
- data/vendor/kreuzberg/src/ocr/language_registry.rs +0 -520
- data/vendor/kreuzberg/src/panic_context.rs +0 -154
- data/vendor/kreuzberg/src/pdf/bindings.rs +0 -306
- data/vendor/kreuzberg/src/pdf/bundled.rs +0 -408
- data/vendor/kreuzberg/src/pdf/fonts.rs +0 -358
- data/vendor/kreuzberg/src/pdf/hierarchy.rs +0 -903
- data/vendor/kreuzberg/src/text/quality_processor.rs +0 -231
- data/vendor/kreuzberg/src/text/utf8_validation.rs +0 -193
- data/vendor/kreuzberg/src/utils/pool.rs +0 -503
- data/vendor/kreuzberg/src/utils/pool_sizing.rs +0 -364
- data/vendor/kreuzberg/src/utils/string_pool.rs +0 -761
- data/vendor/kreuzberg/tests/api_embed.rs +0 -360
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
- data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +0 -471
- data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +0 -289
- data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +0 -154
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
- data/vendor/kreuzberg/tests/config_integration_test.rs +0 -753
- data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +0 -294
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -500
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
- data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
- data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
- data/vendor/kreuzberg/tests/ocr_language_registry.rs +0 -191
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -674
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
- data/vendor/kreuzberg/tests/page_markers.rs +0 -297
- data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +0 -301
- data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +0 -589
- data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +0 -301
- data/vendor/kreuzberg/tests/pdf_text_merging.rs +0 -475
- data/vendor/kreuzberg/tests/pdfium_linking.rs +0 -340
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -694
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -775
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1260
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -648
- data/vendor/kreuzberg-ffi/Cargo.toml +0 -67
- data/vendor/kreuzberg-ffi/README.md +0 -851
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +0 -227
- data/vendor/kreuzberg-ffi/build.rs +0 -168
- data/vendor/kreuzberg-ffi/cbindgen.toml +0 -37
- data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +0 -12
- data/vendor/kreuzberg-ffi/kreuzberg.h +0 -3012
- data/vendor/kreuzberg-ffi/src/batch_streaming.rs +0 -588
- data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
- data/vendor/kreuzberg-ffi/src/error.rs +0 -901
- data/vendor/kreuzberg-ffi/src/extraction.rs +0 -555
- data/vendor/kreuzberg-ffi/src/helpers.rs +0 -879
- data/vendor/kreuzberg-ffi/src/lib.rs +0 -977
- data/vendor/kreuzberg-ffi/src/memory.rs +0 -493
- data/vendor/kreuzberg-ffi/src/mime.rs +0 -329
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +0 -265
- data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +0 -442
- data/vendor/kreuzberg-ffi/src/plugins/mod.rs +0 -14
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +0 -628
- data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +0 -438
- data/vendor/kreuzberg-ffi/src/plugins/validator.rs +0 -329
- data/vendor/kreuzberg-ffi/src/result.rs +0 -510
- data/vendor/kreuzberg-ffi/src/result_pool.rs +0 -639
- data/vendor/kreuzberg-ffi/src/result_view.rs +0 -773
- data/vendor/kreuzberg-ffi/src/string_intern.rs +0 -568
- data/vendor/kreuzberg-ffi/src/types.rs +0 -363
- data/vendor/kreuzberg-ffi/src/util.rs +0 -210
- data/vendor/kreuzberg-ffi/src/validation.rs +0 -848
- data/vendor/kreuzberg-ffi/tests.disabled/README.md +0 -48
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +0 -299
- data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +0 -346
- data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +0 -232
- data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +0 -470
- data/vendor/kreuzberg-tesseract/.commitlintrc.json +0 -13
- data/vendor/kreuzberg-tesseract/.crate-ignore +0 -2
- data/vendor/kreuzberg-tesseract/Cargo.lock +0 -2933
- data/vendor/kreuzberg-tesseract/Cargo.toml +0 -57
- data/vendor/kreuzberg-tesseract/LICENSE +0 -22
- data/vendor/kreuzberg-tesseract/README.md +0 -399
- data/vendor/kreuzberg-tesseract/build.rs +0 -1127
- data/vendor/kreuzberg-tesseract/patches/README.md +0 -71
- data/vendor/kreuzberg-tesseract/patches/tesseract.diff +0 -199
- data/vendor/kreuzberg-tesseract/src/api.rs +0 -1371
- data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +0 -77
- data/vendor/kreuzberg-tesseract/src/enums.rs +0 -297
- data/vendor/kreuzberg-tesseract/src/error.rs +0 -81
- data/vendor/kreuzberg-tesseract/src/lib.rs +0 -145
- data/vendor/kreuzberg-tesseract/src/monitor.rs +0 -57
- data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +0 -197
- data/vendor/kreuzberg-tesseract/src/page_iterator.rs +0 -253
- data/vendor/kreuzberg-tesseract/src/result_iterator.rs +0 -286
- data/vendor/kreuzberg-tesseract/src/result_renderer.rs +0 -183
- data/vendor/kreuzberg-tesseract/tests/integration_test.rs +0 -211
data/vendor/kreuzberg/build.rs
CHANGED
|
@@ -6,102 +6,14 @@ use std::process::Command;
|
|
|
6
6
|
use std::thread;
|
|
7
7
|
use std::time::Duration;
|
|
8
8
|
|
|
9
|
-
/// PDFium linking strategy
|
|
10
|
-
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
|
11
|
-
enum PdfiumLinkStrategy {
|
|
12
|
-
/// Download and link statically (static-pdfium feature)
|
|
13
|
-
DownloadStatic,
|
|
14
|
-
/// Download, link dynamically, and embed in binary (bundled-pdfium feature)
|
|
15
|
-
Bundled,
|
|
16
|
-
/// Use system-installed pdfium via pkg-config (system-pdfium feature)
|
|
17
|
-
System,
|
|
18
|
-
}
|
|
19
|
-
|
|
20
9
|
fn main() {
|
|
21
10
|
let target = env::var("TARGET").unwrap();
|
|
22
11
|
let out_dir = PathBuf::from(env::var("OUT_DIR").unwrap());
|
|
23
12
|
|
|
24
13
|
println!("cargo::rustc-check-cfg=cfg(coverage)");
|
|
25
14
|
|
|
26
|
-
|
|
27
|
-
tracing::debug!("PDF feature not enabled, skipping pdfium linking");
|
|
28
|
-
return;
|
|
29
|
-
}
|
|
30
|
-
|
|
31
|
-
let strategy = determine_link_strategy(&target);
|
|
32
|
-
|
|
33
|
-
tracing::debug!("Using PDFium linking strategy: {:?}", strategy);
|
|
34
|
-
|
|
35
|
-
match strategy {
|
|
36
|
-
PdfiumLinkStrategy::DownloadStatic => {
|
|
37
|
-
let pdfium_dir = download_or_use_prebuilt(&target, &out_dir);
|
|
38
|
-
link_statically(&pdfium_dir, &target);
|
|
39
|
-
}
|
|
40
|
-
PdfiumLinkStrategy::Bundled => {
|
|
41
|
-
let pdfium_dir = download_or_use_prebuilt(&target, &out_dir);
|
|
42
|
-
link_bundled(&pdfium_dir, &target, &out_dir);
|
|
43
|
-
}
|
|
44
|
-
PdfiumLinkStrategy::System => {
|
|
45
|
-
link_system(&target);
|
|
46
|
-
}
|
|
47
|
-
}
|
|
48
|
-
|
|
49
|
-
link_system_frameworks(&target);
|
|
50
|
-
println!("cargo:rerun-if-changed=build.rs");
|
|
51
|
-
}
|
|
52
|
-
|
|
53
|
-
/// Determine which linking strategy to use based on features and target
|
|
54
|
-
fn determine_link_strategy(target: &str) -> PdfiumLinkStrategy {
|
|
55
|
-
if target.contains("wasm") {
|
|
56
|
-
if let Ok(wasm_lib) = env::var("PDFIUM_WASM_LIB") {
|
|
57
|
-
println!("cargo:rustc-link-search=native={}", wasm_lib);
|
|
58
|
-
println!("cargo:rustc-link-lib=static=pdfium");
|
|
59
|
-
return PdfiumLinkStrategy::DownloadStatic;
|
|
60
|
-
}
|
|
61
|
-
println!("cargo:warning=WASM build using bundled PDFium (set PDFIUM_WASM_LIB to link custom WASM PDFium)");
|
|
62
|
-
return PdfiumLinkStrategy::Bundled;
|
|
63
|
-
}
|
|
64
|
-
|
|
65
|
-
let system_pdfium = cfg!(feature = "system-pdfium");
|
|
66
|
-
let bundled_pdfium = cfg!(feature = "bundled-pdfium");
|
|
67
|
-
let static_pdfium = cfg!(feature = "static-pdfium");
|
|
68
|
-
|
|
69
|
-
let enabled_count = usize::from(system_pdfium) + usize::from(bundled_pdfium) + usize::from(static_pdfium);
|
|
70
|
-
if enabled_count > 1 {
|
|
71
|
-
println!(
|
|
72
|
-
"cargo:warning=Multiple PDFium linking strategies enabled (static-pdfium={}, bundled-pdfium={}, system-pdfium={}); using bundled-pdfium for this build",
|
|
73
|
-
static_pdfium, bundled_pdfium, system_pdfium
|
|
74
|
-
);
|
|
75
|
-
}
|
|
76
|
-
|
|
77
|
-
if bundled_pdfium {
|
|
78
|
-
return PdfiumLinkStrategy::Bundled;
|
|
79
|
-
}
|
|
80
|
-
if system_pdfium {
|
|
81
|
-
return PdfiumLinkStrategy::System;
|
|
82
|
-
}
|
|
83
|
-
if static_pdfium {
|
|
84
|
-
return PdfiumLinkStrategy::DownloadStatic;
|
|
85
|
-
}
|
|
15
|
+
let (download_url, lib_name) = get_pdfium_url_and_lib(&target);
|
|
86
16
|
|
|
87
|
-
PdfiumLinkStrategy::Bundled
|
|
88
|
-
}
|
|
89
|
-
|
|
90
|
-
/// Download PDFium or use prebuilt directory
|
|
91
|
-
///
|
|
92
|
-
/// This is the main orchestrator function that:
|
|
93
|
-
/// 1. Checks for `KREUZBERG_PDFIUM_PREBUILT` environment variable
|
|
94
|
-
/// 2. If set and valid, uses prebuilt pdfium directory
|
|
95
|
-
/// 3. If not set, downloads pdfium to out_dir (with caching)
|
|
96
|
-
/// 4. Returns PathBuf to pdfium directory
|
|
97
|
-
///
|
|
98
|
-
/// Reuses all existing helper functions:
|
|
99
|
-
/// - `get_pdfium_url_and_lib()` - determines download URL for target
|
|
100
|
-
/// - `download_and_extract_pdfium()` - downloads with retry logic
|
|
101
|
-
/// - `runtime_library_info()` - platform-specific library names
|
|
102
|
-
/// - `prepare_prebuilt_pdfium()` - handles prebuilt copy
|
|
103
|
-
fn download_or_use_prebuilt(target: &str, out_dir: &Path) -> PathBuf {
|
|
104
|
-
let (download_url, _lib_name) = get_pdfium_url_and_lib(target);
|
|
105
17
|
let pdfium_dir = out_dir.join("pdfium");
|
|
106
18
|
|
|
107
19
|
if let Some(prebuilt) = env::var_os("KREUZBERG_PDFIUM_PREBUILT") {
|
|
@@ -109,10 +21,6 @@ fn download_or_use_prebuilt(target: &str, out_dir: &Path) -> PathBuf {
|
|
|
109
21
|
if prebuilt_path.exists() {
|
|
110
22
|
prepare_prebuilt_pdfium(&prebuilt_path, &pdfium_dir)
|
|
111
23
|
.unwrap_or_else(|err| panic!("Failed to copy Pdfium from {}: {}", prebuilt_path.display(), err));
|
|
112
|
-
if target.contains("windows") {
|
|
113
|
-
ensure_windows_import_library(&pdfium_dir);
|
|
114
|
-
}
|
|
115
|
-
return pdfium_dir;
|
|
116
24
|
} else {
|
|
117
25
|
panic!(
|
|
118
26
|
"Environment variable KREUZBERG_PDFIUM_PREBUILT points to '{}' but the directory does not exist",
|
|
@@ -121,9 +29,8 @@ fn download_or_use_prebuilt(target: &str, out_dir: &Path) -> PathBuf {
|
|
|
121
29
|
}
|
|
122
30
|
}
|
|
123
31
|
|
|
124
|
-
let (runtime_lib_name, runtime_subdir) = runtime_library_info(target);
|
|
125
|
-
let
|
|
126
|
-
|
|
32
|
+
let (runtime_lib_name, runtime_subdir) = runtime_library_info(&target);
|
|
33
|
+
let runtime_lib_path = pdfium_dir.join(runtime_subdir).join(&runtime_lib_name);
|
|
127
34
|
let import_lib_exists = if target.contains("windows") {
|
|
128
35
|
let lib_dir = pdfium_dir.join("lib");
|
|
129
36
|
lib_dir.join("pdfium.lib").exists() || lib_dir.join("pdfium.dll.lib").exists()
|
|
@@ -131,50 +38,57 @@ fn download_or_use_prebuilt(target: &str, out_dir: &Path) -> PathBuf {
|
|
|
131
38
|
true
|
|
132
39
|
};
|
|
133
40
|
|
|
134
|
-
if !
|
|
41
|
+
if !runtime_lib_path.exists() || !import_lib_exists {
|
|
135
42
|
tracing::debug!("Pdfium library not found, downloading for target: {}", target);
|
|
136
43
|
tracing::debug!("Download URL: {}", download_url);
|
|
137
44
|
download_and_extract_pdfium(&download_url, &pdfium_dir);
|
|
138
45
|
} else {
|
|
139
|
-
tracing::debug!("Pdfium library already
|
|
46
|
+
tracing::debug!("Pdfium library already present at {}", runtime_lib_path.display());
|
|
140
47
|
}
|
|
141
48
|
|
|
142
49
|
if target.contains("windows") {
|
|
143
|
-
|
|
144
|
-
|
|
50
|
+
let lib_dir = pdfium_dir.join("lib");
|
|
51
|
+
let dll_lib = lib_dir.join("pdfium.dll.lib");
|
|
52
|
+
let expected_lib = lib_dir.join("pdfium.lib");
|
|
145
53
|
|
|
146
|
-
|
|
147
|
-
}
|
|
54
|
+
if dll_lib.exists() && !expected_lib.exists() {
|
|
55
|
+
tracing::debug!("Renaming cached {} to {}", dll_lib.display(), expected_lib.display());
|
|
56
|
+
fs::rename(&dll_lib, &expected_lib).expect("Failed to rename pdfium.dll.lib to pdfium.lib");
|
|
57
|
+
}
|
|
58
|
+
}
|
|
148
59
|
|
|
149
|
-
fn ensure_windows_import_library(pdfium_dir: &Path) {
|
|
150
60
|
let lib_dir = pdfium_dir.join("lib");
|
|
151
|
-
|
|
152
|
-
|
|
61
|
+
println!("cargo:rustc-link-search=native={}", lib_dir.display());
|
|
62
|
+
println!("cargo:rustc-link-lib=dylib={}", lib_name);
|
|
153
63
|
|
|
154
|
-
if
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
);
|
|
160
|
-
fs::copy(&dll_lib, &expected_lib).unwrap_or_else(|err| {
|
|
161
|
-
panic!(
|
|
162
|
-
"Failed to copy Windows import library from {} to {}: {}",
|
|
163
|
-
dll_lib.display(),
|
|
164
|
-
expected_lib.display(),
|
|
165
|
-
err
|
|
166
|
-
)
|
|
167
|
-
});
|
|
64
|
+
if target.contains("darwin") {
|
|
65
|
+
println!("cargo:rustc-link-arg=-Wl,-rpath,@loader_path");
|
|
66
|
+
println!("cargo:rustc-link-arg=-Wl,-rpath,@loader_path/.");
|
|
67
|
+
} else if target.contains("linux") {
|
|
68
|
+
println!("cargo:rustc-link-arg=-Wl,-rpath,$ORIGIN");
|
|
69
|
+
println!("cargo:rustc-link-arg=-Wl,-rpath,$ORIGIN/.");
|
|
168
70
|
}
|
|
71
|
+
|
|
72
|
+
copy_lib_to_package(&pdfium_dir, &target);
|
|
73
|
+
|
|
74
|
+
if target.contains("darwin") {
|
|
75
|
+
println!("cargo:rustc-link-lib=framework=CoreFoundation");
|
|
76
|
+
println!("cargo:rustc-link-lib=framework=CoreGraphics");
|
|
77
|
+
println!("cargo:rustc-link-lib=framework=CoreText");
|
|
78
|
+
println!("cargo:rustc-link-lib=framework=AppKit");
|
|
79
|
+
println!("cargo:rustc-link-lib=dylib=c++");
|
|
80
|
+
} else if target.contains("linux") {
|
|
81
|
+
println!("cargo:rustc-link-lib=dylib=stdc++");
|
|
82
|
+
println!("cargo:rustc-link-lib=dylib=m");
|
|
83
|
+
} else if target.contains("windows") {
|
|
84
|
+
println!("cargo:rustc-link-lib=dylib=gdi32");
|
|
85
|
+
println!("cargo:rustc-link-lib=dylib=user32");
|
|
86
|
+
println!("cargo:rustc-link-lib=dylib=advapi32");
|
|
87
|
+
}
|
|
88
|
+
|
|
89
|
+
println!("cargo:rerun-if-changed=build.rs");
|
|
169
90
|
}
|
|
170
91
|
|
|
171
|
-
/// Fetch the latest release version from a GitHub repository
|
|
172
|
-
///
|
|
173
|
-
/// Uses curl to query the GitHub API and extract the tag_name from the
|
|
174
|
-
/// latest release JSON response. Uses improved JSON parsing with fallback logic.
|
|
175
|
-
///
|
|
176
|
-
/// For WASM (paulocoutinhox/pdfium-lib), falls back to known stable versions.
|
|
177
|
-
/// For non-WASM (bblanchon/pdfium-binaries), uses a different fallback strategy.
|
|
178
92
|
fn get_latest_version(repo: &str) -> String {
|
|
179
93
|
let api_url = format!("https://api.github.com/repos/{}/releases/latest", repo);
|
|
180
94
|
|
|
@@ -184,61 +98,21 @@ fn get_latest_version(repo: &str) -> String {
|
|
|
184
98
|
&& output.status.success()
|
|
185
99
|
{
|
|
186
100
|
let json = String::from_utf8_lossy(&output.stdout);
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
"cargo:warning=Failed to fetch latest PDFium WASM version from GitHub API, using fallback version 7442b"
|
|
196
|
-
);
|
|
197
|
-
"7442b".to_string()
|
|
198
|
-
} else if repo.contains("bblanchon") {
|
|
199
|
-
eprintln!(
|
|
200
|
-
"cargo:warning=Failed to fetch latest PDFium binaries version from GitHub API, using fallback version 7568"
|
|
201
|
-
);
|
|
202
|
-
"7568".to_string()
|
|
203
|
-
} else {
|
|
204
|
-
eprintln!(
|
|
205
|
-
"cargo:warning=Failed to fetch latest PDFium version from GitHub API (unknown repository: {})",
|
|
206
|
-
repo
|
|
207
|
-
);
|
|
208
|
-
String::new()
|
|
209
|
-
}
|
|
210
|
-
}
|
|
211
|
-
|
|
212
|
-
/// Extract tag_name from GitHub API JSON response
|
|
213
|
-
///
|
|
214
|
-
/// Parses JSON by finding the tag_name field and extracting the value between quotes.
|
|
215
|
-
/// Handles various JSON formatting variations.
|
|
216
|
-
fn extract_tag_from_json(json: &str) -> Option<String> {
|
|
217
|
-
if let Some(start) = json.find("\"tag_name\"") {
|
|
218
|
-
let after_colon = &json[start + "\"tag_name\"".len()..];
|
|
219
|
-
|
|
220
|
-
let after_colon = after_colon.trim_start();
|
|
221
|
-
let after_colon = after_colon.strip_prefix(':')?;
|
|
222
|
-
let after_colon = after_colon.trim_start();
|
|
223
|
-
|
|
224
|
-
if let Some(opening_quote) = after_colon.find('"') {
|
|
225
|
-
let value_start = opening_quote + 1;
|
|
226
|
-
if let Some(closing_quote) = after_colon[value_start..].find('"') {
|
|
227
|
-
let tag = &after_colon[value_start..value_start + closing_quote];
|
|
228
|
-
return Some(tag.split('/').next_back().unwrap_or(tag).to_string());
|
|
101
|
+
if let Some(start) = json.find("\"tag_name\":") {
|
|
102
|
+
let after_colon = &json[start + "\"tag_name\":".len()..];
|
|
103
|
+
if let Some(opening_quote) = after_colon.find('"')
|
|
104
|
+
&& let Some(closing_quote) = after_colon[opening_quote + 1..].find('"')
|
|
105
|
+
{
|
|
106
|
+
let tag_start = opening_quote + 1;
|
|
107
|
+
let tag = &after_colon[tag_start..tag_start + closing_quote];
|
|
108
|
+
return tag.split('/').next_back().unwrap_or(tag).to_string();
|
|
229
109
|
}
|
|
230
110
|
}
|
|
231
111
|
}
|
|
232
112
|
|
|
233
|
-
|
|
113
|
+
"7529".to_string()
|
|
234
114
|
}
|
|
235
115
|
|
|
236
|
-
/// Get the download URL and library name for the target platform
|
|
237
|
-
///
|
|
238
|
-
/// Determines platform/architecture from target triple and constructs
|
|
239
|
-
/// the appropriate GitHub release download URL. Supports:
|
|
240
|
-
/// - WASM: paulocoutinhox/pdfium-lib
|
|
241
|
-
/// - Other platforms: bblanchon/pdfium-binaries
|
|
242
116
|
fn get_pdfium_url_and_lib(target: &str) -> (String, String) {
|
|
243
117
|
if target.contains("wasm") {
|
|
244
118
|
let version = env::var("PDFIUM_WASM_VERSION")
|
|
@@ -247,10 +121,11 @@ fn get_pdfium_url_and_lib(target: &str) -> (String, String) {
|
|
|
247
121
|
.unwrap_or_else(|| get_latest_version("paulocoutinhox/pdfium-lib"));
|
|
248
122
|
tracing::debug!("Using pdfium-lib version: {}", version);
|
|
249
123
|
|
|
124
|
+
let wasm_arch = if target.contains("wasm32") { "wasm32" } else { "wasm64" };
|
|
250
125
|
return (
|
|
251
126
|
format!(
|
|
252
|
-
"https://github.com/paulocoutinhox/pdfium-lib/releases/download/{}/
|
|
253
|
-
version
|
|
127
|
+
"https://github.com/paulocoutinhox/pdfium-lib/releases/download/{}/pdfium-{}.tar.gz",
|
|
128
|
+
version, wasm_arch
|
|
254
129
|
),
|
|
255
130
|
"pdfium".to_string(),
|
|
256
131
|
);
|
|
@@ -295,15 +170,6 @@ fn get_pdfium_url_and_lib(target: &str) -> (String, String) {
|
|
|
295
170
|
(url, "pdfium".to_string())
|
|
296
171
|
}
|
|
297
172
|
|
|
298
|
-
/// Download and extract PDFium archive with retry logic
|
|
299
|
-
///
|
|
300
|
-
/// Features:
|
|
301
|
-
/// - Exponential backoff retry (configurable via env vars)
|
|
302
|
-
/// - File type validation (gzip check)
|
|
303
|
-
/// - Windows-specific import library handling (pdfium.dll.lib -> pdfium.lib)
|
|
304
|
-
/// - Environment variables:
|
|
305
|
-
/// - KREUZBERG_PDFIUM_DOWNLOAD_RETRIES: number of retries (default: 5)
|
|
306
|
-
/// - KREUZBERG_PDFIUM_DOWNLOAD_BACKOFF_SECS: initial backoff in seconds (default: 2)
|
|
307
173
|
fn download_and_extract_pdfium(url: &str, dest_dir: &Path) {
|
|
308
174
|
fs::create_dir_all(dest_dir).expect("Failed to create pdfium directory");
|
|
309
175
|
|
|
@@ -367,20 +233,21 @@ fn download_and_extract_pdfium(url: &str, dest_dir: &Path) {
|
|
|
367
233
|
thread::sleep(Duration::from_secs(delay_secs));
|
|
368
234
|
}
|
|
369
235
|
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
.
|
|
374
|
-
|
|
236
|
+
let file_type = Command::new("file")
|
|
237
|
+
.arg(archive_path.to_str().unwrap())
|
|
238
|
+
.output()
|
|
239
|
+
.expect("Failed to check file type");
|
|
240
|
+
|
|
241
|
+
let file_type_output = String::from_utf8_lossy(&file_type.stdout);
|
|
242
|
+
tracing::debug!("Downloaded file type: {}", file_type_output.trim());
|
|
375
243
|
|
|
376
|
-
if !
|
|
244
|
+
if !file_type_output.to_lowercase().contains("gzip") && !file_type_output.to_lowercase().contains("compressed") {
|
|
377
245
|
fs::remove_file(&archive_path).ok();
|
|
378
246
|
panic!(
|
|
379
247
|
"Downloaded file is not a valid gzip archive. URL may be incorrect or version unavailable: {}",
|
|
380
248
|
url
|
|
381
249
|
);
|
|
382
250
|
}
|
|
383
|
-
tracing::debug!("Downloaded file validated as gzip archive");
|
|
384
251
|
|
|
385
252
|
tracing::debug!("Extracting Pdfium archive...");
|
|
386
253
|
let status = Command::new("tar")
|
|
@@ -414,369 +281,180 @@ fn download_and_extract_pdfium(url: &str, dest_dir: &Path) {
|
|
|
414
281
|
tracing::debug!("Pdfium downloaded and extracted successfully");
|
|
415
282
|
}
|
|
416
283
|
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
284
|
+
fn copy_lib_to_package(pdfium_dir: &Path, target: &str) {
|
|
285
|
+
let (runtime_lib_name, runtime_subdir) = runtime_library_info(target);
|
|
286
|
+
let src_lib = pdfium_dir.join(runtime_subdir).join(&runtime_lib_name);
|
|
287
|
+
|
|
288
|
+
if !src_lib.exists() {
|
|
289
|
+
tracing::debug!("Source library not found: {}", src_lib.display());
|
|
290
|
+
return;
|
|
424
291
|
}
|
|
425
|
-
copy_dir_all(prebuilt_src, dest_dir)
|
|
426
|
-
}
|
|
427
292
|
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
293
|
+
// Fix install_name on macOS to use @rpath
|
|
294
|
+
if target.contains("darwin") {
|
|
295
|
+
fix_macos_install_name(&src_lib, &runtime_lib_name);
|
|
296
|
+
codesign_if_needed(target, &src_lib);
|
|
297
|
+
}
|
|
298
|
+
|
|
299
|
+
let crate_dir = PathBuf::from(env::var("CARGO_MANIFEST_DIR").unwrap());
|
|
300
|
+
let workspace_root = crate_dir.parent().unwrap().parent().unwrap();
|
|
301
|
+
|
|
302
|
+
// Copy to target directory for CLI binary
|
|
303
|
+
if let Ok(profile) = env::var("PROFILE") {
|
|
304
|
+
let target_dir = workspace_root.join("target").join(profile);
|
|
305
|
+
if target_dir.exists() {
|
|
306
|
+
copy_lib_if_needed(
|
|
307
|
+
&src_lib,
|
|
308
|
+
&target_dir.join(&runtime_lib_name),
|
|
309
|
+
"CLI target directory",
|
|
310
|
+
target,
|
|
311
|
+
);
|
|
442
312
|
}
|
|
443
313
|
}
|
|
444
|
-
Ok(())
|
|
445
|
-
}
|
|
446
314
|
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
|
|
455
|
-
if target.contains("wasm") {
|
|
456
|
-
("libpdfium.a".to_string(), "release/lib")
|
|
457
|
-
} else if target.contains("windows") {
|
|
458
|
-
("pdfium.dll".to_string(), "bin")
|
|
459
|
-
} else if target.contains("darwin") {
|
|
460
|
-
("libpdfium.dylib".to_string(), "lib")
|
|
315
|
+
let python_dest_dir = workspace_root.join("packages").join("python").join("kreuzberg");
|
|
316
|
+
if python_dest_dir.exists() {
|
|
317
|
+
copy_lib_if_needed(
|
|
318
|
+
&src_lib,
|
|
319
|
+
&python_dest_dir.join(&runtime_lib_name),
|
|
320
|
+
"Python package",
|
|
321
|
+
target,
|
|
322
|
+
);
|
|
461
323
|
} else {
|
|
462
|
-
("
|
|
463
|
-
}
|
|
464
|
-
}
|
|
465
|
-
|
|
466
|
-
/// Find PDFium library in archive with flexible directory detection
|
|
467
|
-
///
|
|
468
|
-
/// Attempts to locate the library at multiple possible locations:
|
|
469
|
-
/// - {subdir}/{lib_name} (standard location)
|
|
470
|
-
/// - {lib_name} (root of archive)
|
|
471
|
-
/// - bin/{lib_name} (alternative location)
|
|
472
|
-
/// - lib/{lib_name} (explicit lib directory)
|
|
473
|
-
///
|
|
474
|
-
/// This handles variations in archive structure across different platform builds,
|
|
475
|
-
/// particularly macOS ARM64 where the archive structure may differ.
|
|
476
|
-
///
|
|
477
|
-
/// Returns the full path to the library if found, or an error with available files.
|
|
478
|
-
fn find_pdfium_library(pdfium_dir: &Path, lib_name: &str, expected_subdir: &str) -> Result<PathBuf, String> {
|
|
479
|
-
let candidates = [
|
|
480
|
-
pdfium_dir.join(expected_subdir).join(lib_name),
|
|
481
|
-
pdfium_dir.join(lib_name),
|
|
482
|
-
pdfium_dir.join("bin").join(lib_name),
|
|
483
|
-
pdfium_dir.join("lib").join(lib_name),
|
|
484
|
-
];
|
|
485
|
-
|
|
486
|
-
for candidate in &candidates {
|
|
487
|
-
if candidate.exists() {
|
|
488
|
-
tracing::debug!("Found PDFium library at: {}", candidate.display());
|
|
489
|
-
return Ok(candidate.clone());
|
|
490
|
-
}
|
|
324
|
+
tracing::debug!("Python package directory not found, skipping Python library copy");
|
|
491
325
|
}
|
|
492
326
|
|
|
493
|
-
let
|
|
494
|
-
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
|
|
327
|
+
let node_dest_dir = workspace_root.join("crates").join("kreuzberg-node");
|
|
328
|
+
if node_dest_dir.exists() {
|
|
329
|
+
copy_lib_if_needed(
|
|
330
|
+
&src_lib,
|
|
331
|
+
&node_dest_dir.join(&runtime_lib_name),
|
|
332
|
+
"Node.js package",
|
|
333
|
+
target,
|
|
334
|
+
);
|
|
335
|
+
} else {
|
|
336
|
+
tracing::debug!("Node.js package directory not found, skipping Node library copy");
|
|
501
337
|
}
|
|
502
338
|
|
|
503
|
-
|
|
504
|
-
if
|
|
505
|
-
|
|
506
|
-
|
|
507
|
-
|
|
508
|
-
error_msg.push_str(&format!(" {} ({})\n", path.display(), file_type));
|
|
509
|
-
|
|
510
|
-
if path.is_dir()
|
|
511
|
-
&& let Ok(sub_entries) = fs::read_dir(&path)
|
|
512
|
-
{
|
|
513
|
-
for sub_entry in sub_entries.flatten() {
|
|
514
|
-
let sub_path = sub_entry.path();
|
|
515
|
-
let sub_type = if sub_path.is_dir() { "dir" } else { "file" };
|
|
516
|
-
error_msg.push_str(&format!(" {} ({})\n", sub_path.display(), sub_type));
|
|
517
|
-
}
|
|
518
|
-
}
|
|
519
|
-
}
|
|
339
|
+
let ruby_dest_dir = workspace_root.join("packages").join("ruby").join("lib");
|
|
340
|
+
if ruby_dest_dir.exists() {
|
|
341
|
+
copy_lib_if_needed(&src_lib, &ruby_dest_dir.join(&runtime_lib_name), "Ruby package", target);
|
|
342
|
+
} else {
|
|
343
|
+
tracing::debug!("Ruby package directory not found, skipping Ruby library copy");
|
|
520
344
|
}
|
|
521
|
-
|
|
522
|
-
Err(error_msg)
|
|
523
345
|
}
|
|
524
346
|
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
/// Sets up linker to use PDFium as a dynamic library (.dylib/.so/.dll)
|
|
528
|
-
/// with platform-specific rpath configuration for runtime library discovery.
|
|
529
|
-
/// Supports flexible archive structures by adding multiple possible lib directories.
|
|
530
|
-
fn link_dynamically(pdfium_dir: &Path, target: &str) {
|
|
531
|
-
let (runtime_lib_name, runtime_subdir) = runtime_library_info(target);
|
|
347
|
+
fn copy_lib_if_needed(src: &Path, dest: &Path, package_name: &str, target: &str) {
|
|
348
|
+
use std::fs;
|
|
532
349
|
|
|
533
|
-
let
|
|
534
|
-
|
|
535
|
-
|
|
350
|
+
let should_copy = if dest.exists() {
|
|
351
|
+
let src_metadata = fs::metadata(src).ok();
|
|
352
|
+
let dest_metadata = fs::metadata(dest).ok();
|
|
353
|
+
match (src_metadata, dest_metadata) {
|
|
354
|
+
(Some(src), Some(dest)) => src.modified().ok() > dest.modified().ok(),
|
|
355
|
+
_ => true,
|
|
356
|
+
}
|
|
357
|
+
} else {
|
|
358
|
+
true
|
|
536
359
|
};
|
|
537
360
|
|
|
538
|
-
|
|
539
|
-
|
|
540
|
-
|
|
541
|
-
|
|
542
|
-
|
|
543
|
-
|
|
361
|
+
if should_copy {
|
|
362
|
+
match fs::copy(src, dest) {
|
|
363
|
+
Ok(_) => {
|
|
364
|
+
tracing::debug!("Copied {} to {} ({})", src.display(), dest.display(), package_name);
|
|
365
|
+
codesign_if_needed(target, dest);
|
|
366
|
+
}
|
|
367
|
+
Err(e) => tracing::debug!("Failed to copy library to {}: {}", package_name, e),
|
|
368
|
+
}
|
|
544
369
|
}
|
|
370
|
+
}
|
|
545
371
|
|
|
546
|
-
|
|
547
|
-
if
|
|
548
|
-
|
|
372
|
+
fn codesign_if_needed(target: &str, binary: &Path) {
|
|
373
|
+
if !target.contains("apple-darwin") || !binary.exists() {
|
|
374
|
+
return;
|
|
549
375
|
}
|
|
550
376
|
|
|
551
|
-
|
|
552
|
-
|
|
553
|
-
|
|
554
|
-
|
|
555
|
-
|
|
556
|
-
|
|
557
|
-
|
|
558
|
-
|
|
377
|
+
let identity = env::var("KREUZBERG_CODESIGN_IDENTITY").unwrap_or_else(|_| "-".to_string());
|
|
378
|
+
let status = Command::new("codesign")
|
|
379
|
+
.arg("--force")
|
|
380
|
+
.arg("--timestamp=none")
|
|
381
|
+
.arg("--sign")
|
|
382
|
+
.arg(identity)
|
|
383
|
+
.arg(binary)
|
|
384
|
+
.status();
|
|
559
385
|
|
|
560
|
-
|
|
561
|
-
|
|
562
|
-
|
|
563
|
-
/// dependencies required for static linking on Linux.
|
|
564
|
-
/// Supports flexible archive structures by finding library in multiple locations.
|
|
565
|
-
///
|
|
566
|
-
/// Environment Variables:
|
|
567
|
-
/// - `PDFIUM_STATIC_LIB_PATH`: Path to directory containing libpdfium.a (for Docker/musl builds)
|
|
568
|
-
///
|
|
569
|
-
/// Note: bblanchon/pdfium-binaries only provides dynamic libraries.
|
|
570
|
-
/// On macOS, this will fallback to dynamic linking with a warning.
|
|
571
|
-
/// On Linux, you must provide PDFIUM_STATIC_LIB_PATH pointing to a static build.
|
|
572
|
-
fn link_statically(pdfium_dir: &Path, target: &str) {
|
|
573
|
-
let static_lib_name = "libpdfium.a";
|
|
574
|
-
let lib_subdir = if target.contains("wasm") { "release/lib" } else { "lib" };
|
|
575
|
-
|
|
576
|
-
if let Ok(custom_path) = env::var("PDFIUM_STATIC_LIB_PATH") {
|
|
577
|
-
let custom_lib_dir = PathBuf::from(&custom_path);
|
|
578
|
-
|
|
579
|
-
if !custom_lib_dir.exists() {
|
|
580
|
-
panic!(
|
|
581
|
-
"PDFIUM_STATIC_LIB_PATH points to '{}' but the directory does not exist",
|
|
582
|
-
custom_path
|
|
583
|
-
);
|
|
386
|
+
match status {
|
|
387
|
+
Ok(result) if result.success() => {
|
|
388
|
+
tracing::debug!("Codesigned {}", binary.display());
|
|
584
389
|
}
|
|
585
|
-
|
|
586
|
-
|
|
587
|
-
|
|
588
|
-
|
|
589
|
-
|
|
590
|
-
Expected to find: {}",
|
|
591
|
-
custom_path,
|
|
592
|
-
static_lib_name,
|
|
593
|
-
custom_lib.display()
|
|
390
|
+
Ok(result) => {
|
|
391
|
+
tracing::debug!(
|
|
392
|
+
"codesign exited with status {} while signing {}",
|
|
393
|
+
result,
|
|
394
|
+
binary.display()
|
|
594
395
|
);
|
|
595
396
|
}
|
|
596
|
-
|
|
597
|
-
|
|
598
|
-
println!("cargo:rustc-link-search=native={}", custom_lib_dir.display());
|
|
599
|
-
println!("cargo:rustc-link-lib=static=pdfium");
|
|
600
|
-
|
|
601
|
-
if target.contains("linux") {
|
|
602
|
-
println!("cargo:rustc-link-lib=dylib=pthread");
|
|
603
|
-
println!("cargo:rustc-link-lib=dylib=dl");
|
|
604
|
-
} else if target.contains("windows") {
|
|
605
|
-
println!("cargo:rustc-link-lib=dylib=ws2_32");
|
|
606
|
-
println!("cargo:rustc-link-lib=dylib=userenv");
|
|
397
|
+
Err(err) => {
|
|
398
|
+
tracing::debug!("Failed to run codesign for {}: {}", binary.display(), err);
|
|
607
399
|
}
|
|
608
|
-
|
|
609
|
-
return;
|
|
610
400
|
}
|
|
401
|
+
}
|
|
611
402
|
|
|
612
|
-
|
|
613
|
-
|
|
614
|
-
|
|
615
|
-
|
|
616
|
-
|
|
617
|
-
|
|
618
|
-
|
|
619
|
-
eprintln!("cargo:warning=Production Linux builds require PDFIUM_STATIC_LIB_PATH.");
|
|
620
|
-
|
|
621
|
-
link_dynamically(pdfium_dir, target);
|
|
622
|
-
return;
|
|
623
|
-
} else {
|
|
624
|
-
panic!(
|
|
625
|
-
"Static PDFium library (libpdfium.a) not found.\n\n\
|
|
626
|
-
bblanchon/pdfium-binaries only provides dynamic libraries.\n\n\
|
|
627
|
-
For static linking (required for Docker with musl), you must:\n\n\
|
|
628
|
-
1. Build static PDFium or obtain from a source that provides it\n\
|
|
629
|
-
- See: https://github.com/ajrcarey/pdfium-render/issues/53\n\
|
|
630
|
-
- Or use: https://github.com/paulocoutinhox/pdfium-lib (provides static builds)\n\n\
|
|
631
|
-
2. Set environment variable pointing to the directory containing libpdfium.a:\n\
|
|
632
|
-
export PDFIUM_STATIC_LIB_PATH=/path/to/pdfium/lib\n\n\
|
|
633
|
-
3. Or use alternative features:\n\
|
|
634
|
-
- 'pdf' (dynamic linking, requires .so at runtime)\n\
|
|
635
|
-
- 'bundled-pdfium' (embeds dynamic library in binary)\n\
|
|
636
|
-
- 'system-pdfium' (use system-installed pdfium)\n\n\
|
|
637
|
-
Example Dockerfile pattern:\n\
|
|
638
|
-
FROM alpine:latest as pdfium-builder\n\
|
|
639
|
-
# Download/build static libpdfium.a\n\
|
|
640
|
-
\n\
|
|
641
|
-
FROM rust:alpine as builder\n\
|
|
642
|
-
ENV PDFIUM_STATIC_LIB_PATH=/pdfium/lib\n\
|
|
643
|
-
COPY --from=pdfium-builder /path/to/libpdfium.a /pdfium/lib/"
|
|
644
|
-
);
|
|
645
|
-
}
|
|
646
|
-
}
|
|
647
|
-
};
|
|
648
|
-
|
|
649
|
-
println!("cargo:rustc-link-search=native={}", lib_path.display());
|
|
650
|
-
println!("cargo:rustc-link-lib=static=pdfium");
|
|
651
|
-
|
|
652
|
-
let std_lib_dir = pdfium_dir.join("lib");
|
|
653
|
-
if std_lib_dir.exists() && std_lib_dir != lib_path {
|
|
654
|
-
println!("cargo:rustc-link-search=native={}", std_lib_dir.display());
|
|
403
|
+
/// Maps a target triple to the PDFium runtime library file name and the
/// sub-directory it is looked up in.
///
/// Matching is by substring, checked in this order:
/// - contains `"windows"` -> `("pdfium.dll", "bin")`
/// - contains `"darwin"`  -> `("libpdfium.dylib", "lib")`
/// - anything else        -> `("libpdfium.so", "lib")`
fn runtime_library_info(target: &str) -> (String, &'static str) {
    let (file_name, subdir) = match target {
        t if t.contains("windows") => ("pdfium.dll", "bin"),
        t if t.contains("darwin") => ("libpdfium.dylib", "lib"),
        _ => ("libpdfium.so", "lib"),
    };
    (file_name.to_owned(), subdir)
}
|
|
656
412
|
|
|
657
|
-
|
|
658
|
-
if
|
|
659
|
-
|
|
413
|
+
fn prepare_prebuilt_pdfium(prebuilt_src: &Path, dest_dir: &Path) -> io::Result<()> {
|
|
414
|
+
if dest_dir.exists() {
|
|
415
|
+
fs::remove_dir_all(dest_dir)?;
|
|
660
416
|
}
|
|
417
|
+
copy_dir_all(prebuilt_src, dest_dir)
|
|
418
|
+
}
|
|
661
419
|
|
|
662
|
-
|
|
663
|
-
|
|
664
|
-
|
|
665
|
-
|
|
666
|
-
|
|
667
|
-
|
|
420
|
+
/// Recursively copies the contents of directory `src` into `dst`.
///
/// `dst` (and any missing parents) is created before `src` is read. Entries whose
/// file type is a directory are descended into; every other entry is copied with
/// `fs::copy`. Processing stops at the first I/O error, which is returned.
fn copy_dir_all(src: &Path, dst: &Path) -> io::Result<()> {
    fs::create_dir_all(dst)?;

    fs::read_dir(src)?.try_for_each(|item| -> io::Result<()> {
        let item = item?;
        let kind = item.file_type()?;
        let destination = dst.join(item.file_name());

        if kind.is_dir() {
            copy_dir_all(&item.path(), &destination)
        } else {
            // The number of bytes copied is not needed.
            fs::copy(item.path(), &destination).map(drop)
        }
    })
}
|
|
670
434
|
|
|
671
|
-
|
|
672
|
-
|
|
673
|
-
/// Links dynamically but copies library to OUT_DIR for embedding in binary.
|
|
674
|
-
/// Each binary extracts and uses its own copy of the PDFium library.
|
|
675
|
-
/// Supports flexible archive structures by finding library in multiple locations.
|
|
676
|
-
///
|
|
677
|
-
/// For WASM targets, links statically using the bundled static library.
|
|
678
|
-
fn link_bundled(pdfium_dir: &Path, target: &str, out_dir: &Path) {
|
|
679
|
-
let (runtime_lib_name, runtime_subdir) = runtime_library_info(target);
|
|
680
|
-
let src_lib = match find_pdfium_library(pdfium_dir, &runtime_lib_name, runtime_subdir) {
|
|
681
|
-
Ok(path) => path,
|
|
682
|
-
Err(err) => panic!("{}", err),
|
|
683
|
-
};
|
|
684
|
-
let bundled_lib = out_dir.join(&runtime_lib_name);
|
|
435
|
+
fn fix_macos_install_name(lib_path: &Path, lib_name: &str) {
|
|
436
|
+
use std::process::Command;
|
|
685
437
|
|
|
686
|
-
|
|
687
|
-
|
|
438
|
+
// Change install_name from ./libpdfium.dylib to @rpath/libpdfium.dylib
|
|
439
|
+
let new_install_name = format!("@rpath/{}", lib_name);
|
|
688
440
|
|
|
689
|
-
|
|
690
|
-
.to_str()
|
|
691
|
-
.unwrap_or_else(|| panic!("Non-UTF8 path for bundled library: {}", bundled_lib.display()));
|
|
692
|
-
println!("cargo:rustc-env=KREUZBERG_PDFIUM_BUNDLED_PATH={}", bundled_path);
|
|
441
|
+
tracing::debug!("Fixing install_name for {} to {}", lib_path.display(), new_install_name);
|
|
693
442
|
|
|
694
|
-
|
|
695
|
-
|
|
696
|
-
|
|
697
|
-
|
|
698
|
-
|
|
699
|
-
println!("cargo:rustc-link-lib=static=pdfium");
|
|
700
|
-
tracing::debug!("Bundled PDFium static library linked for WASM at: {}", bundled_path);
|
|
701
|
-
} else {
|
|
702
|
-
tracing::debug!("Bundled PDFium library at: {}", bundled_path);
|
|
703
|
-
}
|
|
704
|
-
}
|
|
443
|
+
let status = Command::new("install_name_tool")
|
|
444
|
+
.arg("-id")
|
|
445
|
+
.arg(&new_install_name)
|
|
446
|
+
.arg(lib_path)
|
|
447
|
+
.status();
|
|
705
448
|
|
|
706
|
-
|
|
707
|
-
|
|
708
|
-
|
|
709
|
-
/// environment variables (KREUZBERG_PDFIUM_SYSTEM_PATH, KREUZBERG_PDFIUM_SYSTEM_INCLUDE).
|
|
710
|
-
fn link_system(_target: &str) {
|
|
711
|
-
match pkg_config::Config::new().atleast_version("5.0").probe("pdfium") {
|
|
712
|
-
Ok(library) => {
|
|
713
|
-
tracing::debug!("Found system pdfium via pkg-config");
|
|
714
|
-
for include_path in &library.include_paths {
|
|
715
|
-
println!("cargo:include={}", include_path.display());
|
|
716
|
-
}
|
|
717
|
-
return;
|
|
718
|
-
}
|
|
719
|
-
Err(err) => {
|
|
720
|
-
tracing::debug!("pkg-config probe failed: {}", err);
|
|
449
|
+
match status {
|
|
450
|
+
Ok(s) if s.success() => {
|
|
451
|
+
tracing::debug!("Successfully updated install_name");
|
|
721
452
|
}
|
|
722
|
-
|
|
723
|
-
|
|
724
|
-
let lib_path = env::var("KREUZBERG_PDFIUM_SYSTEM_PATH").ok();
|
|
725
|
-
let include_path = env::var("KREUZBERG_PDFIUM_SYSTEM_INCLUDE").ok();
|
|
726
|
-
|
|
727
|
-
if let Some(lib_dir) = lib_path {
|
|
728
|
-
let lib_dir_path = PathBuf::from(&lib_dir);
|
|
729
|
-
if !lib_dir_path.exists() {
|
|
730
|
-
panic!(
|
|
731
|
-
"KREUZBERG_PDFIUM_SYSTEM_PATH points to '{}' but the directory does not exist",
|
|
732
|
-
lib_dir
|
|
733
|
-
);
|
|
453
|
+
Ok(s) => {
|
|
454
|
+
tracing::debug!("install_name_tool failed with status: {}", s);
|
|
734
455
|
}
|
|
735
|
-
|
|
736
|
-
|
|
737
|
-
println!("cargo:rustc-link-lib=dylib=pdfium");
|
|
738
|
-
|
|
739
|
-
if let Some(inc_dir) = include_path {
|
|
740
|
-
println!("cargo:include={}", inc_dir);
|
|
456
|
+
Err(e) => {
|
|
457
|
+
tracing::debug!("Failed to run install_name_tool: {}", e);
|
|
741
458
|
}
|
|
742
|
-
|
|
743
|
-
tracing::debug!("Using system pdfium from: {}", lib_dir);
|
|
744
|
-
return;
|
|
745
|
-
}
|
|
746
|
-
|
|
747
|
-
panic!(
|
|
748
|
-
"system-pdfium feature enabled but pdfium not found.\n\
|
|
749
|
-
\n\
|
|
750
|
-
Please install pdfium system-wide or provide:\n\
|
|
751
|
-
- KREUZBERG_PDFIUM_SYSTEM_PATH: path to directory containing libpdfium\n\
|
|
752
|
-
- KREUZBERG_PDFIUM_SYSTEM_INCLUDE: path to pdfium headers (optional)\n\
|
|
753
|
-
\n\
|
|
754
|
-
Alternatively, use a different linking strategy:\n\
|
|
755
|
-
- Default (dynamic): cargo build --features pdf\n\
|
|
756
|
-
- Static linking: cargo build --features pdf,static-pdfium\n\
|
|
757
|
-
- Bundled: cargo build --features pdf,bundled-pdfium"
|
|
758
|
-
);
|
|
759
|
-
}
|
|
760
|
-
|
|
761
|
-
/// Link system frameworks and standard libraries
|
|
762
|
-
///
|
|
763
|
-
/// Adds platform-specific system libraries required for PDFium linking:
|
|
764
|
-
/// - macOS: CoreFoundation, CoreGraphics, CoreText, AppKit, libc++
|
|
765
|
-
/// - Linux: stdc++, libm
|
|
766
|
-
/// - Windows: gdi32, user32, advapi32
|
|
767
|
-
fn link_system_frameworks(target: &str) {
|
|
768
|
-
if target.contains("darwin") {
|
|
769
|
-
println!("cargo:rustc-link-lib=framework=CoreFoundation");
|
|
770
|
-
println!("cargo:rustc-link-lib=framework=CoreGraphics");
|
|
771
|
-
println!("cargo:rustc-link-lib=framework=CoreText");
|
|
772
|
-
println!("cargo:rustc-link-lib=framework=AppKit");
|
|
773
|
-
println!("cargo:rustc-link-lib=dylib=c++");
|
|
774
|
-
} else if target.contains("linux") {
|
|
775
|
-
println!("cargo:rustc-link-lib=dylib=stdc++");
|
|
776
|
-
println!("cargo:rustc-link-lib=dylib=m");
|
|
777
|
-
} else if target.contains("windows") {
|
|
778
|
-
println!("cargo:rustc-link-lib=dylib=gdi32");
|
|
779
|
-
println!("cargo:rustc-link-lib=dylib=user32");
|
|
780
|
-
println!("cargo:rustc-link-lib=dylib=advapi32");
|
|
781
459
|
}
|
|
782
460
|
}
|