kreuzberg 4.0.0.pre.rc.29 → 4.0.0.rc1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +0 -6
- data/.rubocop.yaml +534 -1
- data/Gemfile +2 -1
- data/Gemfile.lock +28 -116
- data/README.md +269 -629
- data/Rakefile +0 -9
- data/Steepfile +4 -8
- data/examples/async_patterns.rb +58 -1
- data/ext/kreuzberg_rb/extconf.rb +5 -35
- data/ext/kreuzberg_rb/native/Cargo.toml +16 -55
- data/ext/kreuzberg_rb/native/build.rs +14 -12
- data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
- data/ext/kreuzberg_rb/native/include/strings.h +2 -2
- data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
- data/ext/kreuzberg_rb/native/src/lib.rs +34 -897
- data/extconf.rb +6 -38
- data/kreuzberg.gemspec +20 -114
- data/lib/kreuzberg/api_proxy.rb +18 -2
- data/lib/kreuzberg/cache_api.rb +0 -22
- data/lib/kreuzberg/cli.rb +10 -2
- data/lib/kreuzberg/cli_proxy.rb +10 -0
- data/lib/kreuzberg/config.rb +22 -274
- data/lib/kreuzberg/errors.rb +7 -73
- data/lib/kreuzberg/extraction_api.rb +8 -237
- data/lib/kreuzberg/mcp_proxy.rb +11 -2
- data/lib/kreuzberg/ocr_backend_protocol.rb +73 -0
- data/lib/kreuzberg/post_processor_protocol.rb +71 -0
- data/lib/kreuzberg/result.rb +33 -151
- data/lib/kreuzberg/setup_lib_path.rb +2 -22
- data/lib/kreuzberg/validator_protocol.rb +73 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +13 -27
- data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
- data/sig/kreuzberg.rbs +12 -105
- data/spec/binding/cache_spec.rb +22 -22
- data/spec/binding/cli_proxy_spec.rb +4 -2
- data/spec/binding/cli_spec.rb +11 -12
- data/spec/binding/config_spec.rb +0 -74
- data/spec/binding/config_validation_spec.rb +6 -100
- data/spec/binding/error_handling_spec.rb +97 -283
- data/spec/binding/plugins/ocr_backend_spec.rb +8 -8
- data/spec/binding/plugins/postprocessor_spec.rb +11 -11
- data/spec/binding/plugins/validator_spec.rb +13 -12
- data/spec/examples.txt +104 -0
- data/spec/fixtures/config.toml +1 -0
- data/spec/fixtures/config.yaml +1 -0
- data/spec/fixtures/invalid_config.toml +1 -0
- data/spec/smoke/package_spec.rb +3 -2
- data/spec/spec_helper.rb +3 -1
- data/vendor/kreuzberg/Cargo.toml +67 -192
- data/vendor/kreuzberg/README.md +9 -97
- data/vendor/kreuzberg/build.rs +194 -516
- data/vendor/kreuzberg/src/api/handlers.rs +9 -130
- data/vendor/kreuzberg/src/api/mod.rs +3 -18
- data/vendor/kreuzberg/src/api/server.rs +71 -236
- data/vendor/kreuzberg/src/api/types.rs +7 -43
- data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
- data/vendor/kreuzberg/src/cache/mod.rs +3 -27
- data/vendor/kreuzberg/src/chunking/mod.rs +79 -1705
- data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
- data/vendor/kreuzberg/src/core/config.rs +23 -905
- data/vendor/kreuzberg/src/core/extractor.rs +106 -403
- data/vendor/kreuzberg/src/core/io.rs +2 -4
- data/vendor/kreuzberg/src/core/mime.rs +12 -2
- data/vendor/kreuzberg/src/core/mod.rs +3 -22
- data/vendor/kreuzberg/src/core/pipeline.rs +78 -395
- data/vendor/kreuzberg/src/embeddings.rs +21 -169
- data/vendor/kreuzberg/src/error.rs +2 -2
- data/vendor/kreuzberg/src/extraction/archive.rs +31 -36
- data/vendor/kreuzberg/src/extraction/docx.rs +1 -365
- data/vendor/kreuzberg/src/extraction/email.rs +11 -12
- data/vendor/kreuzberg/src/extraction/excel.rs +129 -138
- data/vendor/kreuzberg/src/extraction/html.rs +170 -1447
- data/vendor/kreuzberg/src/extraction/image.rs +14 -138
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +3 -13
- data/vendor/kreuzberg/src/extraction/mod.rs +5 -21
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
- data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
- data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
- data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
- data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
- data/vendor/kreuzberg/src/extraction/pptx.rs +94 -196
- data/vendor/kreuzberg/src/extraction/structured.rs +4 -5
- data/vendor/kreuzberg/src/extraction/table.rs +1 -2
- data/vendor/kreuzberg/src/extraction/text.rs +10 -18
- data/vendor/kreuzberg/src/extractors/archive.rs +0 -22
- data/vendor/kreuzberg/src/extractors/docx.rs +148 -69
- data/vendor/kreuzberg/src/extractors/email.rs +9 -37
- data/vendor/kreuzberg/src/extractors/excel.rs +40 -81
- data/vendor/kreuzberg/src/extractors/html.rs +173 -182
- data/vendor/kreuzberg/src/extractors/image.rs +8 -32
- data/vendor/kreuzberg/src/extractors/mod.rs +10 -171
- data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
- data/vendor/kreuzberg/src/extractors/pdf.rs +64 -329
- data/vendor/kreuzberg/src/extractors/pptx.rs +34 -79
- data/vendor/kreuzberg/src/extractors/structured.rs +0 -16
- data/vendor/kreuzberg/src/extractors/text.rs +7 -30
- data/vendor/kreuzberg/src/extractors/xml.rs +8 -27
- data/vendor/kreuzberg/src/keywords/processor.rs +1 -9
- data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
- data/vendor/kreuzberg/src/language_detection/mod.rs +51 -94
- data/vendor/kreuzberg/src/lib.rs +5 -17
- data/vendor/kreuzberg/src/mcp/mod.rs +1 -4
- data/vendor/kreuzberg/src/mcp/server.rs +21 -145
- data/vendor/kreuzberg/src/ocr/mod.rs +0 -2
- data/vendor/kreuzberg/src/ocr/processor.rs +8 -19
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +0 -2
- data/vendor/kreuzberg/src/pdf/error.rs +1 -93
- data/vendor/kreuzberg/src/pdf/metadata.rs +100 -263
- data/vendor/kreuzberg/src/pdf/mod.rs +2 -33
- data/vendor/kreuzberg/src/pdf/rendering.rs +12 -12
- data/vendor/kreuzberg/src/pdf/table.rs +64 -61
- data/vendor/kreuzberg/src/pdf/text.rs +24 -416
- data/vendor/kreuzberg/src/plugins/extractor.rs +8 -40
- data/vendor/kreuzberg/src/plugins/mod.rs +0 -3
- data/vendor/kreuzberg/src/plugins/ocr.rs +14 -22
- data/vendor/kreuzberg/src/plugins/processor.rs +1 -10
- data/vendor/kreuzberg/src/plugins/registry.rs +0 -15
- data/vendor/kreuzberg/src/plugins/validator.rs +8 -20
- data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
- data/vendor/kreuzberg/src/text/mod.rs +0 -8
- data/vendor/kreuzberg/src/text/quality.rs +15 -28
- data/vendor/kreuzberg/src/text/string_utils.rs +10 -22
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +50 -86
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +16 -37
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +1 -2
- data/vendor/kreuzberg/src/types.rs +67 -907
- data/vendor/kreuzberg/src/utils/mod.rs +0 -14
- data/vendor/kreuzberg/src/utils/quality.rs +3 -12
- data/vendor/kreuzberg/tests/api_tests.rs +0 -506
- data/vendor/kreuzberg/tests/archive_integration.rs +0 -2
- data/vendor/kreuzberg/tests/batch_orchestration.rs +12 -57
- data/vendor/kreuzberg/tests/batch_processing.rs +8 -32
- data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +8 -40
- data/vendor/kreuzberg/tests/config_features.rs +1 -33
- data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -16
- data/vendor/kreuzberg/tests/core_integration.rs +9 -35
- data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
- data/vendor/kreuzberg/tests/email_integration.rs +1 -3
- data/vendor/kreuzberg/tests/error_handling.rs +34 -43
- data/vendor/kreuzberg/tests/format_integration.rs +1 -7
- data/vendor/kreuzberg/tests/helpers/mod.rs +0 -60
- data/vendor/kreuzberg/tests/image_integration.rs +0 -2
- data/vendor/kreuzberg/tests/mime_detection.rs +16 -17
- data/vendor/kreuzberg/tests/ocr_configuration.rs +0 -4
- data/vendor/kreuzberg/tests/ocr_errors.rs +0 -22
- data/vendor/kreuzberg/tests/ocr_quality.rs +0 -2
- data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
- data/vendor/kreuzberg/tests/pdf_integration.rs +0 -2
- data/vendor/kreuzberg/tests/pipeline_integration.rs +2 -36
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +0 -5
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -17
- data/vendor/kreuzberg/tests/plugin_system.rs +0 -6
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -2
- data/vendor/kreuzberg/tests/security_validation.rs +1 -13
- data/vendor/kreuzberg/tests/test_fastembed.rs +23 -45
- metadata +25 -171
- data/.rubocop.yml +0 -543
- data/ext/kreuzberg_rb/native/.cargo/config.toml +0 -23
- data/ext/kreuzberg_rb/native/Cargo.lock +0 -7619
- data/lib/kreuzberg/error_context.rb +0 -136
- data/lib/kreuzberg/types.rb +0 -170
- data/lib/libpdfium.so +0 -0
- data/spec/binding/async_operations_spec.rb +0 -473
- data/spec/binding/batch_operations_spec.rb +0 -595
- data/spec/binding/batch_spec.rb +0 -359
- data/spec/binding/config_result_spec.rb +0 -377
- data/spec/binding/embeddings_spec.rb +0 -816
- data/spec/binding/error_recovery_spec.rb +0 -488
- data/spec/binding/font_config_spec.rb +0 -220
- data/spec/binding/images_spec.rb +0 -738
- data/spec/binding/keywords_extraction_spec.rb +0 -600
- data/spec/binding/metadata_types_spec.rb +0 -1228
- data/spec/binding/pages_extraction_spec.rb +0 -471
- data/spec/binding/tables_spec.rb +0 -641
- data/spec/unit/config/chunking_config_spec.rb +0 -213
- data/spec/unit/config/embedding_config_spec.rb +0 -343
- data/spec/unit/config/extraction_config_spec.rb +0 -438
- data/spec/unit/config/font_config_spec.rb +0 -285
- data/spec/unit/config/hierarchy_config_spec.rb +0 -314
- data/spec/unit/config/image_extraction_config_spec.rb +0 -209
- data/spec/unit/config/image_preprocessing_config_spec.rb +0 -249
- data/spec/unit/config/keyword_config_spec.rb +0 -229
- data/spec/unit/config/language_detection_config_spec.rb +0 -258
- data/spec/unit/config/ocr_config_spec.rb +0 -171
- data/spec/unit/config/page_config_spec.rb +0 -221
- data/spec/unit/config/pdf_config_spec.rb +0 -267
- data/spec/unit/config/postprocessor_config_spec.rb +0 -290
- data/spec/unit/config/tesseract_config_spec.rb +0 -181
- data/spec/unit/config/token_reduction_config_spec.rb +0 -251
- data/test/metadata_types_test.rb +0 -959
- data/vendor/Cargo.toml +0 -61
- data/vendor/kreuzberg/examples/bench_fixes.rs +0 -71
- data/vendor/kreuzberg/examples/test_pdfium_fork.rs +0 -62
- data/vendor/kreuzberg/src/chunking/processor.rs +0 -219
- data/vendor/kreuzberg/src/core/batch_optimizations.rs +0 -385
- data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
- data/vendor/kreuzberg/src/core/formats.rs +0 -235
- data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
- data/vendor/kreuzberg/src/extraction/capacity.rs +0 -263
- data/vendor/kreuzberg/src/extraction/markdown.rs +0 -216
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -284
- data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -470
- data/vendor/kreuzberg/src/extractors/docbook.rs +0 -504
- data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -492
- data/vendor/kreuzberg/src/extractors/jats.rs +0 -1054
- data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -368
- data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
- data/vendor/kreuzberg/src/extractors/markdown.rs +0 -701
- data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
- data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
- data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -529
- data/vendor/kreuzberg/src/extractors/rst.rs +0 -577
- data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
- data/vendor/kreuzberg/src/extractors/security.rs +0 -484
- data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
- data/vendor/kreuzberg/src/extractors/typst.rs +0 -651
- data/vendor/kreuzberg/src/language_detection/processor.rs +0 -218
- data/vendor/kreuzberg/src/ocr/language_registry.rs +0 -520
- data/vendor/kreuzberg/src/panic_context.rs +0 -154
- data/vendor/kreuzberg/src/pdf/bindings.rs +0 -306
- data/vendor/kreuzberg/src/pdf/bundled.rs +0 -408
- data/vendor/kreuzberg/src/pdf/fonts.rs +0 -358
- data/vendor/kreuzberg/src/pdf/hierarchy.rs +0 -903
- data/vendor/kreuzberg/src/text/quality_processor.rs +0 -231
- data/vendor/kreuzberg/src/text/utf8_validation.rs +0 -193
- data/vendor/kreuzberg/src/utils/pool.rs +0 -503
- data/vendor/kreuzberg/src/utils/pool_sizing.rs +0 -364
- data/vendor/kreuzberg/src/utils/string_pool.rs +0 -761
- data/vendor/kreuzberg/tests/api_embed.rs +0 -360
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
- data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +0 -471
- data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +0 -289
- data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +0 -154
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
- data/vendor/kreuzberg/tests/config_integration_test.rs +0 -753
- data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +0 -294
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -500
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
- data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
- data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
- data/vendor/kreuzberg/tests/ocr_language_registry.rs +0 -191
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -674
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
- data/vendor/kreuzberg/tests/page_markers.rs +0 -297
- data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +0 -301
- data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +0 -589
- data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +0 -301
- data/vendor/kreuzberg/tests/pdf_text_merging.rs +0 -475
- data/vendor/kreuzberg/tests/pdfium_linking.rs +0 -340
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -694
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -775
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1260
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -648
- data/vendor/kreuzberg-ffi/Cargo.toml +0 -67
- data/vendor/kreuzberg-ffi/README.md +0 -851
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +0 -227
- data/vendor/kreuzberg-ffi/build.rs +0 -168
- data/vendor/kreuzberg-ffi/cbindgen.toml +0 -37
- data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +0 -12
- data/vendor/kreuzberg-ffi/kreuzberg.h +0 -3012
- data/vendor/kreuzberg-ffi/src/batch_streaming.rs +0 -588
- data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
- data/vendor/kreuzberg-ffi/src/error.rs +0 -901
- data/vendor/kreuzberg-ffi/src/extraction.rs +0 -555
- data/vendor/kreuzberg-ffi/src/helpers.rs +0 -879
- data/vendor/kreuzberg-ffi/src/lib.rs +0 -977
- data/vendor/kreuzberg-ffi/src/memory.rs +0 -493
- data/vendor/kreuzberg-ffi/src/mime.rs +0 -329
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +0 -265
- data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +0 -442
- data/vendor/kreuzberg-ffi/src/plugins/mod.rs +0 -14
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +0 -628
- data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +0 -438
- data/vendor/kreuzberg-ffi/src/plugins/validator.rs +0 -329
- data/vendor/kreuzberg-ffi/src/result.rs +0 -510
- data/vendor/kreuzberg-ffi/src/result_pool.rs +0 -639
- data/vendor/kreuzberg-ffi/src/result_view.rs +0 -773
- data/vendor/kreuzberg-ffi/src/string_intern.rs +0 -568
- data/vendor/kreuzberg-ffi/src/types.rs +0 -363
- data/vendor/kreuzberg-ffi/src/util.rs +0 -210
- data/vendor/kreuzberg-ffi/src/validation.rs +0 -848
- data/vendor/kreuzberg-ffi/tests.disabled/README.md +0 -48
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +0 -299
- data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +0 -346
- data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +0 -232
- data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +0 -470
- data/vendor/kreuzberg-tesseract/.commitlintrc.json +0 -13
- data/vendor/kreuzberg-tesseract/.crate-ignore +0 -2
- data/vendor/kreuzberg-tesseract/Cargo.lock +0 -2933
- data/vendor/kreuzberg-tesseract/Cargo.toml +0 -57
- data/vendor/kreuzberg-tesseract/LICENSE +0 -22
- data/vendor/kreuzberg-tesseract/README.md +0 -399
- data/vendor/kreuzberg-tesseract/build.rs +0 -1127
- data/vendor/kreuzberg-tesseract/patches/README.md +0 -71
- data/vendor/kreuzberg-tesseract/patches/tesseract.diff +0 -199
- data/vendor/kreuzberg-tesseract/src/api.rs +0 -1371
- data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +0 -77
- data/vendor/kreuzberg-tesseract/src/enums.rs +0 -297
- data/vendor/kreuzberg-tesseract/src/error.rs +0 -81
- data/vendor/kreuzberg-tesseract/src/lib.rs +0 -145
- data/vendor/kreuzberg-tesseract/src/monitor.rs +0 -57
- data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +0 -197
- data/vendor/kreuzberg-tesseract/src/page_iterator.rs +0 -253
- data/vendor/kreuzberg-tesseract/src/result_iterator.rs +0 -286
- data/vendor/kreuzberg-tesseract/src/result_renderer.rs +0 -183
- data/vendor/kreuzberg-tesseract/tests/integration_test.rs +0 -211
|
@@ -19,74 +19,13 @@ use crate::plugins::DocumentExtractor;
|
|
|
19
19
|
use crate::types::ExtractionResult;
|
|
20
20
|
#[cfg(feature = "office")]
|
|
21
21
|
use crate::types::LibreOfficeConversionResult;
|
|
22
|
-
use crate::utils::{PoolSizeHint, estimate_pool_size, intern_mime_type};
|
|
23
22
|
use crate::{KreuzbergError, Result};
|
|
24
|
-
#[cfg(feature = "tokio-runtime")]
|
|
25
23
|
use once_cell::sync::Lazy;
|
|
26
24
|
#[cfg(feature = "office")]
|
|
27
25
|
use serde_json::json;
|
|
28
26
|
use std::path::Path;
|
|
29
27
|
use std::sync::Arc;
|
|
30
28
|
|
|
31
|
-
/// Record error information in the current OpenTelemetry span.
|
|
32
|
-
///
|
|
33
|
-
/// This function records error details in the current span when the `otel` feature is enabled.
|
|
34
|
-
/// It marks the span with `otel.status_code=ERROR` and adds error type and message fields.
|
|
35
|
-
///
|
|
36
|
-
/// # Arguments
|
|
37
|
-
///
|
|
38
|
-
/// * `error` - The error to record in the span
|
|
39
|
-
///
|
|
40
|
-
/// # Example
|
|
41
|
-
///
|
|
42
|
-
/// ```rust,ignore
|
|
43
|
-
/// let result = extract_file("doc.pdf", None, &config).await;
|
|
44
|
-
/// #[cfg(feature = "otel")]
|
|
45
|
-
/// if let Err(ref e) = result {
|
|
46
|
-
/// record_error(e);
|
|
47
|
-
/// }
|
|
48
|
-
/// result
|
|
49
|
-
/// ```
|
|
50
|
-
#[cfg(feature = "otel")]
|
|
51
|
-
fn record_error(error: &KreuzbergError) {
|
|
52
|
-
let span = tracing::Span::current();
|
|
53
|
-
span.record("otel.status_code", "ERROR");
|
|
54
|
-
span.record("error.type", format!("{:?}", error));
|
|
55
|
-
span.record("error.message", error.to_string());
|
|
56
|
-
}
|
|
57
|
-
|
|
58
|
-
/// Sanitize a file path to return only the filename.
|
|
59
|
-
///
|
|
60
|
-
/// This function extracts the filename from a path to avoid recording
|
|
61
|
-
/// potentially sensitive full file paths in telemetry data.
|
|
62
|
-
///
|
|
63
|
-
/// # Arguments
|
|
64
|
-
///
|
|
65
|
-
/// * `path` - The path to sanitize
|
|
66
|
-
///
|
|
67
|
-
/// # Returns
|
|
68
|
-
///
|
|
69
|
-
/// The filename as a string, or "unknown" if extraction fails
|
|
70
|
-
///
|
|
71
|
-
/// # Security
|
|
72
|
-
///
|
|
73
|
-
/// This prevents PII (personally identifiable information) from appearing in
|
|
74
|
-
/// traces by only recording filenames instead of full paths.
|
|
75
|
-
///
|
|
76
|
-
/// # Example
|
|
77
|
-
///
|
|
78
|
-
/// ```rust,ignore
|
|
79
|
-
/// let path = Path::new("/home/user/documents/secret.pdf");
|
|
80
|
-
/// assert_eq!(sanitize_path(path), "secret.pdf");
|
|
81
|
-
/// ```
|
|
82
|
-
#[cfg(feature = "otel")]
|
|
83
|
-
fn sanitize_path(path: &Path) -> String {
|
|
84
|
-
path.file_name()
|
|
85
|
-
.and_then(|n| n.to_str())
|
|
86
|
-
.unwrap_or("unknown")
|
|
87
|
-
.to_string()
|
|
88
|
-
}
|
|
89
|
-
|
|
90
29
|
/// Global Tokio runtime for synchronous operations.
|
|
91
30
|
///
|
|
92
31
|
/// This runtime is lazily initialized on first use and shared across all sync wrappers.
|
|
@@ -99,12 +38,6 @@ fn sanitize_path(path: &Path) -> String {
|
|
|
99
38
|
/// 2. If runtime creation fails, the process is already in a critical state
|
|
100
39
|
/// 3. This is a one-time initialization - if it fails, nothing will work
|
|
101
40
|
/// 4. Better to fail fast than return errors from every sync operation
|
|
102
|
-
///
|
|
103
|
-
/// # Availability
|
|
104
|
-
///
|
|
105
|
-
/// This static is only available when the `tokio-runtime` feature is enabled.
|
|
106
|
-
/// For WASM targets, use the truly synchronous extraction functions instead.
|
|
107
|
-
#[cfg(feature = "tokio-runtime")]
|
|
108
41
|
static GLOBAL_RUNTIME: Lazy<tokio::runtime::Runtime> = Lazy::new(|| {
|
|
109
42
|
tokio::runtime::Builder::new_multi_thread()
|
|
110
43
|
.enable_all()
|
|
@@ -129,34 +62,6 @@ fn get_extractor(mime_type: &str) -> Result<Arc<dyn DocumentExtractor>> {
|
|
|
129
62
|
registry_read.get(mime_type)
|
|
130
63
|
}
|
|
131
64
|
|
|
132
|
-
/// Get optimal pool sizing hint for a document.
|
|
133
|
-
///
|
|
134
|
-
/// This function calculates recommended pool sizes based on the document's
|
|
135
|
-
/// file size and MIME type. The hint can be used to create appropriately
|
|
136
|
-
/// sized thread pools for extraction, reducing memory waste from over-allocation.
|
|
137
|
-
///
|
|
138
|
-
/// # Arguments
|
|
139
|
-
///
|
|
140
|
-
/// * `file_size` - The size of the file in bytes
|
|
141
|
-
/// * `mime_type` - The MIME type of the document
|
|
142
|
-
///
|
|
143
|
-
/// # Returns
|
|
144
|
-
///
|
|
145
|
-
/// A `PoolSizeHint` with recommended pool configurations
|
|
146
|
-
///
|
|
147
|
-
/// # Example
|
|
148
|
-
///
|
|
149
|
-
/// ```rust,ignore
|
|
150
|
-
/// use kreuzberg::core::extractor::get_pool_sizing_hint;
|
|
151
|
-
///
|
|
152
|
-
/// let hint = get_pool_sizing_hint(5_000_000, "application/pdf");
|
|
153
|
-
/// println!("Recommended string buffers: {}", hint.string_buffer_count);
|
|
154
|
-
/// ```
|
|
155
|
-
#[inline]
|
|
156
|
-
pub fn get_pool_sizing_hint(file_size: u64, mime_type: &str) -> PoolSizeHint {
|
|
157
|
-
estimate_pool_size(file_size, mime_type)
|
|
158
|
-
}
|
|
159
|
-
|
|
160
65
|
/// Extract content from a file.
|
|
161
66
|
///
|
|
162
67
|
/// This is the main entry point for file-based extraction. It performs the following steps:
|
|
@@ -196,12 +101,6 @@ pub fn get_pool_sizing_hint(file_size: u64, mime_type: &str) -> PoolSizeHint {
|
|
|
196
101
|
/// # Ok(())
|
|
197
102
|
/// # }
|
|
198
103
|
/// ```
|
|
199
|
-
#[cfg_attr(feature = "otel", tracing::instrument(
|
|
200
|
-
skip(config, path),
|
|
201
|
-
fields(
|
|
202
|
-
extraction.filename = tracing::field::Empty,
|
|
203
|
-
)
|
|
204
|
-
))]
|
|
205
104
|
pub async fn extract_file(
|
|
206
105
|
path: impl AsRef<Path>,
|
|
207
106
|
mime_type: Option<&str>,
|
|
@@ -211,119 +110,86 @@ pub async fn extract_file(
|
|
|
211
110
|
|
|
212
111
|
let path = path.as_ref();
|
|
213
112
|
|
|
214
|
-
|
|
215
|
-
{
|
|
216
|
-
let span = tracing::Span::current();
|
|
217
|
-
span.record("extraction.filename", sanitize_path(path));
|
|
218
|
-
}
|
|
219
|
-
|
|
220
|
-
let result = async {
|
|
221
|
-
io::validate_file_exists(path)?;
|
|
113
|
+
io::validate_file_exists(path)?;
|
|
222
114
|
|
|
223
|
-
|
|
115
|
+
let detected_mime = mime::detect_or_validate(Some(path), mime_type)?;
|
|
224
116
|
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
}
|
|
235
|
-
#[cfg(not(feature = "office"))]
|
|
236
|
-
LEGACY_WORD_MIME_TYPE => {
|
|
237
|
-
return Err(KreuzbergError::UnsupportedFormat(
|
|
238
|
-
"Legacy Word conversion requires the `office` feature or LibreOffice support".to_string(),
|
|
239
|
-
));
|
|
240
|
-
}
|
|
241
|
-
#[cfg(feature = "office")]
|
|
242
|
-
LEGACY_POWERPOINT_MIME_TYPE => {
|
|
243
|
-
let original_bytes = tokio::fs::read(path).await?;
|
|
244
|
-
let conversion = convert_ppt_to_pptx(&original_bytes).await?;
|
|
245
|
-
let mut result =
|
|
246
|
-
extract_bytes_with_extractor(&conversion.converted_bytes, &conversion.target_mime, config).await?;
|
|
247
|
-
apply_libreoffice_metadata(&mut result, LEGACY_POWERPOINT_MIME_TYPE, &conversion);
|
|
248
|
-
return Ok(result);
|
|
249
|
-
}
|
|
250
|
-
#[cfg(not(feature = "office"))]
|
|
251
|
-
LEGACY_POWERPOINT_MIME_TYPE => {
|
|
252
|
-
return Err(KreuzbergError::UnsupportedFormat(
|
|
253
|
-
"Legacy PowerPoint conversion requires the `office` feature or LibreOffice support".to_string(),
|
|
254
|
-
));
|
|
255
|
-
}
|
|
256
|
-
_ => {}
|
|
117
|
+
match detected_mime.as_str() {
|
|
118
|
+
#[cfg(feature = "office")]
|
|
119
|
+
LEGACY_WORD_MIME_TYPE => {
|
|
120
|
+
let original_bytes = tokio::fs::read(path).await?;
|
|
121
|
+
let conversion = convert_doc_to_docx(&original_bytes).await?;
|
|
122
|
+
let mut result =
|
|
123
|
+
extract_bytes_with_extractor(&conversion.converted_bytes, &conversion.target_mime, config).await?;
|
|
124
|
+
apply_libreoffice_metadata(&mut result, LEGACY_WORD_MIME_TYPE, &conversion);
|
|
125
|
+
return Ok(result);
|
|
257
126
|
}
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
127
|
+
#[cfg(not(feature = "office"))]
|
|
128
|
+
LEGACY_WORD_MIME_TYPE => {
|
|
129
|
+
return Err(KreuzbergError::UnsupportedFormat(
|
|
130
|
+
"Legacy Word conversion requires the `office` feature or LibreOffice support".to_string(),
|
|
131
|
+
));
|
|
132
|
+
}
|
|
133
|
+
#[cfg(feature = "office")]
|
|
134
|
+
LEGACY_POWERPOINT_MIME_TYPE => {
|
|
135
|
+
let original_bytes = tokio::fs::read(path).await?;
|
|
136
|
+
let conversion = convert_ppt_to_pptx(&original_bytes).await?;
|
|
137
|
+
let mut result =
|
|
138
|
+
extract_bytes_with_extractor(&conversion.converted_bytes, &conversion.target_mime, config).await?;
|
|
139
|
+
apply_libreoffice_metadata(&mut result, LEGACY_POWERPOINT_MIME_TYPE, &conversion);
|
|
140
|
+
return Ok(result);
|
|
141
|
+
}
|
|
142
|
+
#[cfg(not(feature = "office"))]
|
|
143
|
+
LEGACY_POWERPOINT_MIME_TYPE => {
|
|
144
|
+
return Err(KreuzbergError::UnsupportedFormat(
|
|
145
|
+
"Legacy PowerPoint conversion requires the `office` feature or LibreOffice support".to_string(),
|
|
146
|
+
));
|
|
147
|
+
}
|
|
148
|
+
_ => {}
|
|
266
149
|
}
|
|
267
150
|
|
|
268
|
-
|
|
151
|
+
extract_file_with_extractor(path, &detected_mime, config).await
|
|
269
152
|
}
|
|
270
153
|
|
|
271
154
|
/// Extract content from a byte array.
|
|
272
|
-
#[cfg_attr(feature = "otel", tracing::instrument(
|
|
273
|
-
skip(config, content),
|
|
274
|
-
fields(
|
|
275
|
-
extraction.mime_type = mime_type,
|
|
276
|
-
extraction.size_bytes = content.len(),
|
|
277
|
-
)
|
|
278
|
-
))]
|
|
279
155
|
pub async fn extract_bytes(content: &[u8], mime_type: &str, config: &ExtractionConfig) -> Result<ExtractionResult> {
|
|
280
156
|
use crate::core::mime;
|
|
281
157
|
|
|
282
|
-
let
|
|
283
|
-
let validated_mime = mime::validate_mime_type(mime_type)?;
|
|
284
|
-
|
|
285
|
-
match validated_mime.as_str() {
|
|
286
|
-
#[cfg(feature = "office")]
|
|
287
|
-
LEGACY_WORD_MIME_TYPE => {
|
|
288
|
-
let conversion = convert_doc_to_docx(content).await?;
|
|
289
|
-
let mut result =
|
|
290
|
-
extract_bytes_with_extractor(&conversion.converted_bytes, &conversion.target_mime, config).await?;
|
|
291
|
-
apply_libreoffice_metadata(&mut result, LEGACY_WORD_MIME_TYPE, &conversion);
|
|
292
|
-
return Ok(result);
|
|
293
|
-
}
|
|
294
|
-
#[cfg(not(feature = "office"))]
|
|
295
|
-
LEGACY_WORD_MIME_TYPE => {
|
|
296
|
-
return Err(KreuzbergError::UnsupportedFormat(
|
|
297
|
-
"Legacy Word conversion requires the `office` feature or LibreOffice support".to_string(),
|
|
298
|
-
));
|
|
299
|
-
}
|
|
300
|
-
#[cfg(feature = "office")]
|
|
301
|
-
LEGACY_POWERPOINT_MIME_TYPE => {
|
|
302
|
-
let conversion = convert_ppt_to_pptx(content).await?;
|
|
303
|
-
let mut result =
|
|
304
|
-
extract_bytes_with_extractor(&conversion.converted_bytes, &conversion.target_mime, config).await?;
|
|
305
|
-
apply_libreoffice_metadata(&mut result, LEGACY_POWERPOINT_MIME_TYPE, &conversion);
|
|
306
|
-
return Ok(result);
|
|
307
|
-
}
|
|
308
|
-
#[cfg(not(feature = "office"))]
|
|
309
|
-
LEGACY_POWERPOINT_MIME_TYPE => {
|
|
310
|
-
return Err(KreuzbergError::UnsupportedFormat(
|
|
311
|
-
"Legacy PowerPoint conversion requires the `office` feature or LibreOffice support".to_string(),
|
|
312
|
-
));
|
|
313
|
-
}
|
|
314
|
-
_ => {}
|
|
315
|
-
}
|
|
158
|
+
let validated_mime = mime::validate_mime_type(mime_type)?;
|
|
316
159
|
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
160
|
+
match validated_mime.as_str() {
|
|
161
|
+
#[cfg(feature = "office")]
|
|
162
|
+
LEGACY_WORD_MIME_TYPE => {
|
|
163
|
+
let conversion = convert_doc_to_docx(content).await?;
|
|
164
|
+
let mut result =
|
|
165
|
+
extract_bytes_with_extractor(&conversion.converted_bytes, &conversion.target_mime, config).await?;
|
|
166
|
+
apply_libreoffice_metadata(&mut result, LEGACY_WORD_MIME_TYPE, &conversion);
|
|
167
|
+
return Ok(result);
|
|
168
|
+
}
|
|
169
|
+
#[cfg(not(feature = "office"))]
|
|
170
|
+
LEGACY_WORD_MIME_TYPE => {
|
|
171
|
+
return Err(KreuzbergError::UnsupportedFormat(
|
|
172
|
+
"Legacy Word conversion requires the `office` feature or LibreOffice support".to_string(),
|
|
173
|
+
));
|
|
174
|
+
}
|
|
175
|
+
#[cfg(feature = "office")]
|
|
176
|
+
LEGACY_POWERPOINT_MIME_TYPE => {
|
|
177
|
+
let conversion = convert_ppt_to_pptx(content).await?;
|
|
178
|
+
let mut result =
|
|
179
|
+
extract_bytes_with_extractor(&conversion.converted_bytes, &conversion.target_mime, config).await?;
|
|
180
|
+
apply_libreoffice_metadata(&mut result, LEGACY_POWERPOINT_MIME_TYPE, &conversion);
|
|
181
|
+
return Ok(result);
|
|
182
|
+
}
|
|
183
|
+
#[cfg(not(feature = "office"))]
|
|
184
|
+
LEGACY_POWERPOINT_MIME_TYPE => {
|
|
185
|
+
return Err(KreuzbergError::UnsupportedFormat(
|
|
186
|
+
"Legacy PowerPoint conversion requires the `office` feature or LibreOffice support".to_string(),
|
|
187
|
+
));
|
|
188
|
+
}
|
|
189
|
+
_ => {}
|
|
324
190
|
}
|
|
325
191
|
|
|
326
|
-
|
|
192
|
+
extract_bytes_with_extractor(content, &validated_mime, config).await
|
|
327
193
|
}
|
|
328
194
|
|
|
329
195
|
/// Extract content from multiple files concurrently.
|
|
@@ -346,13 +212,6 @@ pub async fn extract_bytes(content: &[u8], mime_type: &str, config: &ExtractionC
|
|
|
346
212
|
///
|
|
347
213
|
/// Individual file errors are captured in the result metadata. System errors
|
|
348
214
|
/// (IO, RuntimeError equivalents) will bubble up and fail the entire batch.
|
|
349
|
-
#[cfg(feature = "tokio-runtime")]
|
|
350
|
-
#[cfg_attr(feature = "otel", tracing::instrument(
|
|
351
|
-
skip(config, paths),
|
|
352
|
-
fields(
|
|
353
|
-
extraction.batch_size = paths.len(),
|
|
354
|
-
)
|
|
355
|
-
))]
|
|
356
215
|
pub async fn batch_extract_file(
|
|
357
216
|
paths: Vec<impl AsRef<Path>>,
|
|
358
217
|
config: &ExtractionConfig,
|
|
@@ -367,9 +226,7 @@ pub async fn batch_extract_file(
|
|
|
367
226
|
|
|
368
227
|
let config = Arc::new(config.clone());
|
|
369
228
|
|
|
370
|
-
let max_concurrent = config
|
|
371
|
-
.max_concurrent_extractions
|
|
372
|
-
.unwrap_or_else(|| (num_cpus::get() as f64 * 1.5).ceil() as usize);
|
|
229
|
+
let max_concurrent = config.max_concurrent_extractions.unwrap_or_else(|| num_cpus::get() * 2);
|
|
373
230
|
let semaphore = Arc::new(Semaphore::new(max_concurrent));
|
|
374
231
|
|
|
375
232
|
let mut tasks = JoinSet::new();
|
|
@@ -396,8 +253,11 @@ pub async fn batch_extract_file(
|
|
|
396
253
|
results[index] = Some(result);
|
|
397
254
|
}
|
|
398
255
|
Ok((index, Err(e))) => {
|
|
399
|
-
//
|
|
400
|
-
|
|
256
|
+
// OSError/RuntimeError must bubble up - system errors need user reports ~keep
|
|
257
|
+
if matches!(e, KreuzbergError::Io(_)) {
|
|
258
|
+
return Err(e);
|
|
259
|
+
}
|
|
260
|
+
|
|
401
261
|
use crate::types::{ErrorMetadata, Metadata};
|
|
402
262
|
let metadata = Metadata {
|
|
403
263
|
error: Some(ErrorMetadata {
|
|
@@ -415,7 +275,6 @@ pub async fn batch_extract_file(
|
|
|
415
275
|
detected_languages: None,
|
|
416
276
|
chunks: None,
|
|
417
277
|
images: None,
|
|
418
|
-
pages: None,
|
|
419
278
|
});
|
|
420
279
|
}
|
|
421
280
|
Err(join_err) => {
|
|
@@ -443,15 +302,8 @@ pub async fn batch_extract_file(
|
|
|
443
302
|
/// # Returns
|
|
444
303
|
///
|
|
445
304
|
/// A vector of `ExtractionResult` in the same order as the input.
|
|
446
|
-
#[cfg(feature = "tokio-runtime")]
|
|
447
|
-
#[cfg_attr(feature = "otel", tracing::instrument(
|
|
448
|
-
skip(config, contents),
|
|
449
|
-
fields(
|
|
450
|
-
extraction.batch_size = contents.len(),
|
|
451
|
-
)
|
|
452
|
-
))]
|
|
453
305
|
pub async fn batch_extract_bytes(
|
|
454
|
-
contents: Vec<(
|
|
306
|
+
contents: Vec<(&[u8], &str)>,
|
|
455
307
|
config: &ExtractionConfig,
|
|
456
308
|
) -> Result<Vec<ExtractionResult>> {
|
|
457
309
|
use std::sync::Arc;
|
|
@@ -465,14 +317,17 @@ pub async fn batch_extract_bytes(
|
|
|
465
317
|
let batch_config = config.clone();
|
|
466
318
|
let config = Arc::new(batch_config);
|
|
467
319
|
|
|
468
|
-
let max_concurrent = config
|
|
469
|
-
.max_concurrent_extractions
|
|
470
|
-
.unwrap_or_else(|| (num_cpus::get() as f64 * 1.5).ceil() as usize);
|
|
320
|
+
let max_concurrent = config.max_concurrent_extractions.unwrap_or_else(|| num_cpus::get() * 2);
|
|
471
321
|
let semaphore = Arc::new(Semaphore::new(max_concurrent));
|
|
472
322
|
|
|
323
|
+
let owned_contents: Vec<(Vec<u8>, String)> = contents
|
|
324
|
+
.into_iter()
|
|
325
|
+
.map(|(bytes, mime)| (bytes.to_vec(), mime.to_string()))
|
|
326
|
+
.collect();
|
|
327
|
+
|
|
473
328
|
let mut tasks = JoinSet::new();
|
|
474
329
|
|
|
475
|
-
for (index, (bytes, mime_type)) in
|
|
330
|
+
for (index, (bytes, mime_type)) in owned_contents.into_iter().enumerate() {
|
|
476
331
|
let config_clone = Arc::clone(&config);
|
|
477
332
|
let semaphore_clone = Arc::clone(&semaphore);
|
|
478
333
|
|
|
@@ -494,8 +349,11 @@ pub async fn batch_extract_bytes(
|
|
|
494
349
|
results[index] = Some(result);
|
|
495
350
|
}
|
|
496
351
|
Ok((index, Err(e))) => {
|
|
497
|
-
//
|
|
498
|
-
|
|
352
|
+
// OSError/RuntimeError must bubble up - system errors need user reports ~keep
|
|
353
|
+
if matches!(e, KreuzbergError::Io(_)) {
|
|
354
|
+
return Err(e);
|
|
355
|
+
}
|
|
356
|
+
|
|
499
357
|
use crate::types::{ErrorMetadata, Metadata};
|
|
500
358
|
let metadata = Metadata {
|
|
501
359
|
error: Some(ErrorMetadata {
|
|
@@ -513,7 +371,6 @@ pub async fn batch_extract_bytes(
|
|
|
513
371
|
detected_languages: None,
|
|
514
372
|
chunks: None,
|
|
515
373
|
images: None,
|
|
516
|
-
pages: None,
|
|
517
374
|
});
|
|
518
375
|
}
|
|
519
376
|
Err(join_err) => {
|
|
@@ -533,10 +390,6 @@ pub async fn batch_extract_bytes(
|
|
|
533
390
|
///
|
|
534
391
|
/// Uses the global Tokio runtime for 100x+ performance improvement over creating
|
|
535
392
|
/// a new runtime per call. Always uses the global runtime to avoid nested runtime issues.
|
|
536
|
-
///
|
|
537
|
-
/// This function is only available with the `tokio-runtime` feature. For WASM targets,
|
|
538
|
-
/// use a truly synchronous extraction approach instead.
|
|
539
|
-
#[cfg(feature = "tokio-runtime")]
|
|
540
393
|
pub fn extract_file_sync(
|
|
541
394
|
path: impl AsRef<Path>,
|
|
542
395
|
mime_type: Option<&str>,
|
|
@@ -549,31 +402,14 @@ pub fn extract_file_sync(
|
|
|
549
402
|
///
|
|
550
403
|
/// Uses the global Tokio runtime for 100x+ performance improvement over creating
|
|
551
404
|
/// a new runtime per call.
|
|
552
|
-
///
|
|
553
|
-
/// With the `tokio-runtime` feature, this blocks the current thread using the global
|
|
554
|
-
/// Tokio runtime. Without it (WASM), this calls a truly synchronous implementation.
|
|
555
|
-
#[cfg(feature = "tokio-runtime")]
|
|
556
405
|
pub fn extract_bytes_sync(content: &[u8], mime_type: &str, config: &ExtractionConfig) -> Result<ExtractionResult> {
|
|
557
406
|
GLOBAL_RUNTIME.block_on(extract_bytes(content, mime_type, config))
|
|
558
407
|
}
|
|
559
408
|
|
|
560
|
-
/// Synchronous wrapper for `extract_bytes` (WASM-compatible version).
|
|
561
|
-
///
|
|
562
|
-
/// This is a truly synchronous implementation without tokio runtime dependency.
|
|
563
|
-
/// It calls `extract_bytes_sync_impl()` to perform the extraction.
|
|
564
|
-
#[cfg(not(feature = "tokio-runtime"))]
|
|
565
|
-
pub fn extract_bytes_sync(content: &[u8], mime_type: &str, config: &ExtractionConfig) -> Result<ExtractionResult> {
|
|
566
|
-
extract_bytes_sync_impl(content.to_vec(), Some(mime_type.to_string()), Some(config.clone()))
|
|
567
|
-
}
|
|
568
|
-
|
|
569
409
|
/// Synchronous wrapper for `batch_extract_file`.
|
|
570
410
|
///
|
|
571
411
|
/// Uses the global Tokio runtime for 100x+ performance improvement over creating
|
|
572
412
|
/// a new runtime per call.
|
|
573
|
-
///
|
|
574
|
-
/// This function is only available with the `tokio-runtime` feature. For WASM targets,
|
|
575
|
-
/// use a truly synchronous extraction approach instead.
|
|
576
|
-
#[cfg(feature = "tokio-runtime")]
|
|
577
413
|
pub fn batch_extract_file_sync(
|
|
578
414
|
paths: Vec<impl AsRef<Path>>,
|
|
579
415
|
config: &ExtractionConfig,
|
|
@@ -585,109 +421,13 @@ pub fn batch_extract_file_sync(
|
|
|
585
421
|
///
|
|
586
422
|
/// Uses the global Tokio runtime for 100x+ performance improvement over creating
|
|
587
423
|
/// a new runtime per call.
|
|
588
|
-
///
|
|
589
|
-
/// With the `tokio-runtime` feature, this blocks the current thread using the global
|
|
590
|
-
/// Tokio runtime. Without it (WASM), this calls a truly synchronous implementation
|
|
591
|
-
/// that iterates through items and calls `extract_bytes_sync()`.
|
|
592
|
-
#[cfg(feature = "tokio-runtime")]
|
|
593
424
|
pub fn batch_extract_bytes_sync(
|
|
594
|
-
contents: Vec<(
|
|
425
|
+
contents: Vec<(&[u8], &str)>,
|
|
595
426
|
config: &ExtractionConfig,
|
|
596
427
|
) -> Result<Vec<ExtractionResult>> {
|
|
597
428
|
GLOBAL_RUNTIME.block_on(batch_extract_bytes(contents, config))
|
|
598
429
|
}
|
|
599
430
|
|
|
600
|
-
/// Synchronous wrapper for `batch_extract_bytes` (WASM-compatible version).
|
|
601
|
-
///
|
|
602
|
-
/// This is a truly synchronous implementation that iterates through items
|
|
603
|
-
/// and calls `extract_bytes_sync()` for each.
|
|
604
|
-
#[cfg(not(feature = "tokio-runtime"))]
|
|
605
|
-
pub fn batch_extract_bytes_sync(
|
|
606
|
-
contents: Vec<(Vec<u8>, String)>,
|
|
607
|
-
config: &ExtractionConfig,
|
|
608
|
-
) -> Result<Vec<ExtractionResult>> {
|
|
609
|
-
let mut results = Vec::with_capacity(contents.len());
|
|
610
|
-
for (content, mime_type) in contents {
|
|
611
|
-
let result = extract_bytes_sync(&content, &mime_type, config);
|
|
612
|
-
results.push(result.unwrap_or_else(|e| {
|
|
613
|
-
use crate::types::{ErrorMetadata, Metadata};
|
|
614
|
-
ExtractionResult {
|
|
615
|
-
content: format!("Error: {}", e),
|
|
616
|
-
mime_type: pool_mime_type("text/plain"),
|
|
617
|
-
metadata: Metadata {
|
|
618
|
-
error: Some(ErrorMetadata {
|
|
619
|
-
error_type: format!("{:?}", e),
|
|
620
|
-
message: e.to_string(),
|
|
621
|
-
}),
|
|
622
|
-
..Default::default()
|
|
623
|
-
},
|
|
624
|
-
tables: vec![],
|
|
625
|
-
detected_languages: None,
|
|
626
|
-
chunks: None,
|
|
627
|
-
images: None,
|
|
628
|
-
pages: None,
|
|
629
|
-
}
|
|
630
|
-
}));
|
|
631
|
-
}
|
|
632
|
-
Ok(results)
|
|
633
|
-
}
|
|
634
|
-
|
|
635
|
-
/// Synchronous extraction implementation for WASM compatibility.
|
|
636
|
-
///
|
|
637
|
-
/// This function performs extraction without requiring a tokio runtime.
|
|
638
|
-
/// It calls the sync extractor methods directly.
|
|
639
|
-
///
|
|
640
|
-
/// # Arguments
|
|
641
|
-
///
|
|
642
|
-
/// * `content` - The byte content to extract
|
|
643
|
-
/// * `mime_type` - Optional MIME type to validate/use
|
|
644
|
-
/// * `config` - Optional extraction configuration
|
|
645
|
-
///
|
|
646
|
-
/// # Returns
|
|
647
|
-
///
|
|
648
|
-
/// An `ExtractionResult` or a `KreuzbergError`
|
|
649
|
-
///
|
|
650
|
-
/// # Implementation Notes
|
|
651
|
-
///
|
|
652
|
-
/// This is called when the `tokio-runtime` feature is disabled.
|
|
653
|
-
/// It replicates the logic of `extract_bytes` but uses synchronous extractor methods.
|
|
654
|
-
#[cfg(not(feature = "tokio-runtime"))]
|
|
655
|
-
fn extract_bytes_sync_impl(
|
|
656
|
-
content: Vec<u8>,
|
|
657
|
-
mime_type: Option<String>,
|
|
658
|
-
config: Option<ExtractionConfig>,
|
|
659
|
-
) -> Result<ExtractionResult> {
|
|
660
|
-
use crate::core::mime;
|
|
661
|
-
|
|
662
|
-
let config = config.unwrap_or_default();
|
|
663
|
-
|
|
664
|
-
let validated_mime = if let Some(mime) = mime_type {
|
|
665
|
-
mime::validate_mime_type(&mime)?
|
|
666
|
-
} else {
|
|
667
|
-
return Err(KreuzbergError::Validation {
|
|
668
|
-
message: "MIME type is required for synchronous extraction".to_string(),
|
|
669
|
-
source: None,
|
|
670
|
-
});
|
|
671
|
-
};
|
|
672
|
-
|
|
673
|
-
crate::extractors::ensure_initialized()?;
|
|
674
|
-
|
|
675
|
-
let extractor = get_extractor(&validated_mime)?;
|
|
676
|
-
|
|
677
|
-
let sync_extractor = extractor.as_sync_extractor().ok_or_else(|| {
|
|
678
|
-
KreuzbergError::UnsupportedFormat(format!(
|
|
679
|
-
"Extractor for '{}' does not support synchronous extraction",
|
|
680
|
-
validated_mime
|
|
681
|
-
))
|
|
682
|
-
})?;
|
|
683
|
-
|
|
684
|
-
let mut result = sync_extractor.extract_sync(&content, &validated_mime, &config)?;
|
|
685
|
-
|
|
686
|
-
result = crate::core::pipeline::run_pipeline_sync(result, &config)?;
|
|
687
|
-
|
|
688
|
-
Ok(result)
|
|
689
|
-
}
|
|
690
|
-
|
|
691
431
|
async fn extract_file_with_extractor(
|
|
692
432
|
path: &Path,
|
|
693
433
|
mime_type: &str,
|
|
@@ -714,29 +454,13 @@ async fn extract_bytes_with_extractor(
|
|
|
714
454
|
Ok(result)
|
|
715
455
|
}
|
|
716
456
|
|
|
717
|
-
/// Convert a MIME type string to a pooled String for efficient deduplication.
|
|
718
|
-
///
|
|
719
|
-
/// This function uses the string interning pool to reduce memory allocations
|
|
720
|
-
/// for repeatedly used MIME types (e.g., "application/pdf" appears thousands of times
|
|
721
|
-
/// in batch processing). The interned string is converted to an owned String to satisfy
|
|
722
|
-
/// the ExtractionResult::mime_type field type.
|
|
723
|
-
///
|
|
724
|
-
/// # Performance
|
|
725
|
-
///
|
|
726
|
-
/// For pre-interned MIME types (all common types), this is O(1) pointer dereference.
|
|
727
|
-
/// For unknown MIME types, this allocates once per unique type and caches the result.
|
|
728
|
-
#[allow(dead_code)]
|
|
729
|
-
fn pool_mime_type(mime_type: &str) -> String {
|
|
730
|
-
intern_mime_type(mime_type).to_string()
|
|
731
|
-
}
|
|
732
|
-
|
|
733
457
|
#[cfg(feature = "office")]
|
|
734
458
|
fn apply_libreoffice_metadata(
|
|
735
459
|
result: &mut ExtractionResult,
|
|
736
460
|
legacy_mime: &str,
|
|
737
461
|
conversion: &LibreOfficeConversionResult,
|
|
738
462
|
) {
|
|
739
|
-
result.mime_type =
|
|
463
|
+
result.mime_type = legacy_mime.to_string();
|
|
740
464
|
result.metadata.additional.insert(
|
|
741
465
|
"libreoffice_conversion".to_string(),
|
|
742
466
|
json!({
|
|
@@ -756,10 +480,6 @@ mod tests {
|
|
|
756
480
|
use std::io::Write;
|
|
757
481
|
use tempfile::tempdir;
|
|
758
482
|
|
|
759
|
-
fn assert_text_content(actual: &str, expected: &str) {
|
|
760
|
-
assert_eq!(actual.trim_end_matches('\n'), expected);
|
|
761
|
-
}
|
|
762
|
-
|
|
763
483
|
#[tokio::test]
|
|
764
484
|
async fn test_extract_file_basic() {
|
|
765
485
|
let dir = tempdir().unwrap();
|
|
@@ -772,7 +492,7 @@ mod tests {
|
|
|
772
492
|
|
|
773
493
|
assert!(result.is_ok());
|
|
774
494
|
let result = result.unwrap();
|
|
775
|
-
|
|
495
|
+
assert_eq!(result.content, "Hello, world!");
|
|
776
496
|
assert_eq!(result.mime_type, "text/plain");
|
|
777
497
|
}
|
|
778
498
|
|
|
@@ -805,7 +525,7 @@ mod tests {
|
|
|
805
525
|
|
|
806
526
|
assert!(result.is_ok());
|
|
807
527
|
let result = result.unwrap();
|
|
808
|
-
|
|
528
|
+
assert_eq!(result.content, "test content");
|
|
809
529
|
assert_eq!(result.mime_type, "text/plain");
|
|
810
530
|
}
|
|
811
531
|
|
|
@@ -833,8 +553,8 @@ mod tests {
|
|
|
833
553
|
assert!(results.is_ok());
|
|
834
554
|
let results = results.unwrap();
|
|
835
555
|
assert_eq!(results.len(), 2);
|
|
836
|
-
|
|
837
|
-
|
|
556
|
+
assert_eq!(results[0].content, "content 1");
|
|
557
|
+
assert_eq!(results[1].content, "content 2");
|
|
838
558
|
}
|
|
839
559
|
|
|
840
560
|
#[tokio::test]
|
|
@@ -854,17 +574,13 @@ mod tests {
|
|
|
854
574
|
(b"content 1".as_slice(), "text/plain"),
|
|
855
575
|
(b"content 2".as_slice(), "text/plain"),
|
|
856
576
|
];
|
|
857
|
-
let
|
|
858
|
-
.into_iter()
|
|
859
|
-
.map(|(bytes, mime)| (bytes.to_vec(), mime.to_string()))
|
|
860
|
-
.collect();
|
|
861
|
-
let results = batch_extract_bytes(owned_contents, &config).await;
|
|
577
|
+
let results = batch_extract_bytes(contents, &config).await;
|
|
862
578
|
|
|
863
579
|
assert!(results.is_ok());
|
|
864
580
|
let results = results.unwrap();
|
|
865
581
|
assert_eq!(results.len(), 2);
|
|
866
|
-
|
|
867
|
-
|
|
582
|
+
assert_eq!(results[0].content, "content 1");
|
|
583
|
+
assert_eq!(results[1].content, "content 2");
|
|
868
584
|
}
|
|
869
585
|
|
|
870
586
|
#[test]
|
|
@@ -877,8 +593,7 @@ mod tests {
|
|
|
877
593
|
|
|
878
594
|
let result = extract_file_sync(&file_path, None, &config);
|
|
879
595
|
assert!(result.is_ok());
|
|
880
|
-
|
|
881
|
-
assert_text_content(&result.content, "sync test");
|
|
596
|
+
assert_eq!(result.unwrap().content, "sync test");
|
|
882
597
|
|
|
883
598
|
let result = extract_bytes_sync(b"test", "text/plain", &config);
|
|
884
599
|
assert!(result.is_ok());
|
|
@@ -890,14 +605,12 @@ mod tests {
|
|
|
890
605
|
|
|
891
606
|
let result1 = extract_bytes(b"test 1", "text/plain", &config).await;
|
|
892
607
|
assert!(result1.is_ok());
|
|
893
|
-
let result1 = result1.unwrap();
|
|
894
608
|
|
|
895
609
|
let result2 = extract_bytes(b"test 2", "text/plain", &config).await;
|
|
896
610
|
assert!(result2.is_ok());
|
|
897
|
-
let result2 = result2.unwrap();
|
|
898
611
|
|
|
899
|
-
|
|
900
|
-
|
|
612
|
+
assert_eq!(result1.unwrap().content, "test 1");
|
|
613
|
+
assert_eq!(result2.unwrap().content, "test 2");
|
|
901
614
|
|
|
902
615
|
let result3 = extract_bytes(b"# test 3", "text/markdown", &config).await;
|
|
903
616
|
assert!(result3.is_ok());
|
|
@@ -963,8 +676,7 @@ mod tests {
|
|
|
963
676
|
let result = extract_file(&file_path, None, &config).await;
|
|
964
677
|
|
|
965
678
|
assert!(result.is_ok());
|
|
966
|
-
|
|
967
|
-
assert_text_content(&result.content, "content");
|
|
679
|
+
assert_eq!(result.unwrap().content, "content");
|
|
968
680
|
}
|
|
969
681
|
|
|
970
682
|
#[tokio::test]
|
|
@@ -1004,7 +716,7 @@ mod tests {
|
|
|
1004
716
|
assert!(results.is_ok());
|
|
1005
717
|
let results = results.unwrap();
|
|
1006
718
|
assert_eq!(results.len(), 2);
|
|
1007
|
-
|
|
719
|
+
assert_eq!(results[0].content, "valid content");
|
|
1008
720
|
assert!(results[1].metadata.error.is_some());
|
|
1009
721
|
}
|
|
1010
722
|
|
|
@@ -1016,18 +728,14 @@ mod tests {
|
|
|
1016
728
|
(b"invalid".as_slice(), "invalid/mime"),
|
|
1017
729
|
(b"valid 2".as_slice(), "text/plain"),
|
|
1018
730
|
];
|
|
1019
|
-
let
|
|
1020
|
-
.into_iter()
|
|
1021
|
-
.map(|(bytes, mime)| (bytes.to_vec(), mime.to_string()))
|
|
1022
|
-
.collect();
|
|
1023
|
-
let results = batch_extract_bytes(owned_contents, &config).await;
|
|
731
|
+
let results = batch_extract_bytes(contents, &config).await;
|
|
1024
732
|
|
|
1025
733
|
assert!(results.is_ok());
|
|
1026
734
|
let results = results.unwrap();
|
|
1027
735
|
assert_eq!(results.len(), 3);
|
|
1028
|
-
|
|
736
|
+
assert_eq!(results[0].content, "valid 1");
|
|
1029
737
|
assert!(results[1].metadata.error.is_some());
|
|
1030
|
-
|
|
738
|
+
assert_eq!(results[2].content, "valid 2");
|
|
1031
739
|
}
|
|
1032
740
|
|
|
1033
741
|
#[tokio::test]
|
|
@@ -1037,11 +745,7 @@ mod tests {
|
|
|
1037
745
|
(b"test 1".as_slice(), "invalid/mime1"),
|
|
1038
746
|
(b"test 2".as_slice(), "invalid/mime2"),
|
|
1039
747
|
];
|
|
1040
|
-
let
|
|
1041
|
-
.into_iter()
|
|
1042
|
-
.map(|(bytes, mime)| (bytes.to_vec(), mime.to_string()))
|
|
1043
|
-
.collect();
|
|
1044
|
-
let results = batch_extract_bytes(owned_contents, &config).await;
|
|
748
|
+
let results = batch_extract_bytes(contents, &config).await;
|
|
1045
749
|
|
|
1046
750
|
assert!(results.is_ok());
|
|
1047
751
|
let results = results.unwrap();
|
|
@@ -1058,8 +762,7 @@ mod tests {
|
|
|
1058
762
|
|
|
1059
763
|
assert!(result.is_ok());
|
|
1060
764
|
let result = result.unwrap();
|
|
1061
|
-
|
|
1062
|
-
assert_eq!(trimmed_len, 10_000_000);
|
|
765
|
+
assert_eq!(result.content.len(), 10_000_000);
|
|
1063
766
|
}
|
|
1064
767
|
|
|
1065
768
|
#[tokio::test]
|
|
@@ -1084,7 +787,7 @@ mod tests {
|
|
|
1084
787
|
assert_eq!(results.len(), 100);
|
|
1085
788
|
|
|
1086
789
|
for (i, result) in results.iter().enumerate() {
|
|
1087
|
-
|
|
790
|
+
assert_eq!(result.content, format!("content {}", i));
|
|
1088
791
|
}
|
|
1089
792
|
}
|
|
1090
793
|
|
|
@@ -1137,7 +840,7 @@ mod tests {
|
|
|
1137
840
|
#[test]
|
|
1138
841
|
fn test_sync_wrapper_batch_bytes_empty() {
|
|
1139
842
|
let config = ExtractionConfig::default();
|
|
1140
|
-
let contents: Vec<(
|
|
843
|
+
let contents: Vec<(&[u8], &str)> = vec![];
|
|
1141
844
|
let results = batch_extract_bytes_sync(contents, &config);
|
|
1142
845
|
|
|
1143
846
|
assert!(results.is_ok());
|