kreuzberg 4.0.0.pre.rc.29 → 4.0.0.rc1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +0 -6
- data/.rubocop.yaml +534 -1
- data/Gemfile +2 -1
- data/Gemfile.lock +28 -116
- data/README.md +269 -629
- data/Rakefile +0 -9
- data/Steepfile +4 -8
- data/examples/async_patterns.rb +58 -1
- data/ext/kreuzberg_rb/extconf.rb +5 -35
- data/ext/kreuzberg_rb/native/Cargo.toml +16 -55
- data/ext/kreuzberg_rb/native/build.rs +14 -12
- data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
- data/ext/kreuzberg_rb/native/include/strings.h +2 -2
- data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
- data/ext/kreuzberg_rb/native/src/lib.rs +34 -897
- data/extconf.rb +6 -38
- data/kreuzberg.gemspec +20 -114
- data/lib/kreuzberg/api_proxy.rb +18 -2
- data/lib/kreuzberg/cache_api.rb +0 -22
- data/lib/kreuzberg/cli.rb +10 -2
- data/lib/kreuzberg/cli_proxy.rb +10 -0
- data/lib/kreuzberg/config.rb +22 -274
- data/lib/kreuzberg/errors.rb +7 -73
- data/lib/kreuzberg/extraction_api.rb +8 -237
- data/lib/kreuzberg/mcp_proxy.rb +11 -2
- data/lib/kreuzberg/ocr_backend_protocol.rb +73 -0
- data/lib/kreuzberg/post_processor_protocol.rb +71 -0
- data/lib/kreuzberg/result.rb +33 -151
- data/lib/kreuzberg/setup_lib_path.rb +2 -22
- data/lib/kreuzberg/validator_protocol.rb +73 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +13 -27
- data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
- data/sig/kreuzberg.rbs +12 -105
- data/spec/binding/cache_spec.rb +22 -22
- data/spec/binding/cli_proxy_spec.rb +4 -2
- data/spec/binding/cli_spec.rb +11 -12
- data/spec/binding/config_spec.rb +0 -74
- data/spec/binding/config_validation_spec.rb +6 -100
- data/spec/binding/error_handling_spec.rb +97 -283
- data/spec/binding/plugins/ocr_backend_spec.rb +8 -8
- data/spec/binding/plugins/postprocessor_spec.rb +11 -11
- data/spec/binding/plugins/validator_spec.rb +13 -12
- data/spec/examples.txt +104 -0
- data/spec/fixtures/config.toml +1 -0
- data/spec/fixtures/config.yaml +1 -0
- data/spec/fixtures/invalid_config.toml +1 -0
- data/spec/smoke/package_spec.rb +3 -2
- data/spec/spec_helper.rb +3 -1
- data/vendor/kreuzberg/Cargo.toml +67 -192
- data/vendor/kreuzberg/README.md +9 -97
- data/vendor/kreuzberg/build.rs +194 -516
- data/vendor/kreuzberg/src/api/handlers.rs +9 -130
- data/vendor/kreuzberg/src/api/mod.rs +3 -18
- data/vendor/kreuzberg/src/api/server.rs +71 -236
- data/vendor/kreuzberg/src/api/types.rs +7 -43
- data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
- data/vendor/kreuzberg/src/cache/mod.rs +3 -27
- data/vendor/kreuzberg/src/chunking/mod.rs +79 -1705
- data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
- data/vendor/kreuzberg/src/core/config.rs +23 -905
- data/vendor/kreuzberg/src/core/extractor.rs +106 -403
- data/vendor/kreuzberg/src/core/io.rs +2 -4
- data/vendor/kreuzberg/src/core/mime.rs +12 -2
- data/vendor/kreuzberg/src/core/mod.rs +3 -22
- data/vendor/kreuzberg/src/core/pipeline.rs +78 -395
- data/vendor/kreuzberg/src/embeddings.rs +21 -169
- data/vendor/kreuzberg/src/error.rs +2 -2
- data/vendor/kreuzberg/src/extraction/archive.rs +31 -36
- data/vendor/kreuzberg/src/extraction/docx.rs +1 -365
- data/vendor/kreuzberg/src/extraction/email.rs +11 -12
- data/vendor/kreuzberg/src/extraction/excel.rs +129 -138
- data/vendor/kreuzberg/src/extraction/html.rs +170 -1447
- data/vendor/kreuzberg/src/extraction/image.rs +14 -138
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +3 -13
- data/vendor/kreuzberg/src/extraction/mod.rs +5 -21
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
- data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
- data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
- data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
- data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
- data/vendor/kreuzberg/src/extraction/pptx.rs +94 -196
- data/vendor/kreuzberg/src/extraction/structured.rs +4 -5
- data/vendor/kreuzberg/src/extraction/table.rs +1 -2
- data/vendor/kreuzberg/src/extraction/text.rs +10 -18
- data/vendor/kreuzberg/src/extractors/archive.rs +0 -22
- data/vendor/kreuzberg/src/extractors/docx.rs +148 -69
- data/vendor/kreuzberg/src/extractors/email.rs +9 -37
- data/vendor/kreuzberg/src/extractors/excel.rs +40 -81
- data/vendor/kreuzberg/src/extractors/html.rs +173 -182
- data/vendor/kreuzberg/src/extractors/image.rs +8 -32
- data/vendor/kreuzberg/src/extractors/mod.rs +10 -171
- data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
- data/vendor/kreuzberg/src/extractors/pdf.rs +64 -329
- data/vendor/kreuzberg/src/extractors/pptx.rs +34 -79
- data/vendor/kreuzberg/src/extractors/structured.rs +0 -16
- data/vendor/kreuzberg/src/extractors/text.rs +7 -30
- data/vendor/kreuzberg/src/extractors/xml.rs +8 -27
- data/vendor/kreuzberg/src/keywords/processor.rs +1 -9
- data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
- data/vendor/kreuzberg/src/language_detection/mod.rs +51 -94
- data/vendor/kreuzberg/src/lib.rs +5 -17
- data/vendor/kreuzberg/src/mcp/mod.rs +1 -4
- data/vendor/kreuzberg/src/mcp/server.rs +21 -145
- data/vendor/kreuzberg/src/ocr/mod.rs +0 -2
- data/vendor/kreuzberg/src/ocr/processor.rs +8 -19
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +0 -2
- data/vendor/kreuzberg/src/pdf/error.rs +1 -93
- data/vendor/kreuzberg/src/pdf/metadata.rs +100 -263
- data/vendor/kreuzberg/src/pdf/mod.rs +2 -33
- data/vendor/kreuzberg/src/pdf/rendering.rs +12 -12
- data/vendor/kreuzberg/src/pdf/table.rs +64 -61
- data/vendor/kreuzberg/src/pdf/text.rs +24 -416
- data/vendor/kreuzberg/src/plugins/extractor.rs +8 -40
- data/vendor/kreuzberg/src/plugins/mod.rs +0 -3
- data/vendor/kreuzberg/src/plugins/ocr.rs +14 -22
- data/vendor/kreuzberg/src/plugins/processor.rs +1 -10
- data/vendor/kreuzberg/src/plugins/registry.rs +0 -15
- data/vendor/kreuzberg/src/plugins/validator.rs +8 -20
- data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
- data/vendor/kreuzberg/src/text/mod.rs +0 -8
- data/vendor/kreuzberg/src/text/quality.rs +15 -28
- data/vendor/kreuzberg/src/text/string_utils.rs +10 -22
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +50 -86
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +16 -37
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +1 -2
- data/vendor/kreuzberg/src/types.rs +67 -907
- data/vendor/kreuzberg/src/utils/mod.rs +0 -14
- data/vendor/kreuzberg/src/utils/quality.rs +3 -12
- data/vendor/kreuzberg/tests/api_tests.rs +0 -506
- data/vendor/kreuzberg/tests/archive_integration.rs +0 -2
- data/vendor/kreuzberg/tests/batch_orchestration.rs +12 -57
- data/vendor/kreuzberg/tests/batch_processing.rs +8 -32
- data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +8 -40
- data/vendor/kreuzberg/tests/config_features.rs +1 -33
- data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -16
- data/vendor/kreuzberg/tests/core_integration.rs +9 -35
- data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
- data/vendor/kreuzberg/tests/email_integration.rs +1 -3
- data/vendor/kreuzberg/tests/error_handling.rs +34 -43
- data/vendor/kreuzberg/tests/format_integration.rs +1 -7
- data/vendor/kreuzberg/tests/helpers/mod.rs +0 -60
- data/vendor/kreuzberg/tests/image_integration.rs +0 -2
- data/vendor/kreuzberg/tests/mime_detection.rs +16 -17
- data/vendor/kreuzberg/tests/ocr_configuration.rs +0 -4
- data/vendor/kreuzberg/tests/ocr_errors.rs +0 -22
- data/vendor/kreuzberg/tests/ocr_quality.rs +0 -2
- data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
- data/vendor/kreuzberg/tests/pdf_integration.rs +0 -2
- data/vendor/kreuzberg/tests/pipeline_integration.rs +2 -36
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +0 -5
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -17
- data/vendor/kreuzberg/tests/plugin_system.rs +0 -6
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -2
- data/vendor/kreuzberg/tests/security_validation.rs +1 -13
- data/vendor/kreuzberg/tests/test_fastembed.rs +23 -45
- metadata +25 -171
- data/.rubocop.yml +0 -543
- data/ext/kreuzberg_rb/native/.cargo/config.toml +0 -23
- data/ext/kreuzberg_rb/native/Cargo.lock +0 -7619
- data/lib/kreuzberg/error_context.rb +0 -136
- data/lib/kreuzberg/types.rb +0 -170
- data/lib/libpdfium.so +0 -0
- data/spec/binding/async_operations_spec.rb +0 -473
- data/spec/binding/batch_operations_spec.rb +0 -595
- data/spec/binding/batch_spec.rb +0 -359
- data/spec/binding/config_result_spec.rb +0 -377
- data/spec/binding/embeddings_spec.rb +0 -816
- data/spec/binding/error_recovery_spec.rb +0 -488
- data/spec/binding/font_config_spec.rb +0 -220
- data/spec/binding/images_spec.rb +0 -738
- data/spec/binding/keywords_extraction_spec.rb +0 -600
- data/spec/binding/metadata_types_spec.rb +0 -1228
- data/spec/binding/pages_extraction_spec.rb +0 -471
- data/spec/binding/tables_spec.rb +0 -641
- data/spec/unit/config/chunking_config_spec.rb +0 -213
- data/spec/unit/config/embedding_config_spec.rb +0 -343
- data/spec/unit/config/extraction_config_spec.rb +0 -438
- data/spec/unit/config/font_config_spec.rb +0 -285
- data/spec/unit/config/hierarchy_config_spec.rb +0 -314
- data/spec/unit/config/image_extraction_config_spec.rb +0 -209
- data/spec/unit/config/image_preprocessing_config_spec.rb +0 -249
- data/spec/unit/config/keyword_config_spec.rb +0 -229
- data/spec/unit/config/language_detection_config_spec.rb +0 -258
- data/spec/unit/config/ocr_config_spec.rb +0 -171
- data/spec/unit/config/page_config_spec.rb +0 -221
- data/spec/unit/config/pdf_config_spec.rb +0 -267
- data/spec/unit/config/postprocessor_config_spec.rb +0 -290
- data/spec/unit/config/tesseract_config_spec.rb +0 -181
- data/spec/unit/config/token_reduction_config_spec.rb +0 -251
- data/test/metadata_types_test.rb +0 -959
- data/vendor/Cargo.toml +0 -61
- data/vendor/kreuzberg/examples/bench_fixes.rs +0 -71
- data/vendor/kreuzberg/examples/test_pdfium_fork.rs +0 -62
- data/vendor/kreuzberg/src/chunking/processor.rs +0 -219
- data/vendor/kreuzberg/src/core/batch_optimizations.rs +0 -385
- data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
- data/vendor/kreuzberg/src/core/formats.rs +0 -235
- data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
- data/vendor/kreuzberg/src/extraction/capacity.rs +0 -263
- data/vendor/kreuzberg/src/extraction/markdown.rs +0 -216
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -284
- data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -470
- data/vendor/kreuzberg/src/extractors/docbook.rs +0 -504
- data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -492
- data/vendor/kreuzberg/src/extractors/jats.rs +0 -1054
- data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -368
- data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
- data/vendor/kreuzberg/src/extractors/markdown.rs +0 -701
- data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
- data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
- data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -529
- data/vendor/kreuzberg/src/extractors/rst.rs +0 -577
- data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
- data/vendor/kreuzberg/src/extractors/security.rs +0 -484
- data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
- data/vendor/kreuzberg/src/extractors/typst.rs +0 -651
- data/vendor/kreuzberg/src/language_detection/processor.rs +0 -218
- data/vendor/kreuzberg/src/ocr/language_registry.rs +0 -520
- data/vendor/kreuzberg/src/panic_context.rs +0 -154
- data/vendor/kreuzberg/src/pdf/bindings.rs +0 -306
- data/vendor/kreuzberg/src/pdf/bundled.rs +0 -408
- data/vendor/kreuzberg/src/pdf/fonts.rs +0 -358
- data/vendor/kreuzberg/src/pdf/hierarchy.rs +0 -903
- data/vendor/kreuzberg/src/text/quality_processor.rs +0 -231
- data/vendor/kreuzberg/src/text/utf8_validation.rs +0 -193
- data/vendor/kreuzberg/src/utils/pool.rs +0 -503
- data/vendor/kreuzberg/src/utils/pool_sizing.rs +0 -364
- data/vendor/kreuzberg/src/utils/string_pool.rs +0 -761
- data/vendor/kreuzberg/tests/api_embed.rs +0 -360
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
- data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +0 -471
- data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +0 -289
- data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +0 -154
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
- data/vendor/kreuzberg/tests/config_integration_test.rs +0 -753
- data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +0 -294
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -500
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
- data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
- data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
- data/vendor/kreuzberg/tests/ocr_language_registry.rs +0 -191
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -674
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
- data/vendor/kreuzberg/tests/page_markers.rs +0 -297
- data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +0 -301
- data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +0 -589
- data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +0 -301
- data/vendor/kreuzberg/tests/pdf_text_merging.rs +0 -475
- data/vendor/kreuzberg/tests/pdfium_linking.rs +0 -340
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -694
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -775
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1260
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -648
- data/vendor/kreuzberg-ffi/Cargo.toml +0 -67
- data/vendor/kreuzberg-ffi/README.md +0 -851
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +0 -227
- data/vendor/kreuzberg-ffi/build.rs +0 -168
- data/vendor/kreuzberg-ffi/cbindgen.toml +0 -37
- data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +0 -12
- data/vendor/kreuzberg-ffi/kreuzberg.h +0 -3012
- data/vendor/kreuzberg-ffi/src/batch_streaming.rs +0 -588
- data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
- data/vendor/kreuzberg-ffi/src/error.rs +0 -901
- data/vendor/kreuzberg-ffi/src/extraction.rs +0 -555
- data/vendor/kreuzberg-ffi/src/helpers.rs +0 -879
- data/vendor/kreuzberg-ffi/src/lib.rs +0 -977
- data/vendor/kreuzberg-ffi/src/memory.rs +0 -493
- data/vendor/kreuzberg-ffi/src/mime.rs +0 -329
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +0 -265
- data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +0 -442
- data/vendor/kreuzberg-ffi/src/plugins/mod.rs +0 -14
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +0 -628
- data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +0 -438
- data/vendor/kreuzberg-ffi/src/plugins/validator.rs +0 -329
- data/vendor/kreuzberg-ffi/src/result.rs +0 -510
- data/vendor/kreuzberg-ffi/src/result_pool.rs +0 -639
- data/vendor/kreuzberg-ffi/src/result_view.rs +0 -773
- data/vendor/kreuzberg-ffi/src/string_intern.rs +0 -568
- data/vendor/kreuzberg-ffi/src/types.rs +0 -363
- data/vendor/kreuzberg-ffi/src/util.rs +0 -210
- data/vendor/kreuzberg-ffi/src/validation.rs +0 -848
- data/vendor/kreuzberg-ffi/tests.disabled/README.md +0 -48
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +0 -299
- data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +0 -346
- data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +0 -232
- data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +0 -470
- data/vendor/kreuzberg-tesseract/.commitlintrc.json +0 -13
- data/vendor/kreuzberg-tesseract/.crate-ignore +0 -2
- data/vendor/kreuzberg-tesseract/Cargo.lock +0 -2933
- data/vendor/kreuzberg-tesseract/Cargo.toml +0 -57
- data/vendor/kreuzberg-tesseract/LICENSE +0 -22
- data/vendor/kreuzberg-tesseract/README.md +0 -399
- data/vendor/kreuzberg-tesseract/build.rs +0 -1127
- data/vendor/kreuzberg-tesseract/patches/README.md +0 -71
- data/vendor/kreuzberg-tesseract/patches/tesseract.diff +0 -199
- data/vendor/kreuzberg-tesseract/src/api.rs +0 -1371
- data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +0 -77
- data/vendor/kreuzberg-tesseract/src/enums.rs +0 -297
- data/vendor/kreuzberg-tesseract/src/error.rs +0 -81
- data/vendor/kreuzberg-tesseract/src/lib.rs +0 -145
- data/vendor/kreuzberg-tesseract/src/monitor.rs +0 -57
- data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +0 -197
- data/vendor/kreuzberg-tesseract/src/page_iterator.rs +0 -253
- data/vendor/kreuzberg-tesseract/src/result_iterator.rs +0 -286
- data/vendor/kreuzberg-tesseract/src/result_renderer.rs +0 -183
- data/vendor/kreuzberg-tesseract/tests/integration_test.rs +0 -211
|
@@ -4,69 +4,13 @@
|
|
|
4
4
|
//! All extractors implement the `DocumentExtractor` plugin trait.
|
|
5
5
|
|
|
6
6
|
use crate::Result;
|
|
7
|
-
use crate::core::config::ExtractionConfig;
|
|
8
7
|
use crate::plugins::registry::get_document_extractor_registry;
|
|
9
|
-
use crate::types::ExtractionResult;
|
|
10
8
|
use once_cell::sync::Lazy;
|
|
11
9
|
use std::sync::Arc;
|
|
12
10
|
|
|
13
|
-
/// Trait for extractors that can work synchronously (WASM-compatible).
|
|
14
|
-
///
|
|
15
|
-
/// This trait defines the synchronous extraction interface for WASM targets and other
|
|
16
|
-
/// environments where async/tokio runtimes are not available or desirable.
|
|
17
|
-
///
|
|
18
|
-
/// # Implementation
|
|
19
|
-
///
|
|
20
|
-
/// Extractors that need to support WASM should implement this trait in addition to
|
|
21
|
-
/// the async `DocumentExtractor` trait. This allows the same extractor to work in both
|
|
22
|
-
/// environments by delegating to the sync implementation.
|
|
23
|
-
///
|
|
24
|
-
/// # MIME Type Validation
|
|
25
|
-
///
|
|
26
|
-
/// The `mime_type` parameter is guaranteed to be already validated.
|
|
27
|
-
///
|
|
28
|
-
/// # Example
|
|
29
|
-
///
|
|
30
|
-
/// ```rust,ignore
|
|
31
|
-
/// impl SyncExtractor for PlainTextExtractor {
|
|
32
|
-
/// fn extract_sync(&self, content: &[u8], config: &ExtractionConfig) -> Result<ExtractionResult> {
|
|
33
|
-
/// let text = String::from_utf8_lossy(content).to_string();
|
|
34
|
-
/// Ok(ExtractionResult {
|
|
35
|
-
/// content: text,
|
|
36
|
-
/// mime_type: "text/plain".to_string(),
|
|
37
|
-
/// metadata: Metadata::default(),
|
|
38
|
-
/// tables: vec![],
|
|
39
|
-
/// detected_languages: None,
|
|
40
|
-
/// chunks: None,
|
|
41
|
-
/// images: None,
|
|
42
|
-
/// })
|
|
43
|
-
/// }
|
|
44
|
-
/// }
|
|
45
|
-
/// ```
|
|
46
|
-
pub trait SyncExtractor {
|
|
47
|
-
/// Extract content from a byte array synchronously.
|
|
48
|
-
///
|
|
49
|
-
/// This method performs extraction without requiring an async runtime.
|
|
50
|
-
/// It is called by `extract_bytes_sync()` when the `tokio-runtime` feature is disabled.
|
|
51
|
-
///
|
|
52
|
-
/// # Arguments
|
|
53
|
-
///
|
|
54
|
-
/// * `content` - Raw document bytes
|
|
55
|
-
/// * `mime_type` - MIME type of the document (already validated)
|
|
56
|
-
/// * `config` - Extraction configuration
|
|
57
|
-
///
|
|
58
|
-
/// # Returns
|
|
59
|
-
///
|
|
60
|
-
/// An `ExtractionResult` containing the extracted content and metadata.
|
|
61
|
-
fn extract_sync(&self, content: &[u8], mime_type: &str, config: &ExtractionConfig) -> Result<ExtractionResult>;
|
|
62
|
-
}
|
|
63
|
-
|
|
64
11
|
pub mod structured;
|
|
65
12
|
pub mod text;
|
|
66
13
|
|
|
67
|
-
#[cfg(feature = "archives")]
|
|
68
|
-
pub mod security;
|
|
69
|
-
|
|
70
14
|
#[cfg(feature = "ocr")]
|
|
71
15
|
pub mod image;
|
|
72
16
|
|
|
@@ -83,59 +27,20 @@ pub mod excel;
|
|
|
83
27
|
pub mod html;
|
|
84
28
|
|
|
85
29
|
#[cfg(feature = "office")]
|
|
86
|
-
pub mod bibtex;
|
|
87
|
-
|
|
88
|
-
#[cfg(all(feature = "tokio-runtime", feature = "office"))]
|
|
89
30
|
pub mod docx;
|
|
90
31
|
|
|
91
32
|
#[cfg(feature = "office")]
|
|
92
|
-
pub mod epub;
|
|
93
|
-
|
|
94
|
-
#[cfg(feature = "office")]
|
|
95
|
-
pub mod fictionbook;
|
|
96
|
-
|
|
97
|
-
#[cfg(feature = "office")]
|
|
98
|
-
pub mod markdown;
|
|
99
|
-
|
|
100
|
-
#[cfg(feature = "office")]
|
|
101
|
-
pub mod rst;
|
|
102
|
-
|
|
103
|
-
#[cfg(feature = "office")]
|
|
104
|
-
pub mod latex;
|
|
105
|
-
|
|
106
|
-
#[cfg(feature = "office")]
|
|
107
|
-
pub mod jupyter;
|
|
108
|
-
|
|
109
|
-
#[cfg(feature = "office")]
|
|
110
|
-
pub mod orgmode;
|
|
111
|
-
|
|
112
|
-
#[cfg(all(feature = "tokio-runtime", feature = "office"))]
|
|
113
|
-
pub mod odt;
|
|
114
|
-
|
|
115
|
-
#[cfg(feature = "office")]
|
|
116
|
-
pub mod opml;
|
|
117
|
-
|
|
118
|
-
#[cfg(feature = "office")]
|
|
119
|
-
pub mod typst;
|
|
120
|
-
|
|
121
|
-
#[cfg(feature = "xml")]
|
|
122
|
-
pub mod jats;
|
|
33
|
+
pub mod pandoc;
|
|
123
34
|
|
|
124
35
|
#[cfg(feature = "pdf")]
|
|
125
36
|
pub mod pdf;
|
|
126
37
|
|
|
127
|
-
#[cfg(all(feature = "tokio-runtime", feature = "office"))]
|
|
128
|
-
pub mod pptx;
|
|
129
|
-
|
|
130
38
|
#[cfg(feature = "office")]
|
|
131
|
-
pub mod rtf;
|
|
39
|
+
pub mod pptx;
|
|
132
40
|
|
|
133
41
|
#[cfg(feature = "xml")]
|
|
134
42
|
pub mod xml;
|
|
135
43
|
|
|
136
|
-
#[cfg(feature = "xml")]
|
|
137
|
-
pub mod docbook;
|
|
138
|
-
|
|
139
44
|
pub use structured::StructuredExtractor;
|
|
140
45
|
pub use text::{MarkdownExtractor, PlainTextExtractor};
|
|
141
46
|
|
|
@@ -155,59 +60,20 @@ pub use excel::ExcelExtractor;
|
|
|
155
60
|
pub use html::HtmlExtractor;
|
|
156
61
|
|
|
157
62
|
#[cfg(feature = "office")]
|
|
158
|
-
pub use bibtex::BibtexExtractor;
|
|
159
|
-
|
|
160
|
-
#[cfg(all(feature = "tokio-runtime", feature = "office"))]
|
|
161
63
|
pub use docx::DocxExtractor;
|
|
162
64
|
|
|
163
65
|
#[cfg(feature = "office")]
|
|
164
|
-
pub use epub::EpubExtractor;
|
|
165
|
-
|
|
166
|
-
#[cfg(feature = "office")]
|
|
167
|
-
pub use fictionbook::FictionBookExtractor;
|
|
168
|
-
|
|
169
|
-
#[cfg(feature = "office")]
|
|
170
|
-
pub use markdown::MarkdownExtractor as EnhancedMarkdownExtractor;
|
|
171
|
-
|
|
172
|
-
#[cfg(feature = "office")]
|
|
173
|
-
pub use rst::RstExtractor;
|
|
174
|
-
|
|
175
|
-
#[cfg(feature = "office")]
|
|
176
|
-
pub use latex::LatexExtractor;
|
|
177
|
-
|
|
178
|
-
#[cfg(feature = "office")]
|
|
179
|
-
pub use jupyter::JupyterExtractor;
|
|
180
|
-
|
|
181
|
-
#[cfg(feature = "office")]
|
|
182
|
-
pub use orgmode::OrgModeExtractor;
|
|
183
|
-
|
|
184
|
-
#[cfg(all(feature = "tokio-runtime", feature = "office"))]
|
|
185
|
-
pub use odt::OdtExtractor;
|
|
186
|
-
|
|
187
|
-
#[cfg(feature = "xml")]
|
|
188
|
-
pub use jats::JatsExtractor;
|
|
189
|
-
|
|
190
|
-
#[cfg(feature = "office")]
|
|
191
|
-
pub use opml::OpmlExtractor;
|
|
192
|
-
|
|
193
|
-
#[cfg(feature = "office")]
|
|
194
|
-
pub use typst::TypstExtractor;
|
|
66
|
+
pub use pandoc::PandocExtractor;
|
|
195
67
|
|
|
196
68
|
#[cfg(feature = "pdf")]
|
|
197
69
|
pub use pdf::PdfExtractor;
|
|
198
70
|
|
|
199
|
-
#[cfg(all(feature = "tokio-runtime", feature = "office"))]
|
|
200
|
-
pub use pptx::PptxExtractor;
|
|
201
|
-
|
|
202
71
|
#[cfg(feature = "office")]
|
|
203
|
-
pub use rtf::RtfExtractor;
|
|
72
|
+
pub use pptx::PptxExtractor;
|
|
204
73
|
|
|
205
74
|
#[cfg(feature = "xml")]
|
|
206
75
|
pub use xml::XmlExtractor;
|
|
207
76
|
|
|
208
|
-
#[cfg(feature = "xml")]
|
|
209
|
-
pub use docbook::DocbookExtractor;
|
|
210
|
-
|
|
211
77
|
/// Lazy-initialized flag that ensures extractors are registered exactly once.
|
|
212
78
|
///
|
|
213
79
|
/// This static is accessed on first extraction operation to automatically
|
|
@@ -220,6 +86,7 @@ static EXTRACTORS_INITIALIZED: Lazy<Result<()>> = Lazy::new(register_default_ext
|
|
|
220
86
|
/// It's safe to call multiple times - registration only happens once,
|
|
221
87
|
/// unless the registry was cleared, in which case extractors are re-registered.
|
|
222
88
|
pub fn ensure_initialized() -> Result<()> {
|
|
89
|
+
// First, try the lazy initialization
|
|
223
90
|
EXTRACTORS_INITIALIZED
|
|
224
91
|
.as_ref()
|
|
225
92
|
.map(|_| ())
|
|
@@ -228,12 +95,15 @@ pub fn ensure_initialized() -> Result<()> {
|
|
|
228
95
|
plugin_name: "built-in-extractors".to_string(),
|
|
229
96
|
})?;
|
|
230
97
|
|
|
98
|
+
// Check if registry is empty (e.g., after clear_document_extractors)
|
|
99
|
+
// If so, re-register the default extractors
|
|
231
100
|
let registry = get_document_extractor_registry();
|
|
232
101
|
let registry_guard = registry
|
|
233
102
|
.read()
|
|
234
103
|
.map_err(|e| crate::KreuzbergError::Other(format!("Document extractor registry lock poisoned: {}", e)))?;
|
|
235
104
|
|
|
236
105
|
if registry_guard.list().is_empty() {
|
|
106
|
+
// Drop read lock before acquiring write lock
|
|
237
107
|
drop(registry_guard);
|
|
238
108
|
register_default_extractors()?;
|
|
239
109
|
}
|
|
@@ -282,25 +152,10 @@ pub fn register_default_extractors() -> Result<()> {
|
|
|
282
152
|
registry.register(Arc::new(ExcelExtractor::new()))?;
|
|
283
153
|
|
|
284
154
|
#[cfg(feature = "office")]
|
|
285
|
-
{
|
|
286
|
-
registry.register(Arc::new(EnhancedMarkdownExtractor::new()))?;
|
|
287
|
-
registry.register(Arc::new(BibtexExtractor::new()))?;
|
|
288
|
-
registry.register(Arc::new(EpubExtractor::new()))?;
|
|
289
|
-
registry.register(Arc::new(FictionBookExtractor::new()))?;
|
|
290
|
-
registry.register(Arc::new(RtfExtractor::new()))?;
|
|
291
|
-
registry.register(Arc::new(RstExtractor::new()))?;
|
|
292
|
-
registry.register(Arc::new(LatexExtractor::new()))?;
|
|
293
|
-
registry.register(Arc::new(JupyterExtractor::new()))?;
|
|
294
|
-
registry.register(Arc::new(OrgModeExtractor::new()))?;
|
|
295
|
-
registry.register(Arc::new(OpmlExtractor::new()))?;
|
|
296
|
-
registry.register(Arc::new(TypstExtractor::new()))?;
|
|
297
|
-
}
|
|
298
|
-
|
|
299
|
-
#[cfg(all(feature = "tokio-runtime", feature = "office"))]
|
|
300
155
|
{
|
|
301
156
|
registry.register(Arc::new(DocxExtractor::new()))?;
|
|
302
157
|
registry.register(Arc::new(PptxExtractor::new()))?;
|
|
303
|
-
registry.register(Arc::new(OdtExtractor::new()))?;
|
|
158
|
+
registry.register(Arc::new(PandocExtractor::new()))?;
|
|
304
159
|
}
|
|
305
160
|
|
|
306
161
|
#[cfg(feature = "email")]
|
|
@@ -371,27 +226,11 @@ mod tests {
|
|
|
371
226
|
}
|
|
372
227
|
|
|
373
228
|
#[cfg(feature = "office")]
|
|
374
|
-
{
|
|
375
|
-
expected_count += 10;
|
|
376
|
-
assert!(extractor_names.contains(&"markdown-extractor".to_string()));
|
|
377
|
-
assert!(extractor_names.contains(&"bibtex-extractor".to_string()));
|
|
378
|
-
assert!(extractor_names.contains(&"epub-extractor".to_string()));
|
|
379
|
-
assert!(extractor_names.contains(&"fictionbook-extractor".to_string()));
|
|
380
|
-
assert!(extractor_names.contains(&"rtf-extractor".to_string()));
|
|
381
|
-
assert!(extractor_names.contains(&"rst-extractor".to_string()));
|
|
382
|
-
assert!(extractor_names.contains(&"latex-extractor".to_string()));
|
|
383
|
-
assert!(extractor_names.contains(&"jupyter-extractor".to_string()));
|
|
384
|
-
assert!(extractor_names.contains(&"orgmode-extractor".to_string()));
|
|
385
|
-
assert!(extractor_names.contains(&"opml-extractor".to_string()));
|
|
386
|
-
assert!(extractor_names.contains(&"typst-extractor".to_string()));
|
|
387
|
-
}
|
|
388
|
-
|
|
389
|
-
#[cfg(all(feature = "tokio-runtime", feature = "office"))]
|
|
390
229
|
{
|
|
391
230
|
expected_count += 3;
|
|
392
231
|
assert!(extractor_names.contains(&"docx-extractor".to_string()));
|
|
393
232
|
assert!(extractor_names.contains(&"pptx-extractor".to_string()));
|
|
394
|
-
assert!(extractor_names.contains(&"odt-extractor".to_string()));
|
|
233
|
+
assert!(extractor_names.contains(&"pandoc-extractor".to_string()));
|
|
395
234
|
}
|
|
396
235
|
|
|
397
236
|
#[cfg(feature = "email")]
|
|
@@ -0,0 +1,201 @@
|
|
|
1
|
+
//! Pandoc-based extractors for various document formats.
|
|
2
|
+
//!
|
|
3
|
+
//! Supports: DOCX, ODT, EPUB, LaTeX, RST, RTF, and many more formats via Pandoc.
|
|
4
|
+
|
|
5
|
+
use crate::Result;
|
|
6
|
+
use crate::core::config::ExtractionConfig;
|
|
7
|
+
use crate::extraction::pandoc::extract_bytes_from_mime;
|
|
8
|
+
use crate::plugins::{DocumentExtractor, Plugin};
|
|
9
|
+
use crate::types::{ExtractionResult, Metadata};
|
|
10
|
+
use async_trait::async_trait;
|
|
11
|
+
|
|
12
|
+
/// Generic Pandoc extractor for all Pandoc-supported formats.
|
|
13
|
+
///
|
|
14
|
+
/// This extractor handles all document formats supported by Pandoc, including:
|
|
15
|
+
/// - Microsoft Word (DOCX)
|
|
16
|
+
/// - OpenDocument Text (ODT)
|
|
17
|
+
/// - EPUB
|
|
18
|
+
/// - LaTeX
|
|
19
|
+
/// - reStructuredText (RST)
|
|
20
|
+
/// - RTF
|
|
21
|
+
/// - And many more
|
|
22
|
+
pub struct PandocExtractor;
|
|
23
|
+
|
|
24
|
+
impl PandocExtractor {
|
|
25
|
+
/// Create a new Pandoc extractor.
|
|
26
|
+
pub fn new() -> Self {
|
|
27
|
+
Self
|
|
28
|
+
}
|
|
29
|
+
}
|
|
30
|
+
|
|
31
|
+
impl Default for PandocExtractor {
|
|
32
|
+
fn default() -> Self {
|
|
33
|
+
Self::new()
|
|
34
|
+
}
|
|
35
|
+
}
|
|
36
|
+
|
|
37
|
+
impl Plugin for PandocExtractor {
|
|
38
|
+
fn name(&self) -> &str {
|
|
39
|
+
"pandoc-extractor"
|
|
40
|
+
}
|
|
41
|
+
|
|
42
|
+
fn version(&self) -> String {
|
|
43
|
+
env!("CARGO_PKG_VERSION").to_string()
|
|
44
|
+
}
|
|
45
|
+
|
|
46
|
+
fn initialize(&self) -> Result<()> {
|
|
47
|
+
Ok(())
|
|
48
|
+
}
|
|
49
|
+
|
|
50
|
+
fn shutdown(&self) -> Result<()> {
|
|
51
|
+
Ok(())
|
|
52
|
+
}
|
|
53
|
+
|
|
54
|
+
fn description(&self) -> &str {
|
|
55
|
+
"Extracts content from Pandoc-supported formats (DOCX, ODT, EPUB, LaTeX, RST, RTF, etc.)"
|
|
56
|
+
}
|
|
57
|
+
|
|
58
|
+
fn author(&self) -> &str {
|
|
59
|
+
"Kreuzberg Team"
|
|
60
|
+
}
|
|
61
|
+
}
|
|
62
|
+
|
|
63
|
+
#[async_trait]
|
|
64
|
+
impl DocumentExtractor for PandocExtractor {
|
|
65
|
+
async fn extract_bytes(
|
|
66
|
+
&self,
|
|
67
|
+
content: &[u8],
|
|
68
|
+
mime_type: &str,
|
|
69
|
+
_config: &ExtractionConfig,
|
|
70
|
+
) -> Result<ExtractionResult> {
|
|
71
|
+
let pandoc_result = extract_bytes_from_mime(content, mime_type).await?;
|
|
72
|
+
|
|
73
|
+
let mut additional = std::collections::HashMap::new();
|
|
74
|
+
for (key, value) in pandoc_result.metadata {
|
|
75
|
+
additional.insert(key, value);
|
|
76
|
+
}
|
|
77
|
+
|
|
78
|
+
Ok(ExtractionResult {
|
|
79
|
+
content: pandoc_result.content,
|
|
80
|
+
mime_type: mime_type.to_string(),
|
|
81
|
+
metadata: Metadata {
|
|
82
|
+
additional,
|
|
83
|
+
..Default::default()
|
|
84
|
+
},
|
|
85
|
+
tables: vec![],
|
|
86
|
+
detected_languages: None,
|
|
87
|
+
chunks: None,
|
|
88
|
+
images: None,
|
|
89
|
+
})
|
|
90
|
+
}
|
|
91
|
+
|
|
92
|
+
fn supported_mime_types(&self) -> &[&str] {
|
|
93
|
+
&[
|
|
94
|
+
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
|
95
|
+
"application/vnd.oasis.opendocument.text",
|
|
96
|
+
"application/epub+zip",
|
|
97
|
+
"application/x-latex",
|
|
98
|
+
"text/x-tex",
|
|
99
|
+
"text/x-rst",
|
|
100
|
+
"text/prs.fallenstein.rst",
|
|
101
|
+
"application/rtf",
|
|
102
|
+
"text/rtf",
|
|
103
|
+
"application/x-typst",
|
|
104
|
+
"application/x-ipynb+json",
|
|
105
|
+
"application/x-fictionbook+xml",
|
|
106
|
+
"text/x-org",
|
|
107
|
+
"text/x-commonmark",
|
|
108
|
+
"text/x-gfm",
|
|
109
|
+
"text/x-multimarkdown",
|
|
110
|
+
"text/x-markdown-extra",
|
|
111
|
+
"application/docbook+xml",
|
|
112
|
+
"application/x-jats+xml",
|
|
113
|
+
"application/x-opml+xml",
|
|
114
|
+
]
|
|
115
|
+
}
|
|
116
|
+
|
|
117
|
+
fn priority(&self) -> i32 {
|
|
118
|
+
40
|
|
119
|
+
}
|
|
120
|
+
}
|
|
121
|
+
|
|
122
|
+
#[cfg(test)]
|
|
123
|
+
mod tests {
|
|
124
|
+
use super::*;
|
|
125
|
+
use crate::extraction::pandoc::validate_pandoc_version;
|
|
126
|
+
|
|
127
|
+
#[tokio::test]
|
|
128
|
+
async fn test_pandoc_extractor_plugin_interface() {
|
|
129
|
+
let extractor = PandocExtractor::new();
|
|
130
|
+
assert_eq!(extractor.name(), "pandoc-extractor");
|
|
131
|
+
assert_eq!(extractor.version(), env!("CARGO_PKG_VERSION"));
|
|
132
|
+
assert_eq!(extractor.priority(), 40);
|
|
133
|
+
assert!(!extractor.supported_mime_types().is_empty());
|
|
134
|
+
}
|
|
135
|
+
|
|
136
|
+
#[tokio::test]
|
|
137
|
+
async fn test_pandoc_extractor_supports_docx() {
|
|
138
|
+
let extractor = PandocExtractor::new();
|
|
139
|
+
assert!(
|
|
140
|
+
extractor
|
|
141
|
+
.supported_mime_types()
|
|
142
|
+
.contains(&"application/vnd.openxmlformats-officedocument.wordprocessingml.document")
|
|
143
|
+
);
|
|
144
|
+
}
|
|
145
|
+
|
|
146
|
+
#[tokio::test]
|
|
147
|
+
async fn test_pandoc_extractor_supports_odt() {
|
|
148
|
+
let extractor = PandocExtractor::new();
|
|
149
|
+
assert!(
|
|
150
|
+
extractor
|
|
151
|
+
.supported_mime_types()
|
|
152
|
+
.contains(&"application/vnd.oasis.opendocument.text")
|
|
153
|
+
);
|
|
154
|
+
}
|
|
155
|
+
|
|
156
|
+
#[tokio::test]
|
|
157
|
+
async fn test_pandoc_extractor_supports_epub() {
|
|
158
|
+
let extractor = PandocExtractor::new();
|
|
159
|
+
assert!(extractor.supported_mime_types().contains(&"application/epub+zip"));
|
|
160
|
+
}
|
|
161
|
+
|
|
162
|
+
#[tokio::test]
|
|
163
|
+
async fn test_pandoc_extractor_supports_latex() {
|
|
164
|
+
let extractor = PandocExtractor::new();
|
|
165
|
+
assert!(extractor.supported_mime_types().contains(&"application/x-latex"));
|
|
166
|
+
}
|
|
167
|
+
|
|
168
|
+
#[tokio::test]
|
|
169
|
+
async fn test_pandoc_extractor_supports_rst() {
|
|
170
|
+
let extractor = PandocExtractor::new();
|
|
171
|
+
assert!(extractor.supported_mime_types().contains(&"text/x-rst"));
|
|
172
|
+
}
|
|
173
|
+
|
|
174
|
+
#[tokio::test]
|
|
175
|
+
async fn test_pandoc_extractor_markdown() {
|
|
176
|
+
if validate_pandoc_version().await.is_err() {
|
|
177
|
+
return;
|
|
178
|
+
}
|
|
179
|
+
|
|
180
|
+
let extractor = PandocExtractor::new();
|
|
181
|
+
let markdown = b"# Hello World\n\nThis is a test.";
|
|
182
|
+
let config = ExtractionConfig::default();
|
|
183
|
+
|
|
184
|
+
let result = extractor.extract_bytes(markdown, "text/x-rst", &config).await;
|
|
185
|
+
|
|
186
|
+
let _ = result;
|
|
187
|
+
}
|
|
188
|
+
|
|
189
|
+
#[tokio::test]
|
|
190
|
+
async fn test_pandoc_extractor_default() {
|
|
191
|
+
let extractor = PandocExtractor;
|
|
192
|
+
assert_eq!(extractor.name(), "pandoc-extractor");
|
|
193
|
+
}
|
|
194
|
+
|
|
195
|
+
#[tokio::test]
|
|
196
|
+
async fn test_pandoc_extractor_initialize_shutdown() {
|
|
197
|
+
let extractor = PandocExtractor::new();
|
|
198
|
+
assert!(extractor.initialize().is_ok());
|
|
199
|
+
assert!(extractor.shutdown().is_ok());
|
|
200
|
+
}
|
|
201
|
+
}
|