kreuzberg 4.0.0.pre.rc.29 → 4.0.0.rc1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +0 -6
- data/.rubocop.yaml +534 -1
- data/Gemfile +2 -1
- data/Gemfile.lock +28 -116
- data/README.md +269 -629
- data/Rakefile +0 -9
- data/Steepfile +4 -8
- data/examples/async_patterns.rb +58 -1
- data/ext/kreuzberg_rb/extconf.rb +5 -35
- data/ext/kreuzberg_rb/native/Cargo.toml +16 -55
- data/ext/kreuzberg_rb/native/build.rs +14 -12
- data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
- data/ext/kreuzberg_rb/native/include/strings.h +2 -2
- data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
- data/ext/kreuzberg_rb/native/src/lib.rs +34 -897
- data/extconf.rb +6 -38
- data/kreuzberg.gemspec +20 -114
- data/lib/kreuzberg/api_proxy.rb +18 -2
- data/lib/kreuzberg/cache_api.rb +0 -22
- data/lib/kreuzberg/cli.rb +10 -2
- data/lib/kreuzberg/cli_proxy.rb +10 -0
- data/lib/kreuzberg/config.rb +22 -274
- data/lib/kreuzberg/errors.rb +7 -73
- data/lib/kreuzberg/extraction_api.rb +8 -237
- data/lib/kreuzberg/mcp_proxy.rb +11 -2
- data/lib/kreuzberg/ocr_backend_protocol.rb +73 -0
- data/lib/kreuzberg/post_processor_protocol.rb +71 -0
- data/lib/kreuzberg/result.rb +33 -151
- data/lib/kreuzberg/setup_lib_path.rb +2 -22
- data/lib/kreuzberg/validator_protocol.rb +73 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +13 -27
- data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
- data/sig/kreuzberg.rbs +12 -105
- data/spec/binding/cache_spec.rb +22 -22
- data/spec/binding/cli_proxy_spec.rb +4 -2
- data/spec/binding/cli_spec.rb +11 -12
- data/spec/binding/config_spec.rb +0 -74
- data/spec/binding/config_validation_spec.rb +6 -100
- data/spec/binding/error_handling_spec.rb +97 -283
- data/spec/binding/plugins/ocr_backend_spec.rb +8 -8
- data/spec/binding/plugins/postprocessor_spec.rb +11 -11
- data/spec/binding/plugins/validator_spec.rb +13 -12
- data/spec/examples.txt +104 -0
- data/spec/fixtures/config.toml +1 -0
- data/spec/fixtures/config.yaml +1 -0
- data/spec/fixtures/invalid_config.toml +1 -0
- data/spec/smoke/package_spec.rb +3 -2
- data/spec/spec_helper.rb +3 -1
- data/vendor/kreuzberg/Cargo.toml +67 -192
- data/vendor/kreuzberg/README.md +9 -97
- data/vendor/kreuzberg/build.rs +194 -516
- data/vendor/kreuzberg/src/api/handlers.rs +9 -130
- data/vendor/kreuzberg/src/api/mod.rs +3 -18
- data/vendor/kreuzberg/src/api/server.rs +71 -236
- data/vendor/kreuzberg/src/api/types.rs +7 -43
- data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
- data/vendor/kreuzberg/src/cache/mod.rs +3 -27
- data/vendor/kreuzberg/src/chunking/mod.rs +79 -1705
- data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
- data/vendor/kreuzberg/src/core/config.rs +23 -905
- data/vendor/kreuzberg/src/core/extractor.rs +106 -403
- data/vendor/kreuzberg/src/core/io.rs +2 -4
- data/vendor/kreuzberg/src/core/mime.rs +12 -2
- data/vendor/kreuzberg/src/core/mod.rs +3 -22
- data/vendor/kreuzberg/src/core/pipeline.rs +78 -395
- data/vendor/kreuzberg/src/embeddings.rs +21 -169
- data/vendor/kreuzberg/src/error.rs +2 -2
- data/vendor/kreuzberg/src/extraction/archive.rs +31 -36
- data/vendor/kreuzberg/src/extraction/docx.rs +1 -365
- data/vendor/kreuzberg/src/extraction/email.rs +11 -12
- data/vendor/kreuzberg/src/extraction/excel.rs +129 -138
- data/vendor/kreuzberg/src/extraction/html.rs +170 -1447
- data/vendor/kreuzberg/src/extraction/image.rs +14 -138
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +3 -13
- data/vendor/kreuzberg/src/extraction/mod.rs +5 -21
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
- data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
- data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
- data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
- data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
- data/vendor/kreuzberg/src/extraction/pptx.rs +94 -196
- data/vendor/kreuzberg/src/extraction/structured.rs +4 -5
- data/vendor/kreuzberg/src/extraction/table.rs +1 -2
- data/vendor/kreuzberg/src/extraction/text.rs +10 -18
- data/vendor/kreuzberg/src/extractors/archive.rs +0 -22
- data/vendor/kreuzberg/src/extractors/docx.rs +148 -69
- data/vendor/kreuzberg/src/extractors/email.rs +9 -37
- data/vendor/kreuzberg/src/extractors/excel.rs +40 -81
- data/vendor/kreuzberg/src/extractors/html.rs +173 -182
- data/vendor/kreuzberg/src/extractors/image.rs +8 -32
- data/vendor/kreuzberg/src/extractors/mod.rs +10 -171
- data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
- data/vendor/kreuzberg/src/extractors/pdf.rs +64 -329
- data/vendor/kreuzberg/src/extractors/pptx.rs +34 -79
- data/vendor/kreuzberg/src/extractors/structured.rs +0 -16
- data/vendor/kreuzberg/src/extractors/text.rs +7 -30
- data/vendor/kreuzberg/src/extractors/xml.rs +8 -27
- data/vendor/kreuzberg/src/keywords/processor.rs +1 -9
- data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
- data/vendor/kreuzberg/src/language_detection/mod.rs +51 -94
- data/vendor/kreuzberg/src/lib.rs +5 -17
- data/vendor/kreuzberg/src/mcp/mod.rs +1 -4
- data/vendor/kreuzberg/src/mcp/server.rs +21 -145
- data/vendor/kreuzberg/src/ocr/mod.rs +0 -2
- data/vendor/kreuzberg/src/ocr/processor.rs +8 -19
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +0 -2
- data/vendor/kreuzberg/src/pdf/error.rs +1 -93
- data/vendor/kreuzberg/src/pdf/metadata.rs +100 -263
- data/vendor/kreuzberg/src/pdf/mod.rs +2 -33
- data/vendor/kreuzberg/src/pdf/rendering.rs +12 -12
- data/vendor/kreuzberg/src/pdf/table.rs +64 -61
- data/vendor/kreuzberg/src/pdf/text.rs +24 -416
- data/vendor/kreuzberg/src/plugins/extractor.rs +8 -40
- data/vendor/kreuzberg/src/plugins/mod.rs +0 -3
- data/vendor/kreuzberg/src/plugins/ocr.rs +14 -22
- data/vendor/kreuzberg/src/plugins/processor.rs +1 -10
- data/vendor/kreuzberg/src/plugins/registry.rs +0 -15
- data/vendor/kreuzberg/src/plugins/validator.rs +8 -20
- data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
- data/vendor/kreuzberg/src/text/mod.rs +0 -8
- data/vendor/kreuzberg/src/text/quality.rs +15 -28
- data/vendor/kreuzberg/src/text/string_utils.rs +10 -22
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +50 -86
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +16 -37
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +1 -2
- data/vendor/kreuzberg/src/types.rs +67 -907
- data/vendor/kreuzberg/src/utils/mod.rs +0 -14
- data/vendor/kreuzberg/src/utils/quality.rs +3 -12
- data/vendor/kreuzberg/tests/api_tests.rs +0 -506
- data/vendor/kreuzberg/tests/archive_integration.rs +0 -2
- data/vendor/kreuzberg/tests/batch_orchestration.rs +12 -57
- data/vendor/kreuzberg/tests/batch_processing.rs +8 -32
- data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +8 -40
- data/vendor/kreuzberg/tests/config_features.rs +1 -33
- data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -16
- data/vendor/kreuzberg/tests/core_integration.rs +9 -35
- data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
- data/vendor/kreuzberg/tests/email_integration.rs +1 -3
- data/vendor/kreuzberg/tests/error_handling.rs +34 -43
- data/vendor/kreuzberg/tests/format_integration.rs +1 -7
- data/vendor/kreuzberg/tests/helpers/mod.rs +0 -60
- data/vendor/kreuzberg/tests/image_integration.rs +0 -2
- data/vendor/kreuzberg/tests/mime_detection.rs +16 -17
- data/vendor/kreuzberg/tests/ocr_configuration.rs +0 -4
- data/vendor/kreuzberg/tests/ocr_errors.rs +0 -22
- data/vendor/kreuzberg/tests/ocr_quality.rs +0 -2
- data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
- data/vendor/kreuzberg/tests/pdf_integration.rs +0 -2
- data/vendor/kreuzberg/tests/pipeline_integration.rs +2 -36
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +0 -5
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -17
- data/vendor/kreuzberg/tests/plugin_system.rs +0 -6
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -2
- data/vendor/kreuzberg/tests/security_validation.rs +1 -13
- data/vendor/kreuzberg/tests/test_fastembed.rs +23 -45
- metadata +25 -171
- data/.rubocop.yml +0 -543
- data/ext/kreuzberg_rb/native/.cargo/config.toml +0 -23
- data/ext/kreuzberg_rb/native/Cargo.lock +0 -7619
- data/lib/kreuzberg/error_context.rb +0 -136
- data/lib/kreuzberg/types.rb +0 -170
- data/lib/libpdfium.so +0 -0
- data/spec/binding/async_operations_spec.rb +0 -473
- data/spec/binding/batch_operations_spec.rb +0 -595
- data/spec/binding/batch_spec.rb +0 -359
- data/spec/binding/config_result_spec.rb +0 -377
- data/spec/binding/embeddings_spec.rb +0 -816
- data/spec/binding/error_recovery_spec.rb +0 -488
- data/spec/binding/font_config_spec.rb +0 -220
- data/spec/binding/images_spec.rb +0 -738
- data/spec/binding/keywords_extraction_spec.rb +0 -600
- data/spec/binding/metadata_types_spec.rb +0 -1228
- data/spec/binding/pages_extraction_spec.rb +0 -471
- data/spec/binding/tables_spec.rb +0 -641
- data/spec/unit/config/chunking_config_spec.rb +0 -213
- data/spec/unit/config/embedding_config_spec.rb +0 -343
- data/spec/unit/config/extraction_config_spec.rb +0 -438
- data/spec/unit/config/font_config_spec.rb +0 -285
- data/spec/unit/config/hierarchy_config_spec.rb +0 -314
- data/spec/unit/config/image_extraction_config_spec.rb +0 -209
- data/spec/unit/config/image_preprocessing_config_spec.rb +0 -249
- data/spec/unit/config/keyword_config_spec.rb +0 -229
- data/spec/unit/config/language_detection_config_spec.rb +0 -258
- data/spec/unit/config/ocr_config_spec.rb +0 -171
- data/spec/unit/config/page_config_spec.rb +0 -221
- data/spec/unit/config/pdf_config_spec.rb +0 -267
- data/spec/unit/config/postprocessor_config_spec.rb +0 -290
- data/spec/unit/config/tesseract_config_spec.rb +0 -181
- data/spec/unit/config/token_reduction_config_spec.rb +0 -251
- data/test/metadata_types_test.rb +0 -959
- data/vendor/Cargo.toml +0 -61
- data/vendor/kreuzberg/examples/bench_fixes.rs +0 -71
- data/vendor/kreuzberg/examples/test_pdfium_fork.rs +0 -62
- data/vendor/kreuzberg/src/chunking/processor.rs +0 -219
- data/vendor/kreuzberg/src/core/batch_optimizations.rs +0 -385
- data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
- data/vendor/kreuzberg/src/core/formats.rs +0 -235
- data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
- data/vendor/kreuzberg/src/extraction/capacity.rs +0 -263
- data/vendor/kreuzberg/src/extraction/markdown.rs +0 -216
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -284
- data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -470
- data/vendor/kreuzberg/src/extractors/docbook.rs +0 -504
- data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -492
- data/vendor/kreuzberg/src/extractors/jats.rs +0 -1054
- data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -368
- data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
- data/vendor/kreuzberg/src/extractors/markdown.rs +0 -701
- data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
- data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
- data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -529
- data/vendor/kreuzberg/src/extractors/rst.rs +0 -577
- data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
- data/vendor/kreuzberg/src/extractors/security.rs +0 -484
- data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
- data/vendor/kreuzberg/src/extractors/typst.rs +0 -651
- data/vendor/kreuzberg/src/language_detection/processor.rs +0 -218
- data/vendor/kreuzberg/src/ocr/language_registry.rs +0 -520
- data/vendor/kreuzberg/src/panic_context.rs +0 -154
- data/vendor/kreuzberg/src/pdf/bindings.rs +0 -306
- data/vendor/kreuzberg/src/pdf/bundled.rs +0 -408
- data/vendor/kreuzberg/src/pdf/fonts.rs +0 -358
- data/vendor/kreuzberg/src/pdf/hierarchy.rs +0 -903
- data/vendor/kreuzberg/src/text/quality_processor.rs +0 -231
- data/vendor/kreuzberg/src/text/utf8_validation.rs +0 -193
- data/vendor/kreuzberg/src/utils/pool.rs +0 -503
- data/vendor/kreuzberg/src/utils/pool_sizing.rs +0 -364
- data/vendor/kreuzberg/src/utils/string_pool.rs +0 -761
- data/vendor/kreuzberg/tests/api_embed.rs +0 -360
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
- data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +0 -471
- data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +0 -289
- data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +0 -154
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
- data/vendor/kreuzberg/tests/config_integration_test.rs +0 -753
- data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +0 -294
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -500
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
- data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
- data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
- data/vendor/kreuzberg/tests/ocr_language_registry.rs +0 -191
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -674
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
- data/vendor/kreuzberg/tests/page_markers.rs +0 -297
- data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +0 -301
- data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +0 -589
- data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +0 -301
- data/vendor/kreuzberg/tests/pdf_text_merging.rs +0 -475
- data/vendor/kreuzberg/tests/pdfium_linking.rs +0 -340
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -694
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -775
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1260
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -648
- data/vendor/kreuzberg-ffi/Cargo.toml +0 -67
- data/vendor/kreuzberg-ffi/README.md +0 -851
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +0 -227
- data/vendor/kreuzberg-ffi/build.rs +0 -168
- data/vendor/kreuzberg-ffi/cbindgen.toml +0 -37
- data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +0 -12
- data/vendor/kreuzberg-ffi/kreuzberg.h +0 -3012
- data/vendor/kreuzberg-ffi/src/batch_streaming.rs +0 -588
- data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
- data/vendor/kreuzberg-ffi/src/error.rs +0 -901
- data/vendor/kreuzberg-ffi/src/extraction.rs +0 -555
- data/vendor/kreuzberg-ffi/src/helpers.rs +0 -879
- data/vendor/kreuzberg-ffi/src/lib.rs +0 -977
- data/vendor/kreuzberg-ffi/src/memory.rs +0 -493
- data/vendor/kreuzberg-ffi/src/mime.rs +0 -329
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +0 -265
- data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +0 -442
- data/vendor/kreuzberg-ffi/src/plugins/mod.rs +0 -14
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +0 -628
- data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +0 -438
- data/vendor/kreuzberg-ffi/src/plugins/validator.rs +0 -329
- data/vendor/kreuzberg-ffi/src/result.rs +0 -510
- data/vendor/kreuzberg-ffi/src/result_pool.rs +0 -639
- data/vendor/kreuzberg-ffi/src/result_view.rs +0 -773
- data/vendor/kreuzberg-ffi/src/string_intern.rs +0 -568
- data/vendor/kreuzberg-ffi/src/types.rs +0 -363
- data/vendor/kreuzberg-ffi/src/util.rs +0 -210
- data/vendor/kreuzberg-ffi/src/validation.rs +0 -848
- data/vendor/kreuzberg-ffi/tests.disabled/README.md +0 -48
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +0 -299
- data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +0 -346
- data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +0 -232
- data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +0 -470
- data/vendor/kreuzberg-tesseract/.commitlintrc.json +0 -13
- data/vendor/kreuzberg-tesseract/.crate-ignore +0 -2
- data/vendor/kreuzberg-tesseract/Cargo.lock +0 -2933
- data/vendor/kreuzberg-tesseract/Cargo.toml +0 -57
- data/vendor/kreuzberg-tesseract/LICENSE +0 -22
- data/vendor/kreuzberg-tesseract/README.md +0 -399
- data/vendor/kreuzberg-tesseract/build.rs +0 -1127
- data/vendor/kreuzberg-tesseract/patches/README.md +0 -71
- data/vendor/kreuzberg-tesseract/patches/tesseract.diff +0 -199
- data/vendor/kreuzberg-tesseract/src/api.rs +0 -1371
- data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +0 -77
- data/vendor/kreuzberg-tesseract/src/enums.rs +0 -297
- data/vendor/kreuzberg-tesseract/src/error.rs +0 -81
- data/vendor/kreuzberg-tesseract/src/lib.rs +0 -145
- data/vendor/kreuzberg-tesseract/src/monitor.rs +0 -57
- data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +0 -197
- data/vendor/kreuzberg-tesseract/src/page_iterator.rs +0 -253
- data/vendor/kreuzberg-tesseract/src/result_iterator.rs +0 -286
- data/vendor/kreuzberg-tesseract/src/result_renderer.rs +0 -183
- data/vendor/kreuzberg-tesseract/tests/integration_test.rs +0 -211
|
@@ -1,231 +0,0 @@
|
|
|
1
|
-
//! Quality processing post-processor.
|
|
2
|
-
//!
|
|
3
|
-
//! This module provides a PostProcessor plugin that performs quality assessment and
|
|
4
|
-
//! text cleaning on extraction results.
|
|
5
|
-
//!
|
|
6
|
-
//! # Performance
|
|
7
|
-
//!
|
|
8
|
-
//! This processor optimizes metadata handling by:
|
|
9
|
-
//! - Checking if important metadata fields exist before allocating
|
|
10
|
-
//! - Passing the metadata map to the quality scorer only when important fields are present
|
|
11
|
-
//! - Skipping allocation entirely for documents without metadata
|
|
12
|
-
//!
|
|
13
|
-
//! This avoids unnecessary string cloning for sparse metadata scenarios.
|
|
14
|
-
|
|
15
|
-
use crate::plugins::{Plugin, PostProcessor, ProcessingStage};
|
|
16
|
-
use crate::{ExtractionConfig, ExtractionResult, Result};
|
|
17
|
-
use async_trait::async_trait;
|
|
18
|
-
|
|
19
|
-
/// Post-processor that scores the quality of extracted text.
///
/// This processor:
/// - Runs in the Early processing stage
/// - Is only selected when `config.enable_quality_processing` is true
/// - Stores the computed score in `metadata.additional["quality_score"]`
/// - Only reads `content` in order to score it; the text itself is not modified
///
/// # Example
///
/// ```rust,no_run
/// use kreuzberg::plugins::{Plugin, PostProcessor};
/// use kreuzberg::text::QualityProcessor;
///
/// let processor = QualityProcessor;
/// assert_eq!(processor.name(), "quality-processing");
/// ```
#[derive(Debug, Clone, Copy)]
pub struct QualityProcessor;
|
|
38
|
-
|
|
39
|
-
impl Plugin for QualityProcessor {
|
|
40
|
-
fn name(&self) -> &str {
|
|
41
|
-
"quality-processing"
|
|
42
|
-
}
|
|
43
|
-
|
|
44
|
-
fn version(&self) -> String {
|
|
45
|
-
env!("CARGO_PKG_VERSION").to_string()
|
|
46
|
-
}
|
|
47
|
-
|
|
48
|
-
fn initialize(&self) -> Result<()> {
|
|
49
|
-
Ok(())
|
|
50
|
-
}
|
|
51
|
-
|
|
52
|
-
fn shutdown(&self) -> Result<()> {
|
|
53
|
-
Ok(())
|
|
54
|
-
}
|
|
55
|
-
}
|
|
56
|
-
|
|
57
|
-
#[cfg_attr(not(target_arch = "wasm32"), async_trait)]
|
|
58
|
-
#[cfg_attr(target_arch = "wasm32", async_trait(?Send))]
|
|
59
|
-
impl PostProcessor for QualityProcessor {
|
|
60
|
-
async fn process(&self, result: &mut ExtractionResult, _config: &ExtractionConfig) -> Result<()> {
|
|
61
|
-
let quality_score = if should_use_metadata(&result.metadata) {
|
|
62
|
-
crate::text::quality::calculate_quality_score(&result.content, Some(&result.metadata.additional))
|
|
63
|
-
} else {
|
|
64
|
-
crate::text::quality::calculate_quality_score(&result.content, None)
|
|
65
|
-
};
|
|
66
|
-
|
|
67
|
-
result.metadata.additional.insert(
|
|
68
|
-
"quality_score".to_string(),
|
|
69
|
-
serde_json::Value::Number(
|
|
70
|
-
serde_json::Number::from_f64(quality_score).unwrap_or(serde_json::Number::from(0)),
|
|
71
|
-
),
|
|
72
|
-
);
|
|
73
|
-
|
|
74
|
-
Ok(())
|
|
75
|
-
}
|
|
76
|
-
|
|
77
|
-
fn processing_stage(&self) -> ProcessingStage {
|
|
78
|
-
ProcessingStage::Early
|
|
79
|
-
}
|
|
80
|
-
|
|
81
|
-
fn should_process(&self, _result: &ExtractionResult, config: &ExtractionConfig) -> bool {
|
|
82
|
-
config.enable_quality_processing
|
|
83
|
-
}
|
|
84
|
-
|
|
85
|
-
fn estimated_duration_ms(&self, result: &ExtractionResult) -> u64 {
|
|
86
|
-
let text_length = result.content.len();
|
|
87
|
-
(text_length / 102400).max(1) as u64
|
|
88
|
-
}
|
|
89
|
-
}
|
|
90
|
-
|
|
91
|
-
/// Check if metadata contains any important fields without allocation.
|
|
92
|
-
///
|
|
93
|
-
/// # Performance
|
|
94
|
-
///
|
|
95
|
-
/// O(1) check avoiding HashMap allocation when metadata is sparse.
|
|
96
|
-
/// Only allocates HashMap when important metadata fields are present.
|
|
97
|
-
fn should_use_metadata(metadata: &crate::types::Metadata) -> bool {
|
|
98
|
-
const IMPORTANT_FIELDS: &[&str] = &["title", "author", "subject", "description", "keywords"];
|
|
99
|
-
IMPORTANT_FIELDS
|
|
100
|
-
.iter()
|
|
101
|
-
.any(|field| metadata.additional.contains_key(*field))
|
|
102
|
-
}
|
|
103
|
-
|
|
104
|
-
#[cfg(test)]
mod tests {
    use super::*;
    use crate::types::Metadata;

    /// Build a plain-text result with default metadata and the given content.
    fn plain_text_result(content: impl Into<String>) -> ExtractionResult {
        ExtractionResult {
            content: content.into(),
            mime_type: "text/plain".to_string(),
            metadata: Metadata::default(),
            tables: vec![],
            detected_languages: None,
            chunks: None,
            images: None,
            pages: None,
        }
    }

    /// Build a default config with quality processing switched on or off.
    fn quality_config(enabled: bool) -> ExtractionConfig {
        ExtractionConfig {
            enable_quality_processing: enabled,
            ..Default::default()
        }
    }

    #[tokio::test]
    async fn test_quality_processor() {
        let config = quality_config(true);
        let mut result = plain_text_result(
            "This is a well-written paragraph with proper structure. It contains multiple sentences. The quality should be good.",
        );

        QualityProcessor.process(&mut result, &config).await.unwrap();

        let extras = &result.metadata.additional;
        assert!(extras.contains_key("quality_score"));
        assert!(extras.get("quality_score").unwrap().is_number());
    }

    #[tokio::test]
    async fn test_quality_processor_disabled() {
        let config = quality_config(false);
        let mut result = plain_text_result("Some text");

        QualityProcessor.process(&mut result, &config).await.unwrap();
    }

    #[test]
    fn test_quality_processor_plugin_interface() {
        let processor = QualityProcessor;
        assert_eq!(processor.name(), "quality-processing");
        assert!(!processor.version().is_empty());
        assert!(processor.initialize().is_ok());
        assert!(processor.shutdown().is_ok());
    }

    #[test]
    fn test_quality_processor_stage() {
        assert_eq!(QualityProcessor.processing_stage(), ProcessingStage::Early);
    }

    #[test]
    fn test_quality_processor_should_process() {
        let processor = QualityProcessor;
        let result = plain_text_result("Sample text");

        assert!(processor.should_process(&result, &quality_config(true)));
        assert!(!processor.should_process(&result, &quality_config(false)));
    }

    #[test]
    fn test_quality_processor_estimated_duration() {
        let processor = QualityProcessor;

        let short_duration = processor.estimated_duration_ms(&plain_text_result("Short"));
        let long_duration = processor.estimated_duration_ms(&plain_text_result("a".repeat(1000000)));

        assert!(long_duration > short_duration);
    }
}
|
|
@@ -1,193 +0,0 @@
|
|
|
1
|
-
//! SIMD-accelerated UTF-8 validation.
|
|
2
|
-
//!
|
|
3
|
-
//! This module provides high-performance UTF-8 validation using SIMD instructions
|
|
4
|
-
//! when available. On platforms without SIMD support, it falls back to standard
|
|
5
|
-
//! validation.
|
|
6
|
-
//!
|
|
7
|
-
//! # Performance
|
|
8
|
-
//!
|
|
9
|
-
//! SIMD validation can process 16-32 bytes per cycle, providing 15-20% improvement
|
|
10
|
-
//! over standard byte-by-byte validation on text-heavy operations.
|
|
11
|
-
//!
|
|
12
|
-
//! # Example
|
|
13
|
-
//!
|
|
14
|
-
//! ```rust
|
|
15
|
-
//! use kreuzberg::text::utf8_validation::from_utf8;
|
|
16
|
-
//!
|
|
17
|
-
//! let bytes = b"Hello, UTF-8 world!";
|
|
18
|
-
//! let result = from_utf8(bytes).expect("valid UTF-8");
|
|
19
|
-
//! assert_eq!(result, "Hello, UTF-8 world!");
|
|
20
|
-
//! ```
|
|
21
|
-
|
|
22
|
-
/// Validates and converts bytes to a string slice, using SIMD when available.
///
/// With the `simd-utf8` feature enabled, validation is first attempted with
/// `simdutf8`'s fast path. Otherwise, and whenever the fast path rejects the
/// input, `std::str::from_utf8()` is used.
///
/// # Arguments
///
/// * `bytes` - The byte slice to validate and convert
///
/// # Returns
///
/// `Ok(&str)` if the bytes are valid UTF-8.
///
/// # Errors
///
/// Returns the `std::str::Utf8Error` produced by the standard library for
/// `bytes`, so `valid_up_to()` and `error_len()` describe the actual input.
#[inline]
pub fn from_utf8(bytes: &[u8]) -> Result<&str, std::str::Utf8Error> {
    #[cfg(feature = "simd-utf8")]
    {
        match simdutf8::basic::from_utf8(bytes) {
            Ok(text) => Ok(text),
            // The fast-path error carries no position information. Re-validate with
            // std on this (cold) path so callers get a real `Utf8Error` for their
            // input instead of one fabricated from unrelated constant bytes.
            Err(_) => std::str::from_utf8(bytes),
        }
    }

    #[cfg(not(feature = "simd-utf8"))]
    {
        std::str::from_utf8(bytes)
    }
}
|
|
56
|
-
|
|
57
|
-
/// Validates owned bytes and turns them into a `String`, using SIMD when available.
///
/// The vector is consumed; on success its buffer becomes the `String`.
///
/// # Arguments
///
/// * `bytes` - The byte vector to validate and convert
///
/// # Returns
///
/// `Ok(String)` if the bytes are valid UTF-8, `Err(std::string::FromUtf8Error)` otherwise.
#[inline]
pub fn string_from_utf8(bytes: Vec<u8>) -> Result<String, std::string::FromUtf8Error> {
    #[cfg(feature = "simd-utf8")]
    {
        let validated = simdutf8::basic::from_utf8(&bytes).is_ok();
        if !validated {
            // Let std build the error value for the rejected input.
            return String::from_utf8(bytes);
        }

        // SAFETY: `bytes` was just checked to be valid UTF-8 and has not been
        // modified since.
        #[allow(unsafe_code)]
        let text = unsafe { String::from_utf8_unchecked(bytes) };
        Ok(text)
    }

    #[cfg(not(feature = "simd-utf8"))]
    {
        String::from_utf8(bytes)
    }
}
|
|
92
|
-
|
|
93
|
-
/// Checks whether a byte slice is valid UTF-8 without producing a string.
///
/// Uses `simdutf8` when the `simd-utf8` feature is enabled and
/// `std::str::from_utf8` otherwise.
///
/// # Arguments
///
/// * `bytes` - The byte slice to validate
///
/// # Returns
///
/// `true` if valid UTF-8, `false` otherwise.
#[inline]
pub fn is_valid_utf8(bytes: &[u8]) -> bool {
    #[cfg(feature = "simd-utf8")]
    let valid = simdutf8::basic::from_utf8(bytes).is_ok();

    #[cfg(not(feature = "simd-utf8"))]
    let valid = std::str::from_utf8(bytes).is_ok();

    valid
}
|
|
121
|
-
|
|
122
|
-
#[cfg(test)]
mod tests {
    use super::*;

    /// Assert that `text`'s bytes decode back to `text` through `from_utf8`.
    fn assert_round_trip(text: &str) {
        assert_eq!(from_utf8(text.as_bytes()).unwrap(), text);
    }

    #[test]
    fn test_valid_ascii() {
        assert_round_trip("Hello, world!");
    }

    #[test]
    fn test_valid_utf8_multibyte() {
        assert_round_trip("Hello, 世界! 🌍");
    }

    #[test]
    fn test_empty_bytes() {
        assert_round_trip("");
    }

    #[test]
    fn test_invalid_utf8() {
        let invalid: &[u8] = &[0xFF, 0xFE];
        assert!(from_utf8(invalid).is_err());
    }

    #[test]
    fn test_is_valid_utf8_true() {
        assert!(is_valid_utf8("Valid UTF-8 text".as_bytes()));
    }

    #[test]
    fn test_is_valid_utf8_false() {
        let overlong: &[u8] = &[0xC0, 0x80];
        assert!(!is_valid_utf8(overlong));
    }

    #[test]
    fn test_string_from_utf8_valid() {
        let owned = string_from_utf8(b"Test string".to_vec()).unwrap();
        assert_eq!(owned, "Test string");
    }

    #[test]
    fn test_string_from_utf8_invalid() {
        let invalid: Vec<u8> = vec![0xFF, 0xFE];
        assert!(string_from_utf8(invalid).is_err());
    }

    #[test]
    fn test_large_valid_text() {
        let text = "a".repeat(100_000);
        let decoded = from_utf8(text.as_bytes()).unwrap();
        assert_eq!(decoded.len(), 100_000);
    }

    #[test]
    fn test_mixed_unicode() {
        assert_round_trip("Latin αλφα 中文 العربية עברית Кириллица");
    }
}
|