kreuzberg 4.0.0.pre.rc.29 → 4.0.0.rc1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +0 -6
- data/.rubocop.yaml +534 -1
- data/Gemfile +2 -1
- data/Gemfile.lock +28 -116
- data/README.md +269 -629
- data/Rakefile +0 -9
- data/Steepfile +4 -8
- data/examples/async_patterns.rb +58 -1
- data/ext/kreuzberg_rb/extconf.rb +5 -35
- data/ext/kreuzberg_rb/native/Cargo.toml +16 -55
- data/ext/kreuzberg_rb/native/build.rs +14 -12
- data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
- data/ext/kreuzberg_rb/native/include/strings.h +2 -2
- data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
- data/ext/kreuzberg_rb/native/src/lib.rs +34 -897
- data/extconf.rb +6 -38
- data/kreuzberg.gemspec +20 -114
- data/lib/kreuzberg/api_proxy.rb +18 -2
- data/lib/kreuzberg/cache_api.rb +0 -22
- data/lib/kreuzberg/cli.rb +10 -2
- data/lib/kreuzberg/cli_proxy.rb +10 -0
- data/lib/kreuzberg/config.rb +22 -274
- data/lib/kreuzberg/errors.rb +7 -73
- data/lib/kreuzberg/extraction_api.rb +8 -237
- data/lib/kreuzberg/mcp_proxy.rb +11 -2
- data/lib/kreuzberg/ocr_backend_protocol.rb +73 -0
- data/lib/kreuzberg/post_processor_protocol.rb +71 -0
- data/lib/kreuzberg/result.rb +33 -151
- data/lib/kreuzberg/setup_lib_path.rb +2 -22
- data/lib/kreuzberg/validator_protocol.rb +73 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +13 -27
- data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
- data/sig/kreuzberg.rbs +12 -105
- data/spec/binding/cache_spec.rb +22 -22
- data/spec/binding/cli_proxy_spec.rb +4 -2
- data/spec/binding/cli_spec.rb +11 -12
- data/spec/binding/config_spec.rb +0 -74
- data/spec/binding/config_validation_spec.rb +6 -100
- data/spec/binding/error_handling_spec.rb +97 -283
- data/spec/binding/plugins/ocr_backend_spec.rb +8 -8
- data/spec/binding/plugins/postprocessor_spec.rb +11 -11
- data/spec/binding/plugins/validator_spec.rb +13 -12
- data/spec/examples.txt +104 -0
- data/spec/fixtures/config.toml +1 -0
- data/spec/fixtures/config.yaml +1 -0
- data/spec/fixtures/invalid_config.toml +1 -0
- data/spec/smoke/package_spec.rb +3 -2
- data/spec/spec_helper.rb +3 -1
- data/vendor/kreuzberg/Cargo.toml +67 -192
- data/vendor/kreuzberg/README.md +9 -97
- data/vendor/kreuzberg/build.rs +194 -516
- data/vendor/kreuzberg/src/api/handlers.rs +9 -130
- data/vendor/kreuzberg/src/api/mod.rs +3 -18
- data/vendor/kreuzberg/src/api/server.rs +71 -236
- data/vendor/kreuzberg/src/api/types.rs +7 -43
- data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
- data/vendor/kreuzberg/src/cache/mod.rs +3 -27
- data/vendor/kreuzberg/src/chunking/mod.rs +79 -1705
- data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
- data/vendor/kreuzberg/src/core/config.rs +23 -905
- data/vendor/kreuzberg/src/core/extractor.rs +106 -403
- data/vendor/kreuzberg/src/core/io.rs +2 -4
- data/vendor/kreuzberg/src/core/mime.rs +12 -2
- data/vendor/kreuzberg/src/core/mod.rs +3 -22
- data/vendor/kreuzberg/src/core/pipeline.rs +78 -395
- data/vendor/kreuzberg/src/embeddings.rs +21 -169
- data/vendor/kreuzberg/src/error.rs +2 -2
- data/vendor/kreuzberg/src/extraction/archive.rs +31 -36
- data/vendor/kreuzberg/src/extraction/docx.rs +1 -365
- data/vendor/kreuzberg/src/extraction/email.rs +11 -12
- data/vendor/kreuzberg/src/extraction/excel.rs +129 -138
- data/vendor/kreuzberg/src/extraction/html.rs +170 -1447
- data/vendor/kreuzberg/src/extraction/image.rs +14 -138
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +3 -13
- data/vendor/kreuzberg/src/extraction/mod.rs +5 -21
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
- data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
- data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
- data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
- data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
- data/vendor/kreuzberg/src/extraction/pptx.rs +94 -196
- data/vendor/kreuzberg/src/extraction/structured.rs +4 -5
- data/vendor/kreuzberg/src/extraction/table.rs +1 -2
- data/vendor/kreuzberg/src/extraction/text.rs +10 -18
- data/vendor/kreuzberg/src/extractors/archive.rs +0 -22
- data/vendor/kreuzberg/src/extractors/docx.rs +148 -69
- data/vendor/kreuzberg/src/extractors/email.rs +9 -37
- data/vendor/kreuzberg/src/extractors/excel.rs +40 -81
- data/vendor/kreuzberg/src/extractors/html.rs +173 -182
- data/vendor/kreuzberg/src/extractors/image.rs +8 -32
- data/vendor/kreuzberg/src/extractors/mod.rs +10 -171
- data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
- data/vendor/kreuzberg/src/extractors/pdf.rs +64 -329
- data/vendor/kreuzberg/src/extractors/pptx.rs +34 -79
- data/vendor/kreuzberg/src/extractors/structured.rs +0 -16
- data/vendor/kreuzberg/src/extractors/text.rs +7 -30
- data/vendor/kreuzberg/src/extractors/xml.rs +8 -27
- data/vendor/kreuzberg/src/keywords/processor.rs +1 -9
- data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
- data/vendor/kreuzberg/src/language_detection/mod.rs +51 -94
- data/vendor/kreuzberg/src/lib.rs +5 -17
- data/vendor/kreuzberg/src/mcp/mod.rs +1 -4
- data/vendor/kreuzberg/src/mcp/server.rs +21 -145
- data/vendor/kreuzberg/src/ocr/mod.rs +0 -2
- data/vendor/kreuzberg/src/ocr/processor.rs +8 -19
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +0 -2
- data/vendor/kreuzberg/src/pdf/error.rs +1 -93
- data/vendor/kreuzberg/src/pdf/metadata.rs +100 -263
- data/vendor/kreuzberg/src/pdf/mod.rs +2 -33
- data/vendor/kreuzberg/src/pdf/rendering.rs +12 -12
- data/vendor/kreuzberg/src/pdf/table.rs +64 -61
- data/vendor/kreuzberg/src/pdf/text.rs +24 -416
- data/vendor/kreuzberg/src/plugins/extractor.rs +8 -40
- data/vendor/kreuzberg/src/plugins/mod.rs +0 -3
- data/vendor/kreuzberg/src/plugins/ocr.rs +14 -22
- data/vendor/kreuzberg/src/plugins/processor.rs +1 -10
- data/vendor/kreuzberg/src/plugins/registry.rs +0 -15
- data/vendor/kreuzberg/src/plugins/validator.rs +8 -20
- data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
- data/vendor/kreuzberg/src/text/mod.rs +0 -8
- data/vendor/kreuzberg/src/text/quality.rs +15 -28
- data/vendor/kreuzberg/src/text/string_utils.rs +10 -22
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +50 -86
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +16 -37
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +1 -2
- data/vendor/kreuzberg/src/types.rs +67 -907
- data/vendor/kreuzberg/src/utils/mod.rs +0 -14
- data/vendor/kreuzberg/src/utils/quality.rs +3 -12
- data/vendor/kreuzberg/tests/api_tests.rs +0 -506
- data/vendor/kreuzberg/tests/archive_integration.rs +0 -2
- data/vendor/kreuzberg/tests/batch_orchestration.rs +12 -57
- data/vendor/kreuzberg/tests/batch_processing.rs +8 -32
- data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +8 -40
- data/vendor/kreuzberg/tests/config_features.rs +1 -33
- data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -16
- data/vendor/kreuzberg/tests/core_integration.rs +9 -35
- data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
- data/vendor/kreuzberg/tests/email_integration.rs +1 -3
- data/vendor/kreuzberg/tests/error_handling.rs +34 -43
- data/vendor/kreuzberg/tests/format_integration.rs +1 -7
- data/vendor/kreuzberg/tests/helpers/mod.rs +0 -60
- data/vendor/kreuzberg/tests/image_integration.rs +0 -2
- data/vendor/kreuzberg/tests/mime_detection.rs +16 -17
- data/vendor/kreuzberg/tests/ocr_configuration.rs +0 -4
- data/vendor/kreuzberg/tests/ocr_errors.rs +0 -22
- data/vendor/kreuzberg/tests/ocr_quality.rs +0 -2
- data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
- data/vendor/kreuzberg/tests/pdf_integration.rs +0 -2
- data/vendor/kreuzberg/tests/pipeline_integration.rs +2 -36
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +0 -5
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -17
- data/vendor/kreuzberg/tests/plugin_system.rs +0 -6
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -2
- data/vendor/kreuzberg/tests/security_validation.rs +1 -13
- data/vendor/kreuzberg/tests/test_fastembed.rs +23 -45
- metadata +25 -171
- data/.rubocop.yml +0 -543
- data/ext/kreuzberg_rb/native/.cargo/config.toml +0 -23
- data/ext/kreuzberg_rb/native/Cargo.lock +0 -7619
- data/lib/kreuzberg/error_context.rb +0 -136
- data/lib/kreuzberg/types.rb +0 -170
- data/lib/libpdfium.so +0 -0
- data/spec/binding/async_operations_spec.rb +0 -473
- data/spec/binding/batch_operations_spec.rb +0 -595
- data/spec/binding/batch_spec.rb +0 -359
- data/spec/binding/config_result_spec.rb +0 -377
- data/spec/binding/embeddings_spec.rb +0 -816
- data/spec/binding/error_recovery_spec.rb +0 -488
- data/spec/binding/font_config_spec.rb +0 -220
- data/spec/binding/images_spec.rb +0 -738
- data/spec/binding/keywords_extraction_spec.rb +0 -600
- data/spec/binding/metadata_types_spec.rb +0 -1228
- data/spec/binding/pages_extraction_spec.rb +0 -471
- data/spec/binding/tables_spec.rb +0 -641
- data/spec/unit/config/chunking_config_spec.rb +0 -213
- data/spec/unit/config/embedding_config_spec.rb +0 -343
- data/spec/unit/config/extraction_config_spec.rb +0 -438
- data/spec/unit/config/font_config_spec.rb +0 -285
- data/spec/unit/config/hierarchy_config_spec.rb +0 -314
- data/spec/unit/config/image_extraction_config_spec.rb +0 -209
- data/spec/unit/config/image_preprocessing_config_spec.rb +0 -249
- data/spec/unit/config/keyword_config_spec.rb +0 -229
- data/spec/unit/config/language_detection_config_spec.rb +0 -258
- data/spec/unit/config/ocr_config_spec.rb +0 -171
- data/spec/unit/config/page_config_spec.rb +0 -221
- data/spec/unit/config/pdf_config_spec.rb +0 -267
- data/spec/unit/config/postprocessor_config_spec.rb +0 -290
- data/spec/unit/config/tesseract_config_spec.rb +0 -181
- data/spec/unit/config/token_reduction_config_spec.rb +0 -251
- data/test/metadata_types_test.rb +0 -959
- data/vendor/Cargo.toml +0 -61
- data/vendor/kreuzberg/examples/bench_fixes.rs +0 -71
- data/vendor/kreuzberg/examples/test_pdfium_fork.rs +0 -62
- data/vendor/kreuzberg/src/chunking/processor.rs +0 -219
- data/vendor/kreuzberg/src/core/batch_optimizations.rs +0 -385
- data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
- data/vendor/kreuzberg/src/core/formats.rs +0 -235
- data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
- data/vendor/kreuzberg/src/extraction/capacity.rs +0 -263
- data/vendor/kreuzberg/src/extraction/markdown.rs +0 -216
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -284
- data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -470
- data/vendor/kreuzberg/src/extractors/docbook.rs +0 -504
- data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -492
- data/vendor/kreuzberg/src/extractors/jats.rs +0 -1054
- data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -368
- data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
- data/vendor/kreuzberg/src/extractors/markdown.rs +0 -701
- data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
- data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
- data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -529
- data/vendor/kreuzberg/src/extractors/rst.rs +0 -577
- data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
- data/vendor/kreuzberg/src/extractors/security.rs +0 -484
- data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
- data/vendor/kreuzberg/src/extractors/typst.rs +0 -651
- data/vendor/kreuzberg/src/language_detection/processor.rs +0 -218
- data/vendor/kreuzberg/src/ocr/language_registry.rs +0 -520
- data/vendor/kreuzberg/src/panic_context.rs +0 -154
- data/vendor/kreuzberg/src/pdf/bindings.rs +0 -306
- data/vendor/kreuzberg/src/pdf/bundled.rs +0 -408
- data/vendor/kreuzberg/src/pdf/fonts.rs +0 -358
- data/vendor/kreuzberg/src/pdf/hierarchy.rs +0 -903
- data/vendor/kreuzberg/src/text/quality_processor.rs +0 -231
- data/vendor/kreuzberg/src/text/utf8_validation.rs +0 -193
- data/vendor/kreuzberg/src/utils/pool.rs +0 -503
- data/vendor/kreuzberg/src/utils/pool_sizing.rs +0 -364
- data/vendor/kreuzberg/src/utils/string_pool.rs +0 -761
- data/vendor/kreuzberg/tests/api_embed.rs +0 -360
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
- data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +0 -471
- data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +0 -289
- data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +0 -154
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
- data/vendor/kreuzberg/tests/config_integration_test.rs +0 -753
- data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +0 -294
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -500
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
- data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
- data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
- data/vendor/kreuzberg/tests/ocr_language_registry.rs +0 -191
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -674
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
- data/vendor/kreuzberg/tests/page_markers.rs +0 -297
- data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +0 -301
- data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +0 -589
- data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +0 -301
- data/vendor/kreuzberg/tests/pdf_text_merging.rs +0 -475
- data/vendor/kreuzberg/tests/pdfium_linking.rs +0 -340
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -694
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -775
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1260
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -648
- data/vendor/kreuzberg-ffi/Cargo.toml +0 -67
- data/vendor/kreuzberg-ffi/README.md +0 -851
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +0 -227
- data/vendor/kreuzberg-ffi/build.rs +0 -168
- data/vendor/kreuzberg-ffi/cbindgen.toml +0 -37
- data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +0 -12
- data/vendor/kreuzberg-ffi/kreuzberg.h +0 -3012
- data/vendor/kreuzberg-ffi/src/batch_streaming.rs +0 -588
- data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
- data/vendor/kreuzberg-ffi/src/error.rs +0 -901
- data/vendor/kreuzberg-ffi/src/extraction.rs +0 -555
- data/vendor/kreuzberg-ffi/src/helpers.rs +0 -879
- data/vendor/kreuzberg-ffi/src/lib.rs +0 -977
- data/vendor/kreuzberg-ffi/src/memory.rs +0 -493
- data/vendor/kreuzberg-ffi/src/mime.rs +0 -329
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +0 -265
- data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +0 -442
- data/vendor/kreuzberg-ffi/src/plugins/mod.rs +0 -14
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +0 -628
- data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +0 -438
- data/vendor/kreuzberg-ffi/src/plugins/validator.rs +0 -329
- data/vendor/kreuzberg-ffi/src/result.rs +0 -510
- data/vendor/kreuzberg-ffi/src/result_pool.rs +0 -639
- data/vendor/kreuzberg-ffi/src/result_view.rs +0 -773
- data/vendor/kreuzberg-ffi/src/string_intern.rs +0 -568
- data/vendor/kreuzberg-ffi/src/types.rs +0 -363
- data/vendor/kreuzberg-ffi/src/util.rs +0 -210
- data/vendor/kreuzberg-ffi/src/validation.rs +0 -848
- data/vendor/kreuzberg-ffi/tests.disabled/README.md +0 -48
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +0 -299
- data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +0 -346
- data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +0 -232
- data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +0 -470
- data/vendor/kreuzberg-tesseract/.commitlintrc.json +0 -13
- data/vendor/kreuzberg-tesseract/.crate-ignore +0 -2
- data/vendor/kreuzberg-tesseract/Cargo.lock +0 -2933
- data/vendor/kreuzberg-tesseract/Cargo.toml +0 -57
- data/vendor/kreuzberg-tesseract/LICENSE +0 -22
- data/vendor/kreuzberg-tesseract/README.md +0 -399
- data/vendor/kreuzberg-tesseract/build.rs +0 -1127
- data/vendor/kreuzberg-tesseract/patches/README.md +0 -71
- data/vendor/kreuzberg-tesseract/patches/tesseract.diff +0 -199
- data/vendor/kreuzberg-tesseract/src/api.rs +0 -1371
- data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +0 -77
- data/vendor/kreuzberg-tesseract/src/enums.rs +0 -297
- data/vendor/kreuzberg-tesseract/src/error.rs +0 -81
- data/vendor/kreuzberg-tesseract/src/lib.rs +0 -145
- data/vendor/kreuzberg-tesseract/src/monitor.rs +0 -57
- data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +0 -197
- data/vendor/kreuzberg-tesseract/src/page_iterator.rs +0 -253
- data/vendor/kreuzberg-tesseract/src/result_iterator.rs +0 -286
- data/vendor/kreuzberg-tesseract/src/result_renderer.rs +0 -183
- data/vendor/kreuzberg-tesseract/tests/integration_test.rs +0 -211
|
@@ -47,7 +47,6 @@
|
|
|
47
47
|
//! # detected_languages: None,
|
|
48
48
|
//! # chunks: None,
|
|
49
49
|
//! # images: None,
|
|
50
|
-
//! # pages: None,
|
|
51
50
|
//! # })
|
|
52
51
|
//! # }
|
|
53
52
|
//! # async fn extract_file(&self, _: &std::path::Path, _: &str, _: &kreuzberg::ExtractionConfig)
|
|
@@ -60,7 +59,6 @@
|
|
|
60
59
|
//! # detected_languages: None,
|
|
61
60
|
//! # chunks: None,
|
|
62
61
|
//! # images: None,
|
|
63
|
-
//! # pages: None,
|
|
64
62
|
//! # })
|
|
65
63
|
//! # }
|
|
66
64
|
//! # fn supported_mime_types(&self) -> &[&str] { &[] }
|
|
@@ -122,7 +120,6 @@
|
|
|
122
120
|
//! detected_languages: None,
|
|
123
121
|
//! chunks: None,
|
|
124
122
|
//! images: None,
|
|
125
|
-
//! pages: None,
|
|
126
123
|
//! })
|
|
127
124
|
//! }
|
|
128
125
|
//!
|
|
@@ -10,9 +10,6 @@ use async_trait::async_trait;
|
|
|
10
10
|
use std::path::Path;
|
|
11
11
|
use std::sync::Arc;
|
|
12
12
|
|
|
13
|
-
#[cfg(not(feature = "tokio-runtime"))]
|
|
14
|
-
use crate::KreuzbergError;
|
|
15
|
-
|
|
16
13
|
/// OCR backend types.
|
|
17
14
|
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
|
18
15
|
pub enum OcrBackendType {
|
|
@@ -67,7 +64,6 @@ pub enum OcrBackendType {
|
|
|
67
64
|
/// detected_languages: None,
|
|
68
65
|
/// chunks: None,
|
|
69
66
|
/// images: None,
|
|
70
|
-
/// pages: None,
|
|
71
67
|
/// })
|
|
72
68
|
/// }
|
|
73
69
|
///
|
|
@@ -85,8 +81,7 @@ pub enum OcrBackendType {
|
|
|
85
81
|
/// }
|
|
86
82
|
/// }
|
|
87
83
|
/// ```
|
|
88
|
-
#[
|
|
89
|
-
#[cfg_attr(target_arch = "wasm32", async_trait(?Send))]
|
|
84
|
+
#[async_trait]
|
|
90
85
|
pub trait OcrBackend: Plugin {
|
|
91
86
|
/// Process an image and extract text via OCR.
|
|
92
87
|
///
|
|
@@ -146,7 +141,6 @@ pub trait OcrBackend: Plugin {
|
|
|
146
141
|
/// detected_languages: None,
|
|
147
142
|
/// chunks: None,
|
|
148
143
|
/// images: None,
|
|
149
|
-
/// pages: None,
|
|
150
144
|
/// })
|
|
151
145
|
/// }
|
|
152
146
|
/// # }
|
|
@@ -167,19 +161,9 @@ pub trait OcrBackend: Plugin {
|
|
|
167
161
|
///
|
|
168
162
|
/// Same as `process_image`, plus file I/O errors.
|
|
169
163
|
async fn process_file(&self, path: &Path, config: &OcrConfig) -> Result<ExtractionResult> {
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
let bytes = io::read_file_async(path).await?;
|
|
174
|
-
self.process_image(&bytes, config).await
|
|
175
|
-
}
|
|
176
|
-
#[cfg(not(feature = "tokio-runtime"))]
|
|
177
|
-
{
|
|
178
|
-
let _ = (path, config);
|
|
179
|
-
Err(KreuzbergError::Other(
|
|
180
|
-
"File-based OCR processing requires the tokio-runtime feature".to_string(),
|
|
181
|
-
))
|
|
182
|
-
}
|
|
164
|
+
use crate::core::io;
|
|
165
|
+
let bytes = io::read_file_async(path).await?;
|
|
166
|
+
self.process_image(&bytes, config).await
|
|
183
167
|
}
|
|
184
168
|
|
|
185
169
|
/// Check if this backend supports a given language code.
|
|
@@ -268,6 +252,8 @@ pub trait OcrBackend: Plugin {
|
|
|
268
252
|
}
|
|
269
253
|
}
|
|
270
254
|
|
|
255
|
+
// Public registration APIs
|
|
256
|
+
|
|
271
257
|
/// Register an OCR backend with the global registry.
|
|
272
258
|
///
|
|
273
259
|
/// The OCR backend will be registered with its name from the `name()` method
|
|
@@ -317,7 +303,6 @@ pub trait OcrBackend: Plugin {
|
|
|
317
303
|
/// detected_languages: None,
|
|
318
304
|
/// chunks: None,
|
|
319
305
|
/// images: None,
|
|
320
|
-
/// pages: None,
|
|
321
306
|
/// })
|
|
322
307
|
/// }
|
|
323
308
|
/// fn supports_language(&self, _: &str) -> bool { true }
|
|
@@ -335,6 +320,8 @@ pub fn register_ocr_backend(backend: Arc<dyn OcrBackend>) -> crate::Result<()> {
|
|
|
335
320
|
|
|
336
321
|
let registry = get_ocr_backend_registry();
|
|
337
322
|
// ~keep: Lock poisoning indicates a panic in another thread holding the lock.
|
|
323
|
+
// This is a critical runtime error (similar to OOM) that should bubble up
|
|
324
|
+
// as it indicates the registry is in an inconsistent state.
|
|
338
325
|
let mut registry = registry
|
|
339
326
|
.write()
|
|
340
327
|
.expect("OCR backend registry lock poisoned - critical runtime error");
|
|
@@ -370,6 +357,8 @@ pub fn unregister_ocr_backend(name: &str) -> crate::Result<()> {
|
|
|
370
357
|
|
|
371
358
|
let registry = get_ocr_backend_registry();
|
|
372
359
|
// ~keep: Lock poisoning indicates a panic in another thread holding the lock.
|
|
360
|
+
// This is a critical runtime error (similar to OOM) that should bubble up
|
|
361
|
+
// as it indicates the registry is in an inconsistent state.
|
|
373
362
|
let mut registry = registry
|
|
374
363
|
.write()
|
|
375
364
|
.expect("OCR backend registry lock poisoned - critical runtime error");
|
|
@@ -403,6 +392,8 @@ pub fn list_ocr_backends() -> crate::Result<Vec<String>> {
|
|
|
403
392
|
|
|
404
393
|
let registry = get_ocr_backend_registry();
|
|
405
394
|
// ~keep: Lock poisoning indicates a panic in another thread holding the lock.
|
|
395
|
+
// This is a critical runtime error (similar to OOM) that should bubble up
|
|
396
|
+
// as it indicates the registry is in an inconsistent state.
|
|
406
397
|
let registry = registry
|
|
407
398
|
.read()
|
|
408
399
|
.expect("OCR backend registry lock poisoned - critical runtime error");
|
|
@@ -434,6 +425,8 @@ pub fn clear_ocr_backends() -> crate::Result<()> {
|
|
|
434
425
|
|
|
435
426
|
let registry = get_ocr_backend_registry();
|
|
436
427
|
// ~keep: Lock poisoning indicates a panic in another thread holding the lock.
|
|
428
|
+
// This is a critical runtime error (similar to OOM) that should bubble up
|
|
429
|
+
// as it indicates the registry is in an inconsistent state.
|
|
437
430
|
let mut registry = registry
|
|
438
431
|
.write()
|
|
439
432
|
.expect("OCR backend registry lock poisoned - critical runtime error");
|
|
@@ -478,7 +471,6 @@ mod tests {
|
|
|
478
471
|
detected_languages: None,
|
|
479
472
|
chunks: None,
|
|
480
473
|
images: None,
|
|
481
|
-
pages: None,
|
|
482
474
|
})
|
|
483
475
|
}
|
|
484
476
|
|
|
@@ -105,8 +105,7 @@ pub enum ProcessingStage {
|
|
|
105
105
|
/// }
|
|
106
106
|
/// }
|
|
107
107
|
/// ```
|
|
108
|
-
#[
|
|
109
|
-
#[cfg_attr(target_arch = "wasm32", async_trait(?Send))]
|
|
108
|
+
#[async_trait]
|
|
110
109
|
pub trait PostProcessor: Plugin {
|
|
111
110
|
/// Process an extraction result.
|
|
112
111
|
///
|
|
@@ -373,7 +372,6 @@ mod tests {
|
|
|
373
372
|
detected_languages: None,
|
|
374
373
|
chunks: None,
|
|
375
374
|
images: None,
|
|
376
|
-
pages: None,
|
|
377
375
|
};
|
|
378
376
|
|
|
379
377
|
let config = ExtractionConfig::default();
|
|
@@ -423,7 +421,6 @@ mod tests {
|
|
|
423
421
|
detected_languages: None,
|
|
424
422
|
chunks: None,
|
|
425
423
|
images: None,
|
|
426
|
-
pages: None,
|
|
427
424
|
};
|
|
428
425
|
|
|
429
426
|
let config = ExtractionConfig::default();
|
|
@@ -490,7 +487,6 @@ mod tests {
|
|
|
490
487
|
detected_languages: None,
|
|
491
488
|
chunks: None,
|
|
492
489
|
images: None,
|
|
493
|
-
pages: None,
|
|
494
490
|
};
|
|
495
491
|
|
|
496
492
|
let config = ExtractionConfig::default();
|
|
@@ -516,7 +512,6 @@ mod tests {
|
|
|
516
512
|
additional,
|
|
517
513
|
..Default::default()
|
|
518
514
|
},
|
|
519
|
-
pages: None,
|
|
520
515
|
tables: vec![],
|
|
521
516
|
detected_languages: None,
|
|
522
517
|
chunks: None,
|
|
@@ -547,7 +542,6 @@ mod tests {
|
|
|
547
542
|
detected_languages: None,
|
|
548
543
|
chunks: None,
|
|
549
544
|
images: None,
|
|
550
|
-
pages: None,
|
|
551
545
|
};
|
|
552
546
|
|
|
553
547
|
assert_eq!(processor.estimated_duration_ms(&result), 0);
|
|
@@ -598,7 +592,6 @@ mod tests {
|
|
|
598
592
|
detected_languages: None,
|
|
599
593
|
chunks: None,
|
|
600
594
|
images: None,
|
|
601
|
-
pages: None,
|
|
602
595
|
};
|
|
603
596
|
|
|
604
597
|
let txt_result = ExtractionResult {
|
|
@@ -609,7 +602,6 @@ mod tests {
|
|
|
609
602
|
detected_languages: None,
|
|
610
603
|
chunks: None,
|
|
611
604
|
images: None,
|
|
612
|
-
pages: None,
|
|
613
605
|
};
|
|
614
606
|
|
|
615
607
|
assert!(processor.should_process(&pdf_result, &config));
|
|
@@ -638,7 +630,6 @@ mod tests {
|
|
|
638
630
|
detected_languages: None,
|
|
639
631
|
chunks: None,
|
|
640
632
|
images: None,
|
|
641
|
-
pages: None,
|
|
642
633
|
};
|
|
643
634
|
|
|
644
635
|
let config = ExtractionConfig::default();
|
|
@@ -264,19 +264,10 @@ impl DocumentExtractorRegistry {
|
|
|
264
264
|
/// # Returns
|
|
265
265
|
///
|
|
266
266
|
/// The highest priority extractor, or an error if none found.
|
|
267
|
-
#[cfg_attr(feature = "otel", tracing::instrument(
|
|
268
|
-
skip(self),
|
|
269
|
-
fields(
|
|
270
|
-
registry.mime_type = %mime_type,
|
|
271
|
-
registry.found = tracing::field::Empty,
|
|
272
|
-
)
|
|
273
|
-
))]
|
|
274
267
|
pub fn get(&self, mime_type: &str) -> Result<Arc<dyn DocumentExtractor>> {
|
|
275
268
|
if let Some(priority_map) = self.extractors.get(mime_type)
|
|
276
269
|
&& let Some((_priority, extractor)) = priority_map.iter().next_back()
|
|
277
270
|
{
|
|
278
|
-
#[cfg(feature = "otel")]
|
|
279
|
-
tracing::Span::current().record("registry.found", true);
|
|
280
271
|
return Ok(Arc::clone(extractor));
|
|
281
272
|
}
|
|
282
273
|
|
|
@@ -302,13 +293,9 @@ impl DocumentExtractorRegistry {
|
|
|
302
293
|
}
|
|
303
294
|
|
|
304
295
|
if let Some((_priority, extractor)) = best_match {
|
|
305
|
-
#[cfg(feature = "otel")]
|
|
306
|
-
tracing::Span::current().record("registry.found", true);
|
|
307
296
|
return Ok(extractor);
|
|
308
297
|
}
|
|
309
298
|
|
|
310
|
-
#[cfg(feature = "otel")]
|
|
311
|
-
tracing::Span::current().record("registry.found", false);
|
|
312
299
|
Err(KreuzbergError::UnsupportedFormat(mime_type.to_string()))
|
|
313
300
|
}
|
|
314
301
|
|
|
@@ -661,7 +648,6 @@ mod tests {
|
|
|
661
648
|
detected_languages: None,
|
|
662
649
|
chunks: None,
|
|
663
650
|
images: None,
|
|
664
|
-
pages: None,
|
|
665
651
|
})
|
|
666
652
|
}
|
|
667
653
|
|
|
@@ -706,7 +692,6 @@ mod tests {
|
|
|
706
692
|
detected_languages: None,
|
|
707
693
|
chunks: None,
|
|
708
694
|
images: None,
|
|
709
|
-
pages: None,
|
|
710
695
|
})
|
|
711
696
|
}
|
|
712
697
|
|
|
@@ -68,8 +68,7 @@ use std::sync::Arc;
|
|
|
68
68
|
/// }
|
|
69
69
|
/// }
|
|
70
70
|
/// ```
|
|
71
|
-
#[
|
|
72
|
-
#[cfg_attr(target_arch = "wasm32", async_trait(?Send))]
|
|
71
|
+
#[async_trait]
|
|
73
72
|
pub trait Validator: Plugin {
|
|
74
73
|
/// Validate an extraction result.
|
|
75
74
|
///
|
|
@@ -276,6 +275,8 @@ pub trait Validator: Plugin {
|
|
|
276
275
|
}
|
|
277
276
|
}
|
|
278
277
|
|
|
278
|
+
// Public registration APIs
|
|
279
|
+
|
|
279
280
|
/// Register a validator with the global registry.
|
|
280
281
|
///
|
|
281
282
|
/// The validator will be registered with its default priority and will be called
|
|
@@ -489,7 +490,6 @@ mod tests {
|
|
|
489
490
|
detected_languages: None,
|
|
490
491
|
chunks: None,
|
|
491
492
|
images: None,
|
|
492
|
-
pages: None,
|
|
493
493
|
};
|
|
494
494
|
|
|
495
495
|
let config = ExtractionConfig::default();
|
|
@@ -508,7 +508,6 @@ mod tests {
|
|
|
508
508
|
detected_languages: None,
|
|
509
509
|
chunks: None,
|
|
510
510
|
images: None,
|
|
511
|
-
pages: None,
|
|
512
511
|
};
|
|
513
512
|
|
|
514
513
|
let config = ExtractionConfig::default();
|
|
@@ -529,7 +528,6 @@ mod tests {
|
|
|
529
528
|
detected_languages: None,
|
|
530
529
|
chunks: None,
|
|
531
530
|
images: None,
|
|
532
|
-
pages: None,
|
|
533
531
|
};
|
|
534
532
|
|
|
535
533
|
let config = ExtractionConfig::default();
|
|
@@ -565,7 +563,6 @@ mod tests {
|
|
|
565
563
|
detected_languages: None,
|
|
566
564
|
chunks: None,
|
|
567
565
|
images: None,
|
|
568
|
-
pages: None,
|
|
569
566
|
};
|
|
570
567
|
|
|
571
568
|
let config = ExtractionConfig::default();
|
|
@@ -613,7 +610,6 @@ mod tests {
|
|
|
613
610
|
detected_languages: None,
|
|
614
611
|
chunks: None,
|
|
615
612
|
images: None,
|
|
616
|
-
pages: None,
|
|
617
613
|
};
|
|
618
614
|
|
|
619
615
|
let txt_result = ExtractionResult {
|
|
@@ -624,7 +620,6 @@ mod tests {
|
|
|
624
620
|
detected_languages: None,
|
|
625
621
|
chunks: None,
|
|
626
622
|
images: None,
|
|
627
|
-
pages: None,
|
|
628
623
|
};
|
|
629
624
|
|
|
630
625
|
assert!(validator.should_validate(&pdf_result, &config));
|
|
@@ -708,7 +703,6 @@ mod tests {
|
|
|
708
703
|
detected_languages: None,
|
|
709
704
|
chunks: None,
|
|
710
705
|
images: None,
|
|
711
|
-
pages: None,
|
|
712
706
|
};
|
|
713
707
|
|
|
714
708
|
let config = ExtractionConfig::default();
|
|
@@ -736,7 +730,6 @@ mod tests {
|
|
|
736
730
|
additional,
|
|
737
731
|
..Default::default()
|
|
738
732
|
},
|
|
739
|
-
pages: None,
|
|
740
733
|
tables: vec![],
|
|
741
734
|
detected_languages: None,
|
|
742
735
|
chunks: None,
|
|
@@ -767,7 +760,6 @@ mod tests {
|
|
|
767
760
|
detected_languages: None,
|
|
768
761
|
chunks: None,
|
|
769
762
|
images: None,
|
|
770
|
-
pages: None,
|
|
771
763
|
};
|
|
772
764
|
|
|
773
765
|
let config = ExtractionConfig::default();
|
|
@@ -796,7 +788,6 @@ mod tests {
|
|
|
796
788
|
detected_languages: None,
|
|
797
789
|
chunks: None,
|
|
798
790
|
images: None,
|
|
799
|
-
pages: None,
|
|
800
791
|
};
|
|
801
792
|
|
|
802
793
|
assert!(validator.validate(&result, &config).await.is_ok());
|
|
@@ -815,15 +806,15 @@ mod tests {
|
|
|
815
806
|
detected_languages: None,
|
|
816
807
|
chunks: None,
|
|
817
808
|
images: None,
|
|
818
|
-
pages: None,
|
|
819
809
|
};
|
|
820
810
|
|
|
821
811
|
let config = ExtractionConfig::default();
|
|
822
812
|
assert!(validator.validate(&result, &config).await.is_ok());
|
|
823
813
|
}
|
|
824
814
|
|
|
815
|
+
// Tests for public registration APIs
|
|
816
|
+
|
|
825
817
|
#[test]
|
|
826
|
-
#[serial_test::serial]
|
|
827
818
|
fn test_register_validator() {
|
|
828
819
|
use std::sync::Arc;
|
|
829
820
|
|
|
@@ -835,7 +826,6 @@ mod tests {
|
|
|
835
826
|
}
|
|
836
827
|
|
|
837
828
|
#[test]
|
|
838
|
-
#[serial_test::serial]
|
|
839
829
|
fn test_unregister_validator() {
|
|
840
830
|
use std::sync::Arc;
|
|
841
831
|
|
|
@@ -847,20 +837,19 @@ mod tests {
|
|
|
847
837
|
}
|
|
848
838
|
|
|
849
839
|
#[test]
|
|
850
|
-
#[serial_test::serial]
|
|
851
840
|
fn test_unregister_nonexistent_validator() {
|
|
852
841
|
let result = super::unregister_validator("nonexistent-validator-xyz");
|
|
853
842
|
assert!(result.is_ok());
|
|
854
843
|
}
|
|
855
844
|
|
|
856
845
|
#[test]
|
|
857
|
-
#[serial_test::serial]
|
|
858
846
|
fn test_list_validators() {
|
|
859
847
|
use std::sync::Arc;
|
|
860
848
|
|
|
861
849
|
super::clear_validators().unwrap();
|
|
862
850
|
|
|
863
851
|
let validator1 = Arc::new(MockValidator { should_fail: false });
|
|
852
|
+
// Both validators have the same name, so only one will be registered
|
|
864
853
|
let validator2 = Arc::new(MockValidator { should_fail: false });
|
|
865
854
|
|
|
866
855
|
let list_before = super::list_validators().unwrap();
|
|
@@ -870,6 +859,7 @@ mod tests {
|
|
|
870
859
|
super::register_validator(validator2).unwrap();
|
|
871
860
|
|
|
872
861
|
let list = super::list_validators().unwrap();
|
|
862
|
+
// Only 1 validator registered since they have the same name
|
|
873
863
|
assert_eq!(list.len(), 1);
|
|
874
864
|
assert!(list.contains(&"mock-validator".to_string()));
|
|
875
865
|
|
|
@@ -877,7 +867,6 @@ mod tests {
|
|
|
877
867
|
}
|
|
878
868
|
|
|
879
869
|
#[test]
|
|
880
|
-
#[serial_test::serial]
|
|
881
870
|
fn test_clear_validators() {
|
|
882
871
|
use std::sync::Arc;
|
|
883
872
|
|
|
@@ -889,6 +878,7 @@ mod tests {
|
|
|
889
878
|
super::register_validator(validator1).unwrap();
|
|
890
879
|
super::register_validator(validator2).unwrap();
|
|
891
880
|
|
|
881
|
+
// Verify at least one validator is registered
|
|
892
882
|
let list_before = super::list_validators().unwrap();
|
|
893
883
|
assert!(!list_before.is_empty());
|
|
894
884
|
|
|
@@ -900,7 +890,6 @@ mod tests {
|
|
|
900
890
|
}
|
|
901
891
|
|
|
902
892
|
#[test]
|
|
903
|
-
#[serial_test::serial]
|
|
904
893
|
fn test_register_validator_with_invalid_name() {
|
|
905
894
|
use std::sync::Arc;
|
|
906
895
|
|
|
@@ -933,7 +922,6 @@ mod tests {
|
|
|
933
922
|
}
|
|
934
923
|
|
|
935
924
|
#[test]
|
|
936
|
-
#[serial_test::serial]
|
|
937
925
|
fn test_register_validator_with_empty_name() {
|
|
938
926
|
use std::sync::Arc;
|
|
939
927
|
|
|
@@ -100,7 +100,7 @@ macro_rules! embed_stopwords {
|
|
|
100
100
|
panic!(
|
|
101
101
|
"Failed to parse embedded stopwords for language '{}': {}. \
|
|
102
102
|
This indicates corrupted or malformed JSON in the embedded stopwords data. \
|
|
103
|
-
Please report this issue at https://github.com/
|
|
103
|
+
Please report this issue at https://github.com/Goldziher/kreuzberg/issues",
|
|
104
104
|
$lang, e
|
|
105
105
|
);
|
|
106
106
|
}
|
|
@@ -1437,7 +1437,7 @@ mod tests {
|
|
|
1437
1437
|
let duration = start.elapsed();
|
|
1438
1438
|
|
|
1439
1439
|
assert!(
|
|
1440
|
-
duration.as_millis() <
|
|
1440
|
+
duration.as_millis() < 100,
|
|
1441
1441
|
"30,000 lookups took too long: {:?}",
|
|
1442
1442
|
duration
|
|
1443
1443
|
);
|
|
@@ -1,5 +1,3 @@
|
|
|
1
|
-
pub mod utf8_validation;
|
|
2
|
-
|
|
3
1
|
#[cfg(feature = "quality")]
|
|
4
2
|
pub mod quality;
|
|
5
3
|
|
|
@@ -9,15 +7,9 @@ pub mod string_utils;
|
|
|
9
7
|
#[cfg(feature = "quality")]
|
|
10
8
|
pub mod token_reduction;
|
|
11
9
|
|
|
12
|
-
#[cfg(feature = "quality")]
|
|
13
|
-
pub mod quality_processor;
|
|
14
|
-
|
|
15
10
|
#[cfg(feature = "quality")]
|
|
16
11
|
pub use quality::{calculate_quality_score, clean_extracted_text, normalize_spaces};
|
|
17
12
|
|
|
18
|
-
#[cfg(feature = "quality")]
|
|
19
|
-
pub use quality_processor::QualityProcessor;
|
|
20
|
-
|
|
21
13
|
#[cfg(feature = "quality")]
|
|
22
14
|
pub use string_utils::{calculate_text_confidence, fix_mojibake, get_encoding_cache_key, safe_decode};
|
|
23
15
|
|
|
@@ -39,23 +39,6 @@ static MALFORMED_WORDS_PATTERN: Lazy<Regex> = Lazy::new(|| {
|
|
|
39
39
|
static EXCESSIVE_WHITESPACE_PATTERN: Lazy<Regex> =
|
|
40
40
|
Lazy::new(|| Regex::new(r"\s{3,}").expect("Excessive whitespace regex pattern is valid and should compile"));
|
|
41
41
|
|
|
42
|
-
/// Combined OCR artifact pattern for single-pass scanning (used in calculate_ocr_penalty).
|
|
43
|
-
/// This pattern combines 5 of the 6 OCR patterns with alternation to reduce regex passes
|
|
44
|
-
/// from 5 separate find_iter calls to 1. The dash pattern is handled separately due to
|
|
45
|
-
/// line-based context checking.
|
|
46
|
-
static COMBINED_OCR_ARTIFACTS_PATTERN: Lazy<Regex> = Lazy::new(|| {
|
|
47
|
-
Regex::new(
|
|
48
|
-
r"(?x)
|
|
49
|
-
\b[a-zA-Z]\s{2,}[a-zA-Z]\s{2,}[a-zA-Z]\b | # Scattered chars
|
|
50
|
-
[.]{3,}|[_]{3,} | # Repeated punctuation
|
|
51
|
-
\s[.,;:!?]\s | # Isolated punctuation
|
|
52
|
-
\b[a-zA-Z]+[0-9]+[a-zA-Z]+[a-zA-Z0-9]*\b | # Malformed words
|
|
53
|
-
\s{3,} # Excessive whitespace
|
|
54
|
-
",
|
|
55
|
-
)
|
|
56
|
-
.expect("Combined OCR artifacts regex pattern is valid and should compile")
|
|
57
|
-
});
|
|
58
|
-
|
|
59
42
|
static JS_FUNCTION_PATTERN: Lazy<Regex> = Lazy::new(|| {
|
|
60
43
|
Regex::new(r"(?i)function\s+\w+\s*\([^)]*\)\s*\{[^}]*\}")
|
|
61
44
|
.expect("JavaScript function regex pattern is valid and should compile")
|
|
@@ -123,7 +106,7 @@ where
|
|
|
123
106
|
}
|
|
124
107
|
}
|
|
125
108
|
|
|
126
|
-
pub fn calculate_quality_score(text: &str, metadata: Option<&HashMap<String,
|
|
109
|
+
pub fn calculate_quality_score(text: &str, metadata: Option<&HashMap<String, String>>) -> f64 {
|
|
127
110
|
if text.is_empty() || text.trim().is_empty() {
|
|
128
111
|
return 0.0;
|
|
129
112
|
}
|
|
@@ -168,8 +151,12 @@ fn calculate_ocr_penalty(text: &str, total_chars: f64) -> f64 {
|
|
|
168
151
|
return 0.0;
|
|
169
152
|
}
|
|
170
153
|
|
|
171
|
-
let artifact_chars =
|
|
172
|
-
sum_match_lengths(text, &
|
|
154
|
+
let artifact_chars = sum_match_lengths(text, &SCATTERED_CHARS_PATTERN)
|
|
155
|
+
+ sum_match_lengths(text, &REPEATED_PUNCT_PATTERN)
|
|
156
|
+
+ count_non_table_dash_artifacts(text)
|
|
157
|
+
+ sum_match_lengths(text, &ISOLATED_PUNCT_PATTERN)
|
|
158
|
+
+ sum_match_lengths(text, &MALFORMED_WORDS_PATTERN)
|
|
159
|
+
+ sum_match_lengths(text, &EXCESSIVE_WHITESPACE_PATTERN);
|
|
173
160
|
|
|
174
161
|
(artifact_chars as f64 / total_chars).min(1.0)
|
|
175
162
|
}
|
|
@@ -266,7 +253,7 @@ fn calculate_structure_bonus(text: &str) -> f64 {
|
|
|
266
253
|
}
|
|
267
254
|
|
|
268
255
|
#[inline]
|
|
269
|
-
fn calculate_metadata_bonus(metadata: &HashMap<String,
|
|
256
|
+
fn calculate_metadata_bonus(metadata: &HashMap<String, String>) -> f64 {
|
|
270
257
|
const IMPORTANT_FIELDS: &[&str] = &["title", "author", "subject", "description", "keywords"];
|
|
271
258
|
|
|
272
259
|
let present_fields = IMPORTANT_FIELDS
|
|
@@ -492,8 +479,8 @@ mod tests {
|
|
|
492
479
|
fn test_calculate_quality_score_with_metadata() {
|
|
493
480
|
let text = "This is a normal text with proper structure.";
|
|
494
481
|
let mut metadata = HashMap::new();
|
|
495
|
-
metadata.insert("title".to_string(),
|
|
496
|
-
metadata.insert("author".to_string(),
|
|
482
|
+
metadata.insert("title".to_string(), "Test Title".to_string());
|
|
483
|
+
metadata.insert("author".to_string(), "Test Author".to_string());
|
|
497
484
|
|
|
498
485
|
let score = calculate_quality_score(text, Some(&metadata));
|
|
499
486
|
assert!(score > 0.0);
|
|
@@ -566,11 +553,11 @@ mod tests {
|
|
|
566
553
|
#[test]
|
|
567
554
|
fn test_calculate_metadata_bonus_full() {
|
|
568
555
|
let mut metadata = HashMap::new();
|
|
569
|
-
metadata.insert("title".to_string(),
|
|
570
|
-
metadata.insert("author".to_string(),
|
|
571
|
-
metadata.insert("subject".to_string(),
|
|
572
|
-
metadata.insert("description".to_string(),
|
|
573
|
-
metadata.insert("keywords".to_string(),
|
|
556
|
+
metadata.insert("title".to_string(), "Title".to_string());
|
|
557
|
+
metadata.insert("author".to_string(), "Author".to_string());
|
|
558
|
+
metadata.insert("subject".to_string(), "Subject".to_string());
|
|
559
|
+
metadata.insert("description".to_string(), "Description".to_string());
|
|
560
|
+
metadata.insert("keywords".to_string(), "Keywords".to_string());
|
|
574
561
|
|
|
575
562
|
let bonus = calculate_metadata_bonus(&metadata);
|
|
576
563
|
assert_eq!(bonus, 1.0);
|
|
@@ -45,9 +45,7 @@ fn calculate_cache_key(data: &[u8]) -> String {
|
|
|
45
45
|
let sample = if data.len() > 1024 { &data[..1024] } else { data };
|
|
46
46
|
sample.hash(&mut hasher);
|
|
47
47
|
data.len().hash(&mut hasher);
|
|
48
|
-
|
|
49
|
-
result.push_str(&format!("{:x}", hasher.finish()));
|
|
50
|
-
result
|
|
48
|
+
format!("{:x}", hasher.finish())
|
|
51
49
|
}
|
|
52
50
|
|
|
53
51
|
pub fn safe_decode(byte_data: &[u8], encoding: Option<&str>) -> String {
|
|
@@ -59,7 +57,7 @@ pub fn safe_decode(byte_data: &[u8], encoding: Option<&str>) -> String {
|
|
|
59
57
|
&& let Some(enc) = Encoding::for_label(enc_name.as_bytes())
|
|
60
58
|
{
|
|
61
59
|
let (decoded, _, _) = enc.decode(byte_data);
|
|
62
|
-
return fix_mojibake_internal(&decoded)
|
|
60
|
+
return fix_mojibake_internal(&decoded);
|
|
63
61
|
}
|
|
64
62
|
|
|
65
63
|
let cache_key = calculate_cache_key(byte_data);
|
|
@@ -68,7 +66,7 @@ pub fn safe_decode(byte_data: &[u8], encoding: Option<&str>) -> String {
|
|
|
68
66
|
&& let Some(&cached_encoding) = cache.get(&cache_key)
|
|
69
67
|
{
|
|
70
68
|
let (decoded, _, _) = cached_encoding.decode(byte_data);
|
|
71
|
-
return fix_mojibake_internal(&decoded)
|
|
69
|
+
return fix_mojibake_internal(&decoded);
|
|
72
70
|
}
|
|
73
71
|
|
|
74
72
|
let mut detector = EncodingDetector::new();
|
|
@@ -95,23 +93,17 @@ pub fn safe_decode(byte_data: &[u8], encoding: Option<&str>) -> String {
|
|
|
95
93
|
if let Some(enc) = Encoding::for_label(enc_name.as_bytes()) {
|
|
96
94
|
let (test_decoded, _, test_errors) = enc.decode(byte_data);
|
|
97
95
|
if !test_errors && calculate_text_confidence_internal(&test_decoded) > 0.5 {
|
|
98
|
-
return fix_mojibake_internal(&test_decoded)
|
|
96
|
+
return fix_mojibake_internal(&test_decoded);
|
|
99
97
|
}
|
|
100
98
|
}
|
|
101
99
|
}
|
|
102
100
|
}
|
|
103
101
|
|
|
104
|
-
fix_mojibake_internal(&decoded)
|
|
102
|
+
fix_mojibake_internal(&decoded)
|
|
105
103
|
}
|
|
106
104
|
|
|
107
105
|
pub fn get_encoding_cache_key(data_hash: &str, size: usize) -> String {
|
|
108
|
-
|
|
109
|
-
let mut result = String::with_capacity(estimated_capacity);
|
|
110
|
-
result.push_str(data_hash);
|
|
111
|
-
result.push(':');
|
|
112
|
-
use std::fmt::Write;
|
|
113
|
-
let _ = write!(result, "{}", size);
|
|
114
|
-
result
|
|
106
|
+
format!("{}:{}", data_hash, size)
|
|
115
107
|
}
|
|
116
108
|
|
|
117
109
|
pub fn calculate_text_confidence(text: &str) -> f64 {
|
|
@@ -149,16 +141,12 @@ fn calculate_text_confidence_internal(text: &str) -> f64 {
|
|
|
149
141
|
}
|
|
150
142
|
|
|
151
143
|
pub fn fix_mojibake(text: &str) -> String {
|
|
152
|
-
fix_mojibake_internal(text)
|
|
144
|
+
fix_mojibake_internal(text)
|
|
153
145
|
}
|
|
154
146
|
|
|
155
|
-
fn fix_mojibake_internal(text: &str) ->
|
|
147
|
+
fn fix_mojibake_internal(text: &str) -> String {
|
|
156
148
|
if text.is_empty() {
|
|
157
|
-
return
|
|
158
|
-
}
|
|
159
|
-
|
|
160
|
-
if !CONTROL_CHARS.is_match(text) && !REPLACEMENT_CHARS.is_match(text) && !ISOLATED_COMBINING.is_match(text) {
|
|
161
|
-
return Cow::Borrowed(text);
|
|
149
|
+
return text.to_string();
|
|
162
150
|
}
|
|
163
151
|
|
|
164
152
|
let replacements = [
|
|
@@ -167,7 +155,7 @@ fn fix_mojibake_internal(text: &str) -> Cow<'_, str> {
|
|
|
167
155
|
(&*ISOLATED_COMBINING, ""),
|
|
168
156
|
];
|
|
169
157
|
|
|
170
|
-
chain_replacements(Cow::Borrowed(text), &replacements)
|
|
158
|
+
chain_replacements(Cow::Borrowed(text), &replacements).into_owned()
|
|
171
159
|
}
|
|
172
160
|
|
|
173
161
|
#[cfg(test)]
|