kreuzberg 4.0.8 → 4.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
- data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
- data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
- data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
- data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
- data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
- data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
- data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
- data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
- data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
- data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
- data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
- data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
- data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
- data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
- data/ext/kreuzberg_rb/native/src/result.rs +326 -0
- data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
- data/lib/kreuzberg/config.rb +99 -2
- data/lib/kreuzberg/result.rb +107 -2
- data/lib/kreuzberg/types.rb +104 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +0 -4
- data/sig/kreuzberg.rbs +105 -1
- data/spec/fixtures/config.toml +1 -1
- data/spec/fixtures/config.yaml +1 -1
- data/vendor/Cargo.toml +3 -3
- data/vendor/kreuzberg/Cargo.toml +5 -4
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/api/config.rs +69 -0
- data/vendor/kreuzberg/src/api/handlers.rs +99 -2
- data/vendor/kreuzberg/src/api/mod.rs +14 -7
- data/vendor/kreuzberg/src/api/router.rs +214 -0
- data/vendor/kreuzberg/src/api/startup.rs +243 -0
- data/vendor/kreuzberg/src/api/types.rs +78 -0
- data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
- data/vendor/kreuzberg/src/cache/core.rs +428 -0
- data/vendor/kreuzberg/src/cache/mod.rs +21 -843
- data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
- data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
- data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
- data/vendor/kreuzberg/src/chunking/config.rs +52 -0
- data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
- data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
- data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
- data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
- data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
- data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
- data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
- data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
- data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
- data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
- data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
- data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
- data/vendor/kreuzberg/src/core/config/page.rs +57 -0
- data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
- data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
- data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
- data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
- data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
- data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
- data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
- data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
- data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
- data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
- data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
- data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
- data/vendor/kreuzberg/src/core/mime.rs +15 -0
- data/vendor/kreuzberg/src/core/mod.rs +4 -1
- data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
- data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
- data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
- data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
- data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
- data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
- data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
- data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
- data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
- data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
- data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
- data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
- data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
- data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
- data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
- data/vendor/kreuzberg/src/embeddings.rs +136 -13
- data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
- data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
- data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
- data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
- data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
- data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
- data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
- data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
- data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
- data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
- data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
- data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
- data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
- data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
- data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
- data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
- data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
- data/vendor/kreuzberg/src/extraction/pptx/parser.rs +388 -0
- data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
- data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
- data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
- data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
- data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
- data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
- data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
- data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
- data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
- data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
- data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
- data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
- data/vendor/kreuzberg/src/extractors/email.rs +2 -0
- data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
- data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
- data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
- data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
- data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
- data/vendor/kreuzberg/src/extractors/html.rs +80 -8
- data/vendor/kreuzberg/src/extractors/image.rs +8 -1
- data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
- data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
- data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
- data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
- data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
- data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
- data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
- data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
- data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
- data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
- data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
- data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
- data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
- data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
- data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
- data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
- data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
- data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
- data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
- data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
- data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
- data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
- data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
- data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
- data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
- data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
- data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
- data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
- data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
- data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
- data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
- data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
- data/vendor/kreuzberg/src/extractors/text.rs +4 -0
- data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
- data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
- data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
- data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
- data/vendor/kreuzberg/src/lib.rs +2 -2
- data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
- data/vendor/kreuzberg/src/mcp/format.rs +211 -0
- data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
- data/vendor/kreuzberg/src/mcp/params.rs +196 -0
- data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
- data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
- data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
- data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
- data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
- data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
- data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
- data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
- data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
- data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
- data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
- data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
- data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
- data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
- data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
- data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
- data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
- data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
- data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
- data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
- data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
- data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
- data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
- data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
- data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
- data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
- data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
- data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
- data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
- data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
- data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
- data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
- data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
- data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
- data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
- data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
- data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
- data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
- data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
- data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
- data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
- data/vendor/kreuzberg/src/text/quality.rs +1 -1
- data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
- data/vendor/kreuzberg/src/types/djot.rs +209 -0
- data/vendor/kreuzberg/src/types/extraction.rs +301 -0
- data/vendor/kreuzberg/src/types/formats.rs +443 -0
- data/vendor/kreuzberg/src/types/metadata.rs +560 -0
- data/vendor/kreuzberg/src/types/mod.rs +281 -0
- data/vendor/kreuzberg/src/types/page.rs +182 -0
- data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
- data/vendor/kreuzberg/src/types/tables.rs +39 -0
- data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
- data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
- data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
- data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
- data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
- data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
- data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
- data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
- data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
- data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
- data/vendor/kreuzberg/tests/api_embed.rs +6 -9
- data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
- data/vendor/kreuzberg/tests/core_integration.rs +1 -0
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
- data/vendor/kreuzberg/tests/format_integration.rs +2 -0
- data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
- data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
- data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
- data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
- data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
- data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
- data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
- data/vendor/kreuzberg/tests/pptx_regression_tests.rs +504 -0
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
- data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
- data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
- data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
- data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
- data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
- data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
- data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
- data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
- data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
- data/vendor/kreuzberg-ffi/src/error.rs +46 -14
- data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
- data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
- data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
- data/vendor/kreuzberg-ffi/src/result.rs +148 -122
- data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
- data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
- metadata +201 -28
- data/vendor/kreuzberg/src/api/server.rs +0 -518
- data/vendor/kreuzberg/src/core/config.rs +0 -1914
- data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
- data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
- data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
- data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
- data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
- data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
- data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
- data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
- data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
- data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
- data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
- data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
- data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
- data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
- data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
- data/vendor/kreuzberg/src/types.rs +0 -1713
- data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
- data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
|
@@ -1,316 +1,197 @@
|
|
|
1
|
+
//! Quality scoring and text cleaning utilities
|
|
2
|
+
//!
|
|
3
|
+
//! This module provides comprehensive quality assessment and cleaning
|
|
4
|
+
//! for extracted text, including OCR artifact detection, script removal,
|
|
5
|
+
//! and whitespace normalization.
|
|
6
|
+
|
|
7
|
+
mod heuristics;
|
|
8
|
+
mod patterns;
|
|
9
|
+
mod scoring;
|
|
10
|
+
|
|
11
|
+
// Re-export public API
|
|
12
|
+
pub use scoring::calculate_quality_score;
|
|
13
|
+
|
|
1
14
|
use crate::text::utf8_validation;
|
|
2
|
-
use ahash::AHashMap;
|
|
3
15
|
use memchr::{memchr, memchr3};
|
|
4
|
-
use
|
|
16
|
+
use patterns::*;
|
|
5
17
|
use regex::Regex;
|
|
6
18
|
use std::borrow::Cow;
|
|
7
19
|
|
|
8
20
|
// ============================================================================
|
|
21
|
+
// Text Cleaning and Normalization
|
|
9
22
|
// ============================================================================
|
|
10
23
|
|
|
11
|
-
|
|
12
|
-
const SCRIPT_PENALTY_WEIGHT: f64 = 0.2;
|
|
13
|
-
const NAV_PENALTY_WEIGHT: f64 = 0.1;
|
|
14
|
-
const STRUCTURE_BONUS_WEIGHT: f64 = 0.2;
|
|
15
|
-
const METADATA_BONUS_WEIGHT: f64 = 0.1;
|
|
16
|
-
|
|
17
|
-
const MIN_TEXT_LENGTH: usize = 10;
|
|
18
|
-
const LARGE_TEXT_LENGTH: usize = 1000;
|
|
19
|
-
const MIN_SENTENCE_WORDS: f64 = 10.0;
|
|
20
|
-
const MAX_SENTENCE_WORDS: f64 = 30.0;
|
|
21
|
-
const MIN_PARAGRAPH_WORDS: f64 = 50.0;
|
|
22
|
-
const MAX_PARAGRAPH_WORDS: f64 = 300.0;
|
|
23
|
-
|
|
24
|
-
static SCATTERED_CHARS_PATTERN: Lazy<Regex> = Lazy::new(|| {
|
|
25
|
-
Regex::new(r"\b[a-zA-Z]\s{2,}[a-zA-Z]\s{2,}[a-zA-Z]\b")
|
|
26
|
-
.expect("Scattered chars regex pattern is valid and should compile")
|
|
27
|
-
});
|
|
28
|
-
static REPEATED_PUNCT_PATTERN: Lazy<Regex> = Lazy::new(|| {
|
|
29
|
-
Regex::new(r"[.]{3,}|[_]{3,}").expect("Repeated punctuation regex pattern is valid and should compile")
|
|
30
|
-
});
|
|
31
|
-
static DASH_PATTERN: Lazy<Regex> =
|
|
32
|
-
Lazy::new(|| Regex::new(r"[-]{3,}").expect("Dash pattern regex is valid and should compile"));
|
|
33
|
-
static ISOLATED_PUNCT_PATTERN: Lazy<Regex> =
|
|
34
|
-
Lazy::new(|| Regex::new(r"\s[.,;:!?]\s").expect("Isolated punctuation regex pattern is valid and should compile"));
|
|
35
|
-
static MALFORMED_WORDS_PATTERN: Lazy<Regex> = Lazy::new(|| {
|
|
36
|
-
Regex::new(r"\b[a-zA-Z]+[0-9]+[a-zA-Z]+[a-zA-Z0-9]*\b")
|
|
37
|
-
.expect("Malformed words regex pattern is valid and should compile")
|
|
38
|
-
});
|
|
39
|
-
static EXCESSIVE_WHITESPACE_PATTERN: Lazy<Regex> =
|
|
40
|
-
Lazy::new(|| Regex::new(r"\s{3,}").expect("Excessive whitespace regex pattern is valid and should compile"));
|
|
41
|
-
|
|
42
|
-
static JS_FUNCTION_PATTERN: Lazy<Regex> = Lazy::new(|| {
|
|
43
|
-
Regex::new(r"(?i)function\s+\w+\s*\([^)]*\)\s*\{[^}]*\}")
|
|
44
|
-
.expect("JavaScript function regex pattern is valid and should compile")
|
|
45
|
-
});
|
|
46
|
-
static CSS_RULES_PATTERN: Lazy<Regex> = Lazy::new(|| {
|
|
47
|
-
Regex::new(r"(?i)\.[a-zA-Z][\w-]*\s*\{[^}]*\}").expect("CSS rules regex pattern is valid and should compile")
|
|
48
|
-
});
|
|
49
|
-
static SCRIPT_TAG_PATTERN: Lazy<Regex> = Lazy::new(|| {
|
|
50
|
-
Regex::new(r"(?is)<script[^>]*>.*?</script>").expect("Script tag regex pattern is valid and should compile")
|
|
51
|
-
});
|
|
52
|
-
static STYLE_TAG_PATTERN: Lazy<Regex> = Lazy::new(|| {
|
|
53
|
-
Regex::new(r"(?is)<style[^>]*>.*?</style>").expect("Style tag regex pattern is valid and should compile")
|
|
54
|
-
});
|
|
55
|
-
|
|
56
|
-
static NAV_WORDS_PATTERN: Lazy<Regex> = Lazy::new(|| {
|
|
57
|
-
Regex::new(r"(?i)\b(?:Skip to main content|Back to top|Main navigation|Site navigation)\b")
|
|
58
|
-
.expect("Navigation words regex pattern is valid and should compile")
|
|
59
|
-
});
|
|
60
|
-
static BREADCRUMB_PATTERN: Lazy<Regex> = Lazy::new(|| {
|
|
61
|
-
Regex::new(r"(?:Home\s*[>»]\s*|[>»]\s*){2,}").expect("Breadcrumb regex pattern is valid and should compile")
|
|
62
|
-
});
|
|
63
|
-
static PAGINATION_PATTERN: Lazy<Regex> = Lazy::new(|| {
|
|
64
|
-
Regex::new(r"(?i)\b(?:Page \d+ of \d+|First page|Last page|Previous page|Next page|^\d+ of \d+$)\b")
|
|
65
|
-
.expect("Pagination regex pattern is valid and should compile")
|
|
66
|
-
});
|
|
67
|
-
|
|
68
|
-
static SENTENCE_DETECT: Lazy<Regex> =
|
|
69
|
-
Lazy::new(|| Regex::new(r"[.!?]\s+[A-Z]").expect("Sentence detection regex pattern is valid and should compile"));
|
|
70
|
-
static PUNCTUATION_DETECT: Lazy<Regex> =
|
|
71
|
-
Lazy::new(|| Regex::new(r"[.!?]").expect("Punctuation detection regex pattern is valid and should compile"));
|
|
72
|
-
|
|
73
|
-
static WHITESPACE_NORMALIZE: Lazy<Regex> = Lazy::new(|| {
|
|
74
|
-
Regex::new(r"[ \t\f\v\r\xa0\u{2000}-\u{200b}\u{2028}\u{2029}\u{3000}]+")
|
|
75
|
-
.expect("Whitespace normalization regex pattern is valid and should compile")
|
|
76
|
-
});
|
|
77
|
-
static NEWLINE_NORMALIZE: Lazy<Regex> = Lazy::new(|| {
|
|
78
|
-
Regex::new(r"\n\s*\n\s*\n+").expect("Newline normalization regex pattern is valid and should compile")
|
|
79
|
-
});
|
|
80
|
-
static NEWLINE_CLEANUP: Lazy<Regex> =
|
|
81
|
-
Lazy::new(|| Regex::new(r"\n+").expect("Newline cleanup regex pattern is valid and should compile"));
|
|
82
|
-
|
|
83
|
-
#[inline]
|
|
84
|
-
fn sum_match_lengths(text: &str, pattern: &Regex) -> usize {
|
|
85
|
-
pattern.find_iter(text).map(|m| m.len()).sum()
|
|
86
|
-
}
|
|
87
|
-
|
|
88
|
-
fn chain_replacements<'a>(mut text: Cow<'a, str>, replacements: &[(&Regex, &str)]) -> Cow<'a, str> {
|
|
89
|
-
for (pattern, replacement) in replacements {
|
|
90
|
-
if pattern.is_match(&text) {
|
|
91
|
-
text = Cow::Owned(pattern.replace_all(&text, *replacement).into_owned());
|
|
92
|
-
}
|
|
93
|
-
}
|
|
94
|
-
text
|
|
95
|
-
}
|
|
96
|
-
|
|
97
|
-
#[inline]
|
|
98
|
-
fn replace_with_if_matches<'a, F>(text: &'a str, pattern: &Regex, replacer: F) -> Cow<'a, str>
|
|
99
|
-
where
|
|
100
|
-
F: FnMut(&regex::Captures) -> String,
|
|
101
|
-
{
|
|
102
|
-
if pattern.is_match(text) {
|
|
103
|
-
Cow::Owned(pattern.replace_all(text, replacer).into_owned())
|
|
104
|
-
} else {
|
|
105
|
-
Cow::Borrowed(text)
|
|
106
|
-
}
|
|
107
|
-
}
|
|
108
|
-
|
|
109
|
-
/// Compute a heuristic score (0.0–1.0) describing how clean the extracted text is.
|
|
110
|
-
///
|
|
111
|
-
/// The scoring pipeline rewards well-structured prose while penalising OCR artefacts,
|
|
112
|
-
/// embedded scripts, and navigation chrome. Supplying document metadata allows the
|
|
113
|
-
/// function to include contextual bonuses.
|
|
114
|
-
///
|
|
115
|
-
/// ```rust
|
|
116
|
-
/// use ahash::AHashMap;
|
|
117
|
-
/// use kreuzberg::utils::quality::calculate_quality_score;
|
|
24
|
+
/// Apply the quality heuristics and return a cleaned representation of the text.
|
|
118
25
|
///
|
|
119
|
-
///
|
|
120
|
-
///
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
if text.is_empty() || text.trim().is_empty() {
|
|
125
|
-
return 0.0;
|
|
26
|
+
/// This function normalises whitespace, removes navigation boilerplate, and strips
|
|
27
|
+
/// repeated punctuation that commonly appears in OCR output.
|
|
28
|
+
pub fn clean_extracted_text(text: &str) -> String {
|
|
29
|
+
if text.is_empty() {
|
|
30
|
+
return String::new();
|
|
126
31
|
}
|
|
127
32
|
|
|
128
|
-
let
|
|
33
|
+
let mut working_text = Cow::Borrowed(text);
|
|
129
34
|
|
|
130
|
-
|
|
131
|
-
return 0.1;
|
|
132
|
-
}
|
|
35
|
+
working_text = clean_scripts(working_text);
|
|
133
36
|
|
|
134
|
-
|
|
37
|
+
working_text = clean_ocr_artifacts_cow(working_text);
|
|
135
38
|
|
|
136
|
-
|
|
137
|
-
let ocr_penalty = calculate_ocr_penalty(text, total_chars);
|
|
138
|
-
let script_penalty = calculate_script_penalty(text, total_chars);
|
|
139
|
-
let nav_penalty = calculate_navigation_penalty(text, total_chars);
|
|
140
|
-
let structure_bonus = calculate_structure_bonus(text);
|
|
39
|
+
working_text = clean_navigation_elements_cow(working_text);
|
|
141
40
|
|
|
142
|
-
|
|
143
|
-
score -= script_penalty * SCRIPT_PENALTY_WEIGHT;
|
|
144
|
-
score -= nav_penalty * NAV_PENALTY_WEIGHT;
|
|
145
|
-
score += structure_bonus * STRUCTURE_BONUS_WEIGHT;
|
|
146
|
-
} else {
|
|
147
|
-
score -= calculate_ocr_penalty(text, total_chars) * OCR_PENALTY_WEIGHT;
|
|
148
|
-
score += calculate_structure_bonus(text) * STRUCTURE_BONUS_WEIGHT;
|
|
149
|
-
}
|
|
41
|
+
working_text = clean_repeated_punctuation_cow(working_text);
|
|
150
42
|
|
|
151
|
-
|
|
152
|
-
score += calculate_metadata_bonus(metadata) * METADATA_BONUS_WEIGHT;
|
|
153
|
-
}
|
|
43
|
+
working_text = normalize_whitespace_cow(working_text);
|
|
154
44
|
|
|
155
|
-
|
|
45
|
+
working_text.trim().to_string()
|
|
156
46
|
}
|
|
157
47
|
|
|
158
|
-
|
|
159
|
-
fn
|
|
160
|
-
if
|
|
161
|
-
return
|
|
48
|
+
/// Collapse redundant whitespace while preserving paragraph boundaries.
|
|
49
|
+
pub fn normalize_spaces(text: &str) -> String {
|
|
50
|
+
if text.is_empty() || text.trim().is_empty() {
|
|
51
|
+
return String::new();
|
|
162
52
|
}
|
|
163
53
|
|
|
164
|
-
|
|
165
|
-
return 0.0;
|
|
166
|
-
}
|
|
54
|
+
let mut result = String::with_capacity(text.len());
|
|
167
55
|
|
|
168
|
-
let
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
56
|
+
let mut first = true;
|
|
57
|
+
for paragraph in text.split("\n\n") {
|
|
58
|
+
let trimmed = paragraph.trim();
|
|
59
|
+
if trimmed.is_empty() {
|
|
60
|
+
continue;
|
|
61
|
+
}
|
|
174
62
|
|
|
175
|
-
|
|
176
|
-
|
|
63
|
+
if !first {
|
|
64
|
+
result.push_str("\n\n");
|
|
65
|
+
}
|
|
66
|
+
first = false;
|
|
177
67
|
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
68
|
+
let collapsed = if let Some(fast) = normalize_whitespace_ascii(paragraph) {
|
|
69
|
+
Cow::Owned(fast)
|
|
70
|
+
} else {
|
|
71
|
+
Cow::Owned(WHITESPACE_NORMALIZE.replace_all(paragraph, " ").into_owned())
|
|
72
|
+
};
|
|
181
73
|
|
|
182
|
-
|
|
183
|
-
let trimmed = line.trim();
|
|
184
|
-
let is_table_separator = trimmed.starts_with('|')
|
|
185
|
-
&& trimmed.ends_with('|')
|
|
186
|
-
&& trimmed
|
|
187
|
-
.chars()
|
|
188
|
-
.all(|c| c == '|' || c == '-' || c.is_whitespace() || c == ':');
|
|
74
|
+
let cleaned = NEWLINE_CLEANUP.replace_all(&collapsed, "\n");
|
|
189
75
|
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
76
|
+
let mut first_line = true;
|
|
77
|
+
for line in cleaned.split('\n') {
|
|
78
|
+
let line = line.trim();
|
|
79
|
+
if !line.is_empty() {
|
|
80
|
+
if !first_line {
|
|
81
|
+
result.push('\n');
|
|
82
|
+
}
|
|
83
|
+
result.push_str(line);
|
|
84
|
+
first_line = false;
|
|
193
85
|
}
|
|
194
86
|
}
|
|
195
87
|
}
|
|
196
88
|
|
|
197
|
-
|
|
89
|
+
result
|
|
198
90
|
}
|
|
199
91
|
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
return 0.0;
|
|
204
|
-
}
|
|
205
|
-
|
|
206
|
-
if !text.contains("function") && !text.contains("<script") && !text.contains("<style") {
|
|
207
|
-
return 0.0;
|
|
208
|
-
}
|
|
209
|
-
|
|
210
|
-
let script_chars = sum_match_lengths(text, &JS_FUNCTION_PATTERN)
|
|
211
|
-
+ sum_match_lengths(text, &CSS_RULES_PATTERN)
|
|
212
|
-
+ sum_match_lengths(text, &SCRIPT_TAG_PATTERN)
|
|
213
|
-
+ sum_match_lengths(text, &STYLE_TAG_PATTERN);
|
|
92
|
+
// ============================================================================
|
|
93
|
+
// Internal Cleaning Functions
|
|
94
|
+
// ============================================================================
|
|
214
95
|
|
|
215
|
-
|
|
96
|
+
#[inline]
|
|
97
|
+
fn clean_scripts<'a>(text: Cow<'a, str>) -> Cow<'a, str> {
|
|
98
|
+
let script_replacements = [
|
|
99
|
+
(&*SCRIPT_TAG_PATTERN, " "),
|
|
100
|
+
(&*STYLE_TAG_PATTERN, " "),
|
|
101
|
+
(&*JS_FUNCTION_PATTERN, " "),
|
|
102
|
+
(&*CSS_RULES_PATTERN, " "),
|
|
103
|
+
];
|
|
104
|
+
chain_replacements(text, &script_replacements)
|
|
216
105
|
}
|
|
217
106
|
|
|
218
107
|
#[inline]
|
|
219
|
-
fn
|
|
220
|
-
if
|
|
221
|
-
|
|
222
|
-
}
|
|
108
|
+
fn clean_ocr_artifacts_cow<'a>(text: Cow<'a, str>) -> Cow<'a, str> {
|
|
109
|
+
let result = if let Some(fixed) = collapse_scattered_ascii(&text) {
|
|
110
|
+
Cow::Owned(fixed)
|
|
111
|
+
} else if SCATTERED_CHARS_PATTERN.is_match(&text) {
|
|
112
|
+
Cow::Owned(
|
|
113
|
+
replace_with_if_matches(&text, &SCATTERED_CHARS_PATTERN, |caps: &regex::Captures| {
|
|
114
|
+
caps[0].chars().filter(|c| !c.is_whitespace()).collect::<String>()
|
|
115
|
+
})
|
|
116
|
+
.into_owned(),
|
|
117
|
+
)
|
|
118
|
+
} else {
|
|
119
|
+
text
|
|
120
|
+
};
|
|
223
121
|
|
|
224
|
-
let
|
|
225
|
-
+ sum_match_lengths(text, &BREADCRUMB_PATTERN)
|
|
226
|
-
+ sum_match_lengths(text, &PAGINATION_PATTERN);
|
|
122
|
+
let result = clean_dashes_preserve_tables(result);
|
|
227
123
|
|
|
228
|
-
|
|
124
|
+
let ocr_replacements = [
|
|
125
|
+
(&*REPEATED_PUNCT_PATTERN, "..."),
|
|
126
|
+
(&*ISOLATED_PUNCT_PATTERN, " "),
|
|
127
|
+
(&*MALFORMED_WORDS_PATTERN, " "),
|
|
128
|
+
(&*EXCESSIVE_WHITESPACE_PATTERN, " "),
|
|
129
|
+
];
|
|
130
|
+
|
|
131
|
+
chain_replacements(result, &ocr_replacements)
|
|
229
132
|
}
|
|
230
133
|
|
|
231
134
|
#[inline]
|
|
232
|
-
fn
|
|
233
|
-
if
|
|
234
|
-
return
|
|
235
|
-
}
|
|
236
|
-
|
|
237
|
-
let sentence_count = SENTENCE_DETECT.find_iter(text).count() as f64;
|
|
238
|
-
let paragraph_count = text.matches("\n\n").count() as f64 + 1.0;
|
|
239
|
-
let words = text.split_whitespace().count() as f64;
|
|
240
|
-
|
|
241
|
-
if words == 0.0 {
|
|
242
|
-
return 0.0;
|
|
135
|
+
fn clean_dashes_preserve_tables<'a>(text: Cow<'a, str>) -> Cow<'a, str> {
|
|
136
|
+
if !DASH_PATTERN.is_match(&text) {
|
|
137
|
+
return text;
|
|
243
138
|
}
|
|
244
139
|
|
|
245
|
-
let
|
|
246
|
-
let
|
|
247
|
-
|
|
248
|
-
let mut structure_score: f64 = 0.0;
|
|
249
|
-
|
|
250
|
-
if (MIN_SENTENCE_WORDS..=MAX_SENTENCE_WORDS).contains(&avg_words_per_sentence) {
|
|
251
|
-
structure_score += 0.3;
|
|
252
|
-
}
|
|
140
|
+
let mut result = String::with_capacity(text.len());
|
|
141
|
+
let lines: Vec<&str> = text.lines().collect();
|
|
253
142
|
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
143
|
+
for (i, line) in lines.iter().enumerate() {
|
|
144
|
+
if i > 0 {
|
|
145
|
+
result.push('\n');
|
|
146
|
+
}
|
|
257
147
|
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
148
|
+
let trimmed = line.trim();
|
|
149
|
+
let is_table_separator = trimmed.starts_with('|')
|
|
150
|
+
&& trimmed.ends_with('|')
|
|
151
|
+
&& trimmed
|
|
152
|
+
.chars()
|
|
153
|
+
.all(|c| c == '|' || c == '-' || c.is_whitespace() || c == ':');
|
|
261
154
|
|
|
262
|
-
|
|
263
|
-
|
|
155
|
+
if is_table_separator {
|
|
156
|
+
result.push_str(line);
|
|
157
|
+
} else {
|
|
158
|
+
let cleaned_line = DASH_PATTERN.replace_all(line, "...");
|
|
159
|
+
result.push_str(&cleaned_line);
|
|
160
|
+
}
|
|
264
161
|
}
|
|
265
162
|
|
|
266
|
-
|
|
163
|
+
Cow::Owned(result)
|
|
267
164
|
}
|
|
268
165
|
|
|
269
166
|
#[inline]
|
|
270
|
-
fn
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
.count();
|
|
167
|
+
fn clean_navigation_elements_cow<'a>(text: Cow<'a, str>) -> Cow<'a, str> {
|
|
168
|
+
let nav_replacements = [
|
|
169
|
+
(&*NAV_WORDS_PATTERN, " "),
|
|
170
|
+
(&*BREADCRUMB_PATTERN, " "),
|
|
171
|
+
(&*PAGINATION_PATTERN, " "),
|
|
172
|
+
];
|
|
277
173
|
|
|
278
|
-
|
|
174
|
+
chain_replacements(text, &nav_replacements)
|
|
279
175
|
}
|
|
280
176
|
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
pub fn clean_extracted_text(text: &str) -> String {
|
|
286
|
-
if text.is_empty() {
|
|
287
|
-
return String::new();
|
|
177
|
+
#[inline]
|
|
178
|
+
fn clean_repeated_punctuation_cow<'a>(text: Cow<'a, str>) -> Cow<'a, str> {
|
|
179
|
+
if let Some(cleaned) = clean_repeated_punctuation_ascii(text.as_ref()) {
|
|
180
|
+
return Cow::Owned(cleaned);
|
|
288
181
|
}
|
|
289
182
|
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
working_text.trim().to_string()
|
|
303
|
-
}
|
|
304
|
-
|
|
305
|
-
#[inline]
|
|
306
|
-
fn clean_scripts<'a>(text: Cow<'a, str>) -> Cow<'a, str> {
|
|
307
|
-
let script_replacements = [
|
|
308
|
-
(&*SCRIPT_TAG_PATTERN, " "),
|
|
309
|
-
(&*STYLE_TAG_PATTERN, " "),
|
|
310
|
-
(&*JS_FUNCTION_PATTERN, " "),
|
|
311
|
-
(&*CSS_RULES_PATTERN, " "),
|
|
312
|
-
];
|
|
313
|
-
chain_replacements(text, &script_replacements)
|
|
183
|
+
if REPEATED_PUNCT_PATTERN.is_match(&text) {
|
|
184
|
+
Cow::Owned(
|
|
185
|
+
REPEATED_PUNCT_PATTERN
|
|
186
|
+
.replace_all(&text, |caps: &regex::Captures<'_>| {
|
|
187
|
+
let ch = caps.get(0).and_then(|m| m.as_str().chars().next()).unwrap_or('.');
|
|
188
|
+
ch.to_string()
|
|
189
|
+
})
|
|
190
|
+
.into_owned(),
|
|
191
|
+
)
|
|
192
|
+
} else {
|
|
193
|
+
text
|
|
194
|
+
}
|
|
314
195
|
}
|
|
315
196
|
|
|
316
197
|
#[inline]
|
|
@@ -332,25 +213,9 @@ fn normalize_whitespace_cow<'a>(text: Cow<'a, str>) -> Cow<'a, str> {
|
|
|
332
213
|
result
|
|
333
214
|
}
|
|
334
215
|
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
return Cow::Owned(cleaned);
|
|
339
|
-
}
|
|
340
|
-
|
|
341
|
-
if REPEATED_PUNCT_PATTERN.is_match(&text) {
|
|
342
|
-
Cow::Owned(
|
|
343
|
-
REPEATED_PUNCT_PATTERN
|
|
344
|
-
.replace_all(&text, |caps: &regex::Captures<'_>| {
|
|
345
|
-
let ch = caps.get(0).and_then(|m| m.as_str().chars().next()).unwrap_or('.');
|
|
346
|
-
ch.to_string()
|
|
347
|
-
})
|
|
348
|
-
.into_owned(),
|
|
349
|
-
)
|
|
350
|
-
} else {
|
|
351
|
-
text
|
|
352
|
-
}
|
|
353
|
-
}
|
|
216
|
+
// ============================================================================
|
|
217
|
+
// ASCII Fast-Path Optimizations
|
|
218
|
+
// ============================================================================
|
|
354
219
|
|
|
355
220
|
fn clean_repeated_punctuation_ascii(text: &str) -> Option<String> {
|
|
356
221
|
if !text.is_ascii() {
|
|
@@ -407,6 +272,7 @@ fn find_next_ascii_punctuation(bytes: &[u8]) -> Option<usize> {
|
|
|
407
272
|
}
|
|
408
273
|
}
|
|
409
274
|
|
|
275
|
+
/// Normalize whitespace for ASCII text (fast path)
|
|
410
276
|
#[inline]
|
|
411
277
|
pub(crate) fn normalize_whitespace_ascii(text: &str) -> Option<String> {
|
|
412
278
|
if !text.is_ascii() {
|
|
@@ -471,76 +337,7 @@ pub(crate) fn normalize_whitespace_ascii(text: &str) -> Option<String> {
|
|
|
471
337
|
if changed { Some(normalized) } else { None }
|
|
472
338
|
}
|
|
473
339
|
|
|
474
|
-
|
|
475
|
-
fn clean_ocr_artifacts_cow<'a>(text: Cow<'a, str>) -> Cow<'a, str> {
|
|
476
|
-
let result = if let Some(fixed) = collapse_scattered_ascii(&text) {
|
|
477
|
-
Cow::Owned(fixed)
|
|
478
|
-
} else if SCATTERED_CHARS_PATTERN.is_match(&text) {
|
|
479
|
-
Cow::Owned(
|
|
480
|
-
replace_with_if_matches(&text, &SCATTERED_CHARS_PATTERN, |caps: &regex::Captures| {
|
|
481
|
-
caps[0].chars().filter(|c| !c.is_whitespace()).collect::<String>()
|
|
482
|
-
})
|
|
483
|
-
.into_owned(),
|
|
484
|
-
)
|
|
485
|
-
} else {
|
|
486
|
-
text
|
|
487
|
-
};
|
|
488
|
-
|
|
489
|
-
let result = clean_dashes_preserve_tables(result);
|
|
490
|
-
|
|
491
|
-
let ocr_replacements = [
|
|
492
|
-
(&*REPEATED_PUNCT_PATTERN, "..."),
|
|
493
|
-
(&*ISOLATED_PUNCT_PATTERN, " "),
|
|
494
|
-
(&*MALFORMED_WORDS_PATTERN, " "),
|
|
495
|
-
(&*EXCESSIVE_WHITESPACE_PATTERN, " "),
|
|
496
|
-
];
|
|
497
|
-
|
|
498
|
-
chain_replacements(result, &ocr_replacements)
|
|
499
|
-
}
|
|
500
|
-
|
|
501
|
-
#[inline]
|
|
502
|
-
fn clean_dashes_preserve_tables<'a>(text: Cow<'a, str>) -> Cow<'a, str> {
|
|
503
|
-
if !DASH_PATTERN.is_match(&text) {
|
|
504
|
-
return text;
|
|
505
|
-
}
|
|
506
|
-
|
|
507
|
-
let mut result = String::with_capacity(text.len());
|
|
508
|
-
let lines: Vec<&str> = text.lines().collect();
|
|
509
|
-
|
|
510
|
-
for (i, line) in lines.iter().enumerate() {
|
|
511
|
-
if i > 0 {
|
|
512
|
-
result.push('\n');
|
|
513
|
-
}
|
|
514
|
-
|
|
515
|
-
let trimmed = line.trim();
|
|
516
|
-
let is_table_separator = trimmed.starts_with('|')
|
|
517
|
-
&& trimmed.ends_with('|')
|
|
518
|
-
&& trimmed
|
|
519
|
-
.chars()
|
|
520
|
-
.all(|c| c == '|' || c == '-' || c.is_whitespace() || c == ':');
|
|
521
|
-
|
|
522
|
-
if is_table_separator {
|
|
523
|
-
result.push_str(line);
|
|
524
|
-
} else {
|
|
525
|
-
let cleaned_line = DASH_PATTERN.replace_all(line, "...");
|
|
526
|
-
result.push_str(&cleaned_line);
|
|
527
|
-
}
|
|
528
|
-
}
|
|
529
|
-
|
|
530
|
-
Cow::Owned(result)
|
|
531
|
-
}
|
|
532
|
-
|
|
533
|
-
#[inline]
|
|
534
|
-
fn clean_navigation_elements_cow<'a>(text: Cow<'a, str>) -> Cow<'a, str> {
|
|
535
|
-
let nav_replacements = [
|
|
536
|
-
(&*NAV_WORDS_PATTERN, " "),
|
|
537
|
-
(&*BREADCRUMB_PATTERN, " "),
|
|
538
|
-
(&*PAGINATION_PATTERN, " "),
|
|
539
|
-
];
|
|
540
|
-
|
|
541
|
-
chain_replacements(text, &nav_replacements)
|
|
542
|
-
}
|
|
543
|
-
|
|
340
|
+
/// Collapse scattered ASCII characters (fast path)
|
|
544
341
|
#[inline]
|
|
545
342
|
pub(crate) fn collapse_scattered_ascii(text: &str) -> Option<String> {
|
|
546
343
|
if !text.is_ascii() {
|
|
@@ -591,53 +388,39 @@ pub(crate) fn collapse_scattered_ascii(text: &str) -> Option<String> {
|
|
|
591
388
|
}
|
|
592
389
|
}
|
|
593
390
|
|
|
594
|
-
|
|
595
|
-
|
|
596
|
-
|
|
597
|
-
return String::new();
|
|
598
|
-
}
|
|
599
|
-
|
|
600
|
-
let mut result = String::with_capacity(text.len());
|
|
601
|
-
|
|
602
|
-
let mut first = true;
|
|
603
|
-
for paragraph in text.split("\n\n") {
|
|
604
|
-
let trimmed = paragraph.trim();
|
|
605
|
-
if trimmed.is_empty() {
|
|
606
|
-
continue;
|
|
607
|
-
}
|
|
608
|
-
|
|
609
|
-
if !first {
|
|
610
|
-
result.push_str("\n\n");
|
|
611
|
-
}
|
|
612
|
-
first = false;
|
|
613
|
-
|
|
614
|
-
let collapsed = if let Some(fast) = normalize_whitespace_ascii(paragraph) {
|
|
615
|
-
Cow::Owned(fast)
|
|
616
|
-
} else {
|
|
617
|
-
Cow::Owned(WHITESPACE_NORMALIZE.replace_all(paragraph, " ").into_owned())
|
|
618
|
-
};
|
|
619
|
-
|
|
620
|
-
let cleaned = NEWLINE_CLEANUP.replace_all(&collapsed, "\n");
|
|
391
|
+
// ============================================================================
|
|
392
|
+
// Utility Functions
|
|
393
|
+
// ============================================================================
|
|
621
394
|
|
|
622
|
-
|
|
623
|
-
|
|
624
|
-
|
|
625
|
-
|
|
626
|
-
if !first_line {
|
|
627
|
-
result.push('\n');
|
|
628
|
-
}
|
|
629
|
-
result.push_str(line);
|
|
630
|
-
first_line = false;
|
|
631
|
-
}
|
|
395
|
+
fn chain_replacements<'a>(mut text: Cow<'a, str>, replacements: &[(&Regex, &str)]) -> Cow<'a, str> {
|
|
396
|
+
for (pattern, replacement) in replacements {
|
|
397
|
+
if pattern.is_match(&text) {
|
|
398
|
+
text = Cow::Owned(pattern.replace_all(&text, *replacement).into_owned());
|
|
632
399
|
}
|
|
633
400
|
}
|
|
401
|
+
text
|
|
402
|
+
}
|
|
634
403
|
|
|
635
|
-
|
|
404
|
+
#[inline]
|
|
405
|
+
fn replace_with_if_matches<'a, F>(text: &'a str, pattern: &Regex, replacer: F) -> Cow<'a, str>
|
|
406
|
+
where
|
|
407
|
+
F: FnMut(&regex::Captures) -> String,
|
|
408
|
+
{
|
|
409
|
+
if pattern.is_match(text) {
|
|
410
|
+
Cow::Owned(pattern.replace_all(text, replacer).into_owned())
|
|
411
|
+
} else {
|
|
412
|
+
Cow::Borrowed(text)
|
|
413
|
+
}
|
|
636
414
|
}
|
|
637
415
|
|
|
416
|
+
// ============================================================================
|
|
417
|
+
// Tests
|
|
418
|
+
// ============================================================================
|
|
419
|
+
|
|
638
420
|
#[cfg(all(test, feature = "quality"))]
|
|
639
421
|
mod tests {
|
|
640
422
|
use super::*;
|
|
423
|
+
use ahash::AHashMap;
|
|
641
424
|
|
|
642
425
|
#[test]
|
|
643
426
|
fn test_calculate_quality_score_empty_text() {
|
|
@@ -702,82 +485,6 @@ mod tests {
|
|
|
702
485
|
assert!(score <= 1.0);
|
|
703
486
|
}
|
|
704
487
|
|
|
705
|
-
#[test]
|
|
706
|
-
fn test_calculate_ocr_penalty_clean_text() {
|
|
707
|
-
let text = "This is clean text without artifacts";
|
|
708
|
-
let penalty = calculate_ocr_penalty(text, text.len() as f64);
|
|
709
|
-
assert_eq!(penalty, 0.0);
|
|
710
|
-
}
|
|
711
|
-
|
|
712
|
-
#[test]
|
|
713
|
-
fn test_calculate_ocr_penalty_with_artifacts() {
|
|
714
|
-
let text = "Text with excessive spaces and ....... dots";
|
|
715
|
-
let penalty = calculate_ocr_penalty(text, text.len() as f64);
|
|
716
|
-
assert!(penalty > 0.0);
|
|
717
|
-
assert!(penalty <= 1.0);
|
|
718
|
-
}
|
|
719
|
-
|
|
720
|
-
#[test]
|
|
721
|
-
fn test_calculate_script_penalty_clean_text() {
|
|
722
|
-
let text = "This is clean text without scripts";
|
|
723
|
-
let penalty = calculate_script_penalty(text, text.len() as f64);
|
|
724
|
-
assert_eq!(penalty, 0.0);
|
|
725
|
-
}
|
|
726
|
-
|
|
727
|
-
#[test]
|
|
728
|
-
fn test_calculate_script_penalty_with_js() {
|
|
729
|
-
let text = "function test() { return 42; }";
|
|
730
|
-
let penalty = calculate_script_penalty(text, text.len() as f64);
|
|
731
|
-
assert!(penalty > 0.0);
|
|
732
|
-
}
|
|
733
|
-
|
|
734
|
-
#[test]
|
|
735
|
-
fn test_calculate_navigation_penalty_clean_text() {
|
|
736
|
-
let text = "This is clean text without navigation";
|
|
737
|
-
let penalty = calculate_navigation_penalty(text, text.len() as f64);
|
|
738
|
-
assert_eq!(penalty, 0.0);
|
|
739
|
-
}
|
|
740
|
-
|
|
741
|
-
#[test]
|
|
742
|
-
fn test_calculate_navigation_penalty_with_nav() {
|
|
743
|
-
let text = "Skip to main content and Back to top links everywhere";
|
|
744
|
-
let penalty = calculate_navigation_penalty(text, text.len() as f64);
|
|
745
|
-
assert!(penalty > 0.0);
|
|
746
|
-
}
|
|
747
|
-
|
|
748
|
-
#[test]
|
|
749
|
-
fn test_calculate_structure_bonus_empty() {
|
|
750
|
-
assert_eq!(calculate_structure_bonus(""), 0.0);
|
|
751
|
-
}
|
|
752
|
-
|
|
753
|
-
#[test]
|
|
754
|
-
fn test_calculate_structure_bonus_well_structured() {
|
|
755
|
-
let text = "This is a sentence. This is another sentence.\n\nNew paragraph here. More content.";
|
|
756
|
-
let bonus = calculate_structure_bonus(text);
|
|
757
|
-
assert!(bonus > 0.0);
|
|
758
|
-
assert!(bonus <= 1.0);
|
|
759
|
-
}
|
|
760
|
-
|
|
761
|
-
#[test]
|
|
762
|
-
fn test_calculate_metadata_bonus_empty() {
|
|
763
|
-
let metadata = AHashMap::new();
|
|
764
|
-
let bonus = calculate_metadata_bonus(&metadata);
|
|
765
|
-
assert_eq!(bonus, 0.0);
|
|
766
|
-
}
|
|
767
|
-
|
|
768
|
-
#[test]
|
|
769
|
-
fn test_calculate_metadata_bonus_full() {
|
|
770
|
-
let mut metadata = AHashMap::new();
|
|
771
|
-
metadata.insert("title".to_string(), "Title".to_string());
|
|
772
|
-
metadata.insert("author".to_string(), "Author".to_string());
|
|
773
|
-
metadata.insert("subject".to_string(), "Subject".to_string());
|
|
774
|
-
metadata.insert("description".to_string(), "Description".to_string());
|
|
775
|
-
metadata.insert("keywords".to_string(), "Keywords".to_string());
|
|
776
|
-
|
|
777
|
-
let bonus = calculate_metadata_bonus(&metadata);
|
|
778
|
-
assert_eq!(bonus, 1.0);
|
|
779
|
-
}
|
|
780
|
-
|
|
781
488
|
#[test]
|
|
782
489
|
fn test_clean_extracted_text_removes_styles() {
|
|
783
490
|
let text = "Before <style>.class { color: red; }</style> After";
|
|
@@ -828,20 +535,6 @@ mod tests {
|
|
|
828
535
|
assert_eq!(normalized, "Para 1\n\nPara 2");
|
|
829
536
|
}
|
|
830
537
|
|
|
831
|
-
#[test]
|
|
832
|
-
fn test_count_non_table_dash_artifacts() {
|
|
833
|
-
let text = "Some text --- with dashes";
|
|
834
|
-
let count = count_non_table_dash_artifacts(text);
|
|
835
|
-
assert!(count > 0);
|
|
836
|
-
}
|
|
837
|
-
|
|
838
|
-
#[test]
|
|
839
|
-
fn test_count_non_table_dash_artifacts_preserves_tables() {
|
|
840
|
-
let text = "| Header |\n|--------|\n| Data |";
|
|
841
|
-
let count = count_non_table_dash_artifacts(text);
|
|
842
|
-
assert_eq!(count, 0);
|
|
843
|
-
}
|
|
844
|
-
|
|
845
538
|
#[test]
|
|
846
539
|
fn test_clean_dashes_preserve_tables_simple() {
|
|
847
540
|
let text = Cow::Borrowed("| Col1 |\n|------|\n| Data |");
|
|
@@ -857,13 +550,6 @@ mod tests {
|
|
|
857
550
|
assert!(!result.contains("---"));
|
|
858
551
|
}
|
|
859
552
|
|
|
860
|
-
#[test]
|
|
861
|
-
fn test_sum_match_lengths() {
|
|
862
|
-
let text = "test ... test ... test";
|
|
863
|
-
let count = sum_match_lengths(text, &REPEATED_PUNCT_PATTERN);
|
|
864
|
-
assert!(count > 0);
|
|
865
|
-
}
|
|
866
|
-
|
|
867
553
|
#[test]
|
|
868
554
|
fn test_quality_score_large_text_with_ocr_issues() {
|
|
869
555
|
let text = "a".repeat(2000) + " " + &"b".repeat(2000);
|
|
@@ -935,7 +621,7 @@ mod tests {
|
|
|
935
621
|
fn test_normalize_whitespace_cow_no_changes() {
|
|
936
622
|
let text = Cow::Borrowed("normaltext");
|
|
937
623
|
let result = normalize_whitespace_cow(text);
|
|
938
|
-
assert_eq!(result
|
|
624
|
+
assert_eq!(&*result, "normaltext");
|
|
939
625
|
}
|
|
940
626
|
|
|
941
627
|
#[test]
|
|
@@ -958,11 +644,4 @@ mod tests {
|
|
|
958
644
|
let result = clean_scripts(text);
|
|
959
645
|
assert!(!result.contains("<script"));
|
|
960
646
|
}
|
|
961
|
-
|
|
962
|
-
#[test]
|
|
963
|
-
fn test_quality_constants() {
|
|
964
|
-
assert_eq!(MIN_TEXT_LENGTH, 10);
|
|
965
|
-
assert_eq!(LARGE_TEXT_LENGTH, 1000);
|
|
966
|
-
assert_eq!(OCR_PENALTY_WEIGHT, 0.3);
|
|
967
|
-
}
|
|
968
647
|
}
|