kreuzberg 4.0.8 → 4.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
- data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
- data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
- data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
- data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
- data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
- data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
- data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
- data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
- data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
- data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
- data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
- data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
- data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
- data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
- data/ext/kreuzberg_rb/native/src/result.rs +326 -0
- data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
- data/lib/kreuzberg/config.rb +66 -0
- data/lib/kreuzberg/result.rb +107 -2
- data/lib/kreuzberg/types.rb +104 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +0 -4
- data/sig/kreuzberg.rbs +105 -1
- data/vendor/Cargo.toml +3 -3
- data/vendor/kreuzberg/Cargo.toml +4 -3
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/api/config.rs +69 -0
- data/vendor/kreuzberg/src/api/handlers.rs +99 -2
- data/vendor/kreuzberg/src/api/mod.rs +14 -7
- data/vendor/kreuzberg/src/api/router.rs +214 -0
- data/vendor/kreuzberg/src/api/startup.rs +243 -0
- data/vendor/kreuzberg/src/api/types.rs +78 -0
- data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
- data/vendor/kreuzberg/src/cache/core.rs +428 -0
- data/vendor/kreuzberg/src/cache/mod.rs +21 -843
- data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
- data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
- data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
- data/vendor/kreuzberg/src/chunking/config.rs +52 -0
- data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
- data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
- data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
- data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
- data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
- data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
- data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
- data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
- data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
- data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
- data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
- data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
- data/vendor/kreuzberg/src/core/config/page.rs +57 -0
- data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
- data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
- data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
- data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
- data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
- data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
- data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
- data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
- data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
- data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
- data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
- data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
- data/vendor/kreuzberg/src/core/mod.rs +4 -1
- data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
- data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
- data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
- data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
- data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
- data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
- data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
- data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
- data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
- data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
- data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
- data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
- data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
- data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
- data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
- data/vendor/kreuzberg/src/embeddings.rs +136 -13
- data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
- data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
- data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
- data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
- data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
- data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
- data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
- data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
- data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
- data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
- data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
- data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
- data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
- data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
- data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
- data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
- data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
- data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
- data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
- data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
- data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
- data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
- data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
- data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
- data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
- data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
- data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
- data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
- data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
- data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
- data/vendor/kreuzberg/src/extractors/email.rs +2 -0
- data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
- data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
- data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
- data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
- data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
- data/vendor/kreuzberg/src/extractors/html.rs +80 -8
- data/vendor/kreuzberg/src/extractors/image.rs +8 -1
- data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
- data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
- data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
- data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
- data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
- data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
- data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
- data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
- data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
- data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
- data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
- data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
- data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
- data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
- data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
- data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
- data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
- data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
- data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
- data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
- data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
- data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
- data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
- data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
- data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
- data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
- data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
- data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
- data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
- data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
- data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
- data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
- data/vendor/kreuzberg/src/extractors/text.rs +4 -0
- data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
- data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
- data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
- data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
- data/vendor/kreuzberg/src/lib.rs +2 -2
- data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
- data/vendor/kreuzberg/src/mcp/format.rs +211 -0
- data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
- data/vendor/kreuzberg/src/mcp/params.rs +196 -0
- data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
- data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
- data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
- data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
- data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
- data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
- data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
- data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
- data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
- data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
- data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
- data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
- data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
- data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
- data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
- data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
- data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
- data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
- data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
- data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
- data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
- data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
- data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
- data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
- data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
- data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
- data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
- data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
- data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
- data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
- data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
- data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
- data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
- data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
- data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
- data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
- data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
- data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
- data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
- data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
- data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
- data/vendor/kreuzberg/src/text/quality.rs +1 -1
- data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
- data/vendor/kreuzberg/src/types/djot.rs +209 -0
- data/vendor/kreuzberg/src/types/extraction.rs +301 -0
- data/vendor/kreuzberg/src/types/formats.rs +443 -0
- data/vendor/kreuzberg/src/types/metadata.rs +560 -0
- data/vendor/kreuzberg/src/types/mod.rs +281 -0
- data/vendor/kreuzberg/src/types/page.rs +182 -0
- data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
- data/vendor/kreuzberg/src/types/tables.rs +39 -0
- data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
- data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
- data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
- data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
- data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
- data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
- data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
- data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
- data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
- data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
- data/vendor/kreuzberg/tests/api_embed.rs +6 -9
- data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
- data/vendor/kreuzberg/tests/core_integration.rs +1 -0
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
- data/vendor/kreuzberg/tests/format_integration.rs +2 -0
- data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
- data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
- data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
- data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
- data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
- data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
- data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
- data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
- data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
- data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
- data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
- data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
- data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
- data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
- data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
- data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
- data/vendor/kreuzberg-ffi/src/error.rs +46 -14
- data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
- data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
- data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
- data/vendor/kreuzberg-ffi/src/result.rs +148 -122
- data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
- data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
- metadata +200 -28
- data/vendor/kreuzberg/src/api/server.rs +0 -518
- data/vendor/kreuzberg/src/core/config.rs +0 -1914
- data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
- data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
- data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
- data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
- data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
- data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
- data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
- data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
- data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
- data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
- data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
- data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
- data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
- data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
- data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
- data/vendor/kreuzberg/src/types.rs +0 -1713
- data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
- data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
//! Germanic language stopwords.
|
|
2
|
+
//!
|
|
3
|
+
//! Includes: English (en), German (de), Dutch (nl), Swedish (sv),
|
|
4
|
+
//! Norwegian (no), Danish (da), Afrikaans (af).
|
|
5
|
+
|
|
6
|
+
use ahash::{AHashMap, AHashSet};
|
|
7
|
+
|
|
8
|
+
/// Macro to generate embedded stopwords for Germanic languages.
|
|
9
|
+
macro_rules! embed_stopwords {
|
|
10
|
+
($map:expr, $($lang:literal),* $(,)?) => {
|
|
11
|
+
$(
|
|
12
|
+
{
|
|
13
|
+
const JSON: &str = include_str!(concat!("../../../stopwords/", $lang, "_stopwords.json"));
|
|
14
|
+
match serde_json::from_str::<Vec<String>>(JSON) {
|
|
15
|
+
Ok(words) => {
|
|
16
|
+
let set: AHashSet<String> = words.into_iter().collect();
|
|
17
|
+
$map.insert($lang.to_string(), set);
|
|
18
|
+
}
|
|
19
|
+
Err(e) => {
|
|
20
|
+
panic!(
|
|
21
|
+
"Failed to parse embedded stopwords for language '{}': {}. \
|
|
22
|
+
This indicates corrupted or malformed JSON in the embedded stopwords data. \
|
|
23
|
+
Please report this issue at https://github.com/kreuzberg-dev/kreuzberg/issues",
|
|
24
|
+
$lang, e
|
|
25
|
+
);
|
|
26
|
+
}
|
|
27
|
+
}
|
|
28
|
+
}
|
|
29
|
+
)*
|
|
30
|
+
};
|
|
31
|
+
}
|
|
32
|
+
|
|
33
|
+
/// Load Germanic language stopwords into the provided map.
|
|
34
|
+
pub(in crate::stopwords) fn load_stopwords(map: &mut AHashMap<String, AHashSet<String>>) {
|
|
35
|
+
embed_stopwords!(map, "en", "de", "nl", "sv", "no", "da", "af");
|
|
36
|
+
}
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
//! Language family modules for stopword loading.
|
|
2
|
+
//!
|
|
3
|
+
//! Stopwords are organized by language family for easier maintenance
|
|
4
|
+
//! and navigation. Each module handles loading stopwords for related languages.
|
|
5
|
+
|
|
6
|
+
pub(super) mod asian;
|
|
7
|
+
pub(super) mod germanic;
|
|
8
|
+
pub(super) mod other;
|
|
9
|
+
pub(super) mod romance;
|
|
10
|
+
pub(super) mod slavic;
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
//! Other language stopwords.
|
|
2
|
+
//!
|
|
3
|
+
//! Includes: Arabic (ar), Hebrew (he), Turkish (tr), Persian (fa),
|
|
4
|
+
//! Kurdish (ku), Armenian (hy), Estonian (et), Basque (eu),
|
|
5
|
+
//! Breton (br), Esperanto (eo), Finnish (fi), Irish (ga),
|
|
6
|
+
//! Hungarian (hu), Indonesian (id), Latin (la), Lithuanian (lt),
|
|
7
|
+
//! Latvian (lv), Malay (ms), Tagalog (tl), Greek (el),
|
|
8
|
+
//! Hausa (ha), Swahili (sw), Yoruba (yo), Zulu (zu),
|
|
9
|
+
//! Somali (so), Sesotho (st).
|
|
10
|
+
|
|
11
|
+
use ahash::{AHashMap, AHashSet};
|
|
12
|
+
|
|
13
|
+
/// Macro to generate embedded stopwords for other languages.
|
|
14
|
+
macro_rules! embed_stopwords {
|
|
15
|
+
($map:expr, $($lang:literal),* $(,)?) => {
|
|
16
|
+
$(
|
|
17
|
+
{
|
|
18
|
+
const JSON: &str = include_str!(concat!("../../../stopwords/", $lang, "_stopwords.json"));
|
|
19
|
+
match serde_json::from_str::<Vec<String>>(JSON) {
|
|
20
|
+
Ok(words) => {
|
|
21
|
+
let set: AHashSet<String> = words.into_iter().collect();
|
|
22
|
+
$map.insert($lang.to_string(), set);
|
|
23
|
+
}
|
|
24
|
+
Err(e) => {
|
|
25
|
+
panic!(
|
|
26
|
+
"Failed to parse embedded stopwords for language '{}': {}. \
|
|
27
|
+
This indicates corrupted or malformed JSON in the embedded stopwords data. \
|
|
28
|
+
Please report this issue at https://github.com/kreuzberg-dev/kreuzberg/issues",
|
|
29
|
+
$lang, e
|
|
30
|
+
);
|
|
31
|
+
}
|
|
32
|
+
}
|
|
33
|
+
}
|
|
34
|
+
)*
|
|
35
|
+
};
|
|
36
|
+
}
|
|
37
|
+
|
|
38
|
+
/// Load other language stopwords into the provided map.
|
|
39
|
+
pub(in crate::stopwords) fn load_stopwords(map: &mut AHashMap<String, AHashSet<String>>) {
|
|
40
|
+
embed_stopwords!(
|
|
41
|
+
map, "ar", "he", "tr", "fa", "ku", "hy", "et", "eu", "br", "eo", "fi", "ga", "hu", "id", "la", "lt", "lv",
|
|
42
|
+
"ms", "tl", "el", "ha", "sw", "yo", "zu", "so", "st"
|
|
43
|
+
);
|
|
44
|
+
}
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
//! Romance language stopwords.
|
|
2
|
+
//!
|
|
3
|
+
//! Includes: French (fr), Spanish (es), Italian (it), Portuguese (pt),
|
|
4
|
+
//! Romanian (ro), Catalan (ca), Galician (gl).
|
|
5
|
+
|
|
6
|
+
use ahash::{AHashMap, AHashSet};
|
|
7
|
+
|
|
8
|
+
/// Macro to generate embedded stopwords for Romance languages.
|
|
9
|
+
macro_rules! embed_stopwords {
|
|
10
|
+
($map:expr, $($lang:literal),* $(,)?) => {
|
|
11
|
+
$(
|
|
12
|
+
{
|
|
13
|
+
const JSON: &str = include_str!(concat!("../../../stopwords/", $lang, "_stopwords.json"));
|
|
14
|
+
match serde_json::from_str::<Vec<String>>(JSON) {
|
|
15
|
+
Ok(words) => {
|
|
16
|
+
let set: AHashSet<String> = words.into_iter().collect();
|
|
17
|
+
$map.insert($lang.to_string(), set);
|
|
18
|
+
}
|
|
19
|
+
Err(e) => {
|
|
20
|
+
panic!(
|
|
21
|
+
"Failed to parse embedded stopwords for language '{}': {}. \
|
|
22
|
+
This indicates corrupted or malformed JSON in the embedded stopwords data. \
|
|
23
|
+
Please report this issue at https://github.com/kreuzberg-dev/kreuzberg/issues",
|
|
24
|
+
$lang, e
|
|
25
|
+
);
|
|
26
|
+
}
|
|
27
|
+
}
|
|
28
|
+
}
|
|
29
|
+
)*
|
|
30
|
+
};
|
|
31
|
+
}
|
|
32
|
+
|
|
33
|
+
/// Load Romance language stopwords into the provided map.
|
|
34
|
+
pub(in crate::stopwords) fn load_stopwords(map: &mut AHashMap<String, AHashSet<String>>) {
|
|
35
|
+
embed_stopwords!(map, "fr", "es", "it", "pt", "ro", "ca", "gl");
|
|
36
|
+
}
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
//! Slavic language stopwords.
|
|
2
|
+
//!
|
|
3
|
+
//! Includes: Russian (ru), Polish (pl), Czech (cs), Ukrainian (uk),
|
|
4
|
+
//! Bulgarian (bg), Slovak (sk), Croatian (hr), Slovenian (sl).
|
|
5
|
+
|
|
6
|
+
use ahash::{AHashMap, AHashSet};
|
|
7
|
+
|
|
8
|
+
/// Macro to generate embedded stopwords for Slavic languages.
|
|
9
|
+
macro_rules! embed_stopwords {
|
|
10
|
+
($map:expr, $($lang:literal),* $(,)?) => {
|
|
11
|
+
$(
|
|
12
|
+
{
|
|
13
|
+
const JSON: &str = include_str!(concat!("../../../stopwords/", $lang, "_stopwords.json"));
|
|
14
|
+
match serde_json::from_str::<Vec<String>>(JSON) {
|
|
15
|
+
Ok(words) => {
|
|
16
|
+
let set: AHashSet<String> = words.into_iter().collect();
|
|
17
|
+
$map.insert($lang.to_string(), set);
|
|
18
|
+
}
|
|
19
|
+
Err(e) => {
|
|
20
|
+
panic!(
|
|
21
|
+
"Failed to parse embedded stopwords for language '{}': {}. \
|
|
22
|
+
This indicates corrupted or malformed JSON in the embedded stopwords data. \
|
|
23
|
+
Please report this issue at https://github.com/kreuzberg-dev/kreuzberg/issues",
|
|
24
|
+
$lang, e
|
|
25
|
+
);
|
|
26
|
+
}
|
|
27
|
+
}
|
|
28
|
+
}
|
|
29
|
+
)*
|
|
30
|
+
};
|
|
31
|
+
}
|
|
32
|
+
|
|
33
|
+
/// Load Slavic language stopwords into the provided map.
|
|
34
|
+
pub(in crate::stopwords) fn load_stopwords(map: &mut AHashMap<String, AHashSet<String>>) {
|
|
35
|
+
embed_stopwords!(map, "ru", "pl", "cs", "uk", "bg", "sk", "hr", "sl");
|
|
36
|
+
}
|
|
@@ -82,33 +82,7 @@
|
|
|
82
82
|
use ahash::{AHashMap, AHashSet};
|
|
83
83
|
use once_cell::sync::Lazy;
|
|
84
84
|
|
|
85
|
-
|
|
86
|
-
///
|
|
87
|
-
/// This macro embeds the JSON files at compile time using `include_str!()` and
|
|
88
|
-
/// generates code to parse and insert them into the stopwords map.
|
|
89
|
-
macro_rules! embed_stopwords {
|
|
90
|
-
($map:expr, $($lang:literal),* $(,)?) => {
|
|
91
|
-
$(
|
|
92
|
-
{
|
|
93
|
-
const JSON: &str = include_str!(concat!("../../stopwords/", $lang, "_stopwords.json"));
|
|
94
|
-
match serde_json::from_str::<Vec<String>>(JSON) {
|
|
95
|
-
Ok(words) => {
|
|
96
|
-
let set: AHashSet<String> = words.into_iter().collect();
|
|
97
|
-
$map.insert($lang.to_string(), set);
|
|
98
|
-
}
|
|
99
|
-
Err(e) => {
|
|
100
|
-
panic!(
|
|
101
|
-
"Failed to parse embedded stopwords for language '{}': {}. \
|
|
102
|
-
This indicates corrupted or malformed JSON in the embedded stopwords data. \
|
|
103
|
-
Please report this issue at https://github.com/kreuzberg-dev/kreuzberg/issues",
|
|
104
|
-
$lang, e
|
|
105
|
-
);
|
|
106
|
-
}
|
|
107
|
-
}
|
|
108
|
-
}
|
|
109
|
-
)*
|
|
110
|
-
};
|
|
111
|
-
}
|
|
85
|
+
mod languages;
|
|
112
86
|
|
|
113
87
|
/// Global stopwords registry.
|
|
114
88
|
///
|
|
@@ -146,12 +120,12 @@ macro_rules! embed_stopwords {
|
|
|
146
120
|
pub static STOPWORDS: Lazy<AHashMap<String, AHashSet<String>>> = Lazy::new(|| {
|
|
147
121
|
let mut map = AHashMap::new();
|
|
148
122
|
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
);
|
|
123
|
+
// Load stopwords by language family
|
|
124
|
+
languages::germanic::load_stopwords(&mut map);
|
|
125
|
+
languages::romance::load_stopwords(&mut map);
|
|
126
|
+
languages::slavic::load_stopwords(&mut map);
|
|
127
|
+
languages::asian::load_stopwords(&mut map);
|
|
128
|
+
languages::other::load_stopwords(&mut map);
|
|
155
129
|
|
|
156
130
|
apply_stopword_whitelist(&mut map);
|
|
157
131
|
|
|
@@ -677,7 +677,7 @@ mod tests {
|
|
|
677
677
|
fn test_normalize_whitespace_cow_no_changes() {
|
|
678
678
|
let text = Cow::Borrowed("normaltext");
|
|
679
679
|
let result = normalize_whitespace_cow(text);
|
|
680
|
-
assert_eq!(result
|
|
680
|
+
assert_eq!(&*result, "normaltext");
|
|
681
681
|
}
|
|
682
682
|
|
|
683
683
|
#[test]
|
|
@@ -123,6 +123,8 @@ mod tests {
|
|
|
123
123
|
chunks: None,
|
|
124
124
|
images: None,
|
|
125
125
|
pages: None,
|
|
126
|
+
elements: None,
|
|
127
|
+
djot_content: None,
|
|
126
128
|
};
|
|
127
129
|
|
|
128
130
|
processor.process(&mut result, &config).await.unwrap();
|
|
@@ -148,7 +150,9 @@ mod tests {
|
|
|
148
150
|
detected_languages: None,
|
|
149
151
|
chunks: None,
|
|
150
152
|
images: None,
|
|
153
|
+
djot_content: None,
|
|
151
154
|
pages: None,
|
|
155
|
+
elements: None,
|
|
152
156
|
};
|
|
153
157
|
|
|
154
158
|
processor.process(&mut result, &config).await.unwrap();
|
|
@@ -181,7 +185,9 @@ mod tests {
|
|
|
181
185
|
detected_languages: None,
|
|
182
186
|
chunks: None,
|
|
183
187
|
images: None,
|
|
188
|
+
djot_content: None,
|
|
184
189
|
pages: None,
|
|
190
|
+
elements: None,
|
|
185
191
|
};
|
|
186
192
|
|
|
187
193
|
let config_with_quality = ExtractionConfig {
|
|
@@ -209,7 +215,9 @@ mod tests {
|
|
|
209
215
|
detected_languages: None,
|
|
210
216
|
chunks: None,
|
|
211
217
|
images: None,
|
|
218
|
+
djot_content: None,
|
|
212
219
|
pages: None,
|
|
220
|
+
elements: None,
|
|
213
221
|
};
|
|
214
222
|
|
|
215
223
|
let long_result = ExtractionResult {
|
|
@@ -220,7 +228,9 @@ mod tests {
|
|
|
220
228
|
detected_languages: None,
|
|
221
229
|
chunks: None,
|
|
222
230
|
images: None,
|
|
231
|
+
djot_content: None,
|
|
223
232
|
pages: None,
|
|
233
|
+
elements: None,
|
|
224
234
|
};
|
|
225
235
|
|
|
226
236
|
let short_duration = processor.estimated_duration_ms(&short_result);
|
|
@@ -0,0 +1,238 @@
|
|
|
1
|
+
use ahash::AHashMap;
|
|
2
|
+
|
|
3
|
+
/// Bonus added for sentences at the beginning or end of the document
|
|
4
|
+
const SENTENCE_EDGE_POSITION_BONUS: f32 = 0.3;
|
|
5
|
+
|
|
6
|
+
/// Bonus added for sentences with ideal word count (neither too short nor too long)
|
|
7
|
+
const IDEAL_WORD_COUNT_BONUS: f32 = 0.2;
|
|
8
|
+
|
|
9
|
+
/// Minimum word count for ideal sentence length
|
|
10
|
+
const MIN_IDEAL_WORD_COUNT: usize = 3;
|
|
11
|
+
|
|
12
|
+
/// Maximum word count for ideal sentence length
|
|
13
|
+
const MAX_IDEAL_WORD_COUNT: usize = 25;
|
|
14
|
+
|
|
15
|
+
/// Weight multiplier for numeric content density in sentences
|
|
16
|
+
const NUMERIC_CONTENT_WEIGHT: f32 = 0.3;
|
|
17
|
+
|
|
18
|
+
/// Weight multiplier for capitalized/acronym word density in sentences
|
|
19
|
+
const CAPS_ACRONYM_WEIGHT: f32 = 0.25;
|
|
20
|
+
|
|
21
|
+
/// Weight multiplier for long word density in sentences
|
|
22
|
+
const LONG_WORD_WEIGHT: f32 = 0.2;
|
|
23
|
+
|
|
24
|
+
/// Minimum character length for a word to be considered "long"
|
|
25
|
+
const LONG_WORD_THRESHOLD: usize = 8;
|
|
26
|
+
|
|
27
|
+
/// Weight multiplier for punctuation density in sentences
|
|
28
|
+
const PUNCTUATION_DENSITY_WEIGHT: f32 = 0.15;
|
|
29
|
+
|
|
30
|
+
/// Weight multiplier for word diversity ratio (unique words / total words)
|
|
31
|
+
const DIVERSITY_RATIO_WEIGHT: f32 = 0.15;
|
|
32
|
+
|
|
33
|
+
/// Weight multiplier for character entropy (measure of text randomness/information)
|
|
34
|
+
const CHAR_ENTROPY_WEIGHT: f32 = 0.1;
|
|
35
|
+
|
|
36
|
+
/// Analyzes text characteristics and scores content importance.
|
|
37
|
+
pub struct TextAnalyzer;
|
|
38
|
+
|
|
39
|
+
impl TextAnalyzer {
|
|
40
|
+
/// Scores the importance of a sentence based on various characteristics.
|
|
41
|
+
pub fn score_sentence_importance(sentence: &str, position: usize, total_sentences: usize) -> f32 {
|
|
42
|
+
let mut score = 0.0;
|
|
43
|
+
|
|
44
|
+
if position == 0 || position == total_sentences - 1 {
|
|
45
|
+
score += SENTENCE_EDGE_POSITION_BONUS;
|
|
46
|
+
}
|
|
47
|
+
|
|
48
|
+
let words: Vec<&str> = sentence.split_whitespace().collect();
|
|
49
|
+
if words.is_empty() {
|
|
50
|
+
return score;
|
|
51
|
+
}
|
|
52
|
+
|
|
53
|
+
let word_count = words.len();
|
|
54
|
+
if (MIN_IDEAL_WORD_COUNT..=MAX_IDEAL_WORD_COUNT).contains(&word_count) {
|
|
55
|
+
score += IDEAL_WORD_COUNT_BONUS;
|
|
56
|
+
}
|
|
57
|
+
|
|
58
|
+
let mut numeric_count = 0;
|
|
59
|
+
let mut caps_count = 0;
|
|
60
|
+
let mut long_word_count = 0;
|
|
61
|
+
let mut punct_density = 0;
|
|
62
|
+
|
|
63
|
+
for word in &words {
|
|
64
|
+
if word.chars().any(|c| c.is_numeric()) {
|
|
65
|
+
numeric_count += 1;
|
|
66
|
+
}
|
|
67
|
+
|
|
68
|
+
if word.len() > 1 && word.chars().all(|c| c.is_uppercase()) {
|
|
69
|
+
caps_count += 1;
|
|
70
|
+
}
|
|
71
|
+
|
|
72
|
+
if word.len() > LONG_WORD_THRESHOLD {
|
|
73
|
+
long_word_count += 1;
|
|
74
|
+
}
|
|
75
|
+
|
|
76
|
+
punct_density += word.chars().filter(|c| c.is_ascii_punctuation()).count();
|
|
77
|
+
}
|
|
78
|
+
|
|
79
|
+
score += (numeric_count as f32 / words.len() as f32) * NUMERIC_CONTENT_WEIGHT;
|
|
80
|
+
score += (caps_count as f32 / words.len() as f32) * CAPS_ACRONYM_WEIGHT;
|
|
81
|
+
score += (long_word_count as f32 / words.len() as f32) * LONG_WORD_WEIGHT;
|
|
82
|
+
score += (punct_density as f32 / sentence.len() as f32) * PUNCTUATION_DENSITY_WEIGHT;
|
|
83
|
+
|
|
84
|
+
let estimated_unique = (words.len() as f32 * 0.6).ceil() as usize;
|
|
85
|
+
let mut unique_words: ahash::AHashSet<String> = ahash::AHashSet::with_capacity(estimated_unique.max(10));
|
|
86
|
+
|
|
87
|
+
for w in &words {
|
|
88
|
+
let clean = w
|
|
89
|
+
.chars()
|
|
90
|
+
.filter(|c| c.is_alphabetic())
|
|
91
|
+
.collect::<String>()
|
|
92
|
+
.to_lowercase();
|
|
93
|
+
unique_words.insert(clean);
|
|
94
|
+
|
|
95
|
+
if unique_words.len() >= estimated_unique {
|
|
96
|
+
break;
|
|
97
|
+
}
|
|
98
|
+
}
|
|
99
|
+
|
|
100
|
+
let final_unique_count = if unique_words.len() >= estimated_unique {
|
|
101
|
+
unique_words.len()
|
|
102
|
+
} else {
|
|
103
|
+
for w in &words {
|
|
104
|
+
let clean = w
|
|
105
|
+
.chars()
|
|
106
|
+
.filter(|c| c.is_alphabetic())
|
|
107
|
+
.collect::<String>()
|
|
108
|
+
.to_lowercase();
|
|
109
|
+
unique_words.insert(clean);
|
|
110
|
+
}
|
|
111
|
+
unique_words.len()
|
|
112
|
+
};
|
|
113
|
+
|
|
114
|
+
let diversity_ratio = final_unique_count as f32 / words.len() as f32;
|
|
115
|
+
score += diversity_ratio * DIVERSITY_RATIO_WEIGHT;
|
|
116
|
+
|
|
117
|
+
let char_entropy = Self::calculate_char_entropy(sentence);
|
|
118
|
+
score += char_entropy * CHAR_ENTROPY_WEIGHT;
|
|
119
|
+
|
|
120
|
+
score
|
|
121
|
+
}
|
|
122
|
+
|
|
123
|
+
/// Calculates character entropy (measure of text randomness/information content).
|
|
124
|
+
pub fn calculate_char_entropy(text: &str) -> f32 {
|
|
125
|
+
let chars: Vec<char> = text.chars().collect();
|
|
126
|
+
if chars.is_empty() {
|
|
127
|
+
return 0.0;
|
|
128
|
+
}
|
|
129
|
+
|
|
130
|
+
let estimated_unique = (chars.len() as f32 * 0.1).ceil() as usize;
|
|
131
|
+
let mut char_freq = AHashMap::with_capacity(estimated_unique.max(26));
|
|
132
|
+
|
|
133
|
+
for &ch in &chars {
|
|
134
|
+
let lowercase_ch = ch
|
|
135
|
+
.to_lowercase()
|
|
136
|
+
.next()
|
|
137
|
+
.expect("to_lowercase() must yield at least one character for valid Unicode");
|
|
138
|
+
*char_freq.entry(lowercase_ch).or_insert(0) += 1;
|
|
139
|
+
}
|
|
140
|
+
|
|
141
|
+
let total_chars = chars.len() as f32;
|
|
142
|
+
char_freq
|
|
143
|
+
.values()
|
|
144
|
+
.map(|&freq| {
|
|
145
|
+
let p = freq as f32 / total_chars;
|
|
146
|
+
if p > 0.0 { -p * p.log2() } else { 0.0 }
|
|
147
|
+
})
|
|
148
|
+
.sum::<f32>()
|
|
149
|
+
.min(5.0)
|
|
150
|
+
}
|
|
151
|
+
|
|
152
|
+
/// Checks if a word has important characteristics that should be preserved.
|
|
153
|
+
pub fn has_important_characteristics(word: &str) -> bool {
|
|
154
|
+
if word.len() > 1 && word.chars().all(|c| c.is_uppercase()) {
|
|
155
|
+
return true;
|
|
156
|
+
}
|
|
157
|
+
|
|
158
|
+
if word.chars().any(|c| c.is_numeric()) {
|
|
159
|
+
return true;
|
|
160
|
+
}
|
|
161
|
+
|
|
162
|
+
if word.len() > 10 {
|
|
163
|
+
return true;
|
|
164
|
+
}
|
|
165
|
+
|
|
166
|
+
let uppercase_count = word.chars().filter(|c| c.is_uppercase()).count();
|
|
167
|
+
if uppercase_count > 1 && uppercase_count < word.len() {
|
|
168
|
+
return true;
|
|
169
|
+
}
|
|
170
|
+
|
|
171
|
+
if Self::has_cjk_importance(word) {
|
|
172
|
+
return true;
|
|
173
|
+
}
|
|
174
|
+
|
|
175
|
+
false
|
|
176
|
+
}
|
|
177
|
+
|
|
178
|
+
/// Checks if a CJK word has important characteristics.
///
/// Returns `false` unless `word` contains at least one character in the CJK Unified
/// Ideographs range (U+4E00..=U+9FFF). For such words the result is `true` when either
///
/// * any character of `word` appears in the built-in table of characters treated as
///   important, or
/// * `word` is exactly two characters long and at least one of them lies in
///   U+4E00..=U+51FF, U+6700..=U+68FF or U+7500..=U+76FF.
pub fn has_cjk_importance(word: &str) -> bool {
    // Characters treated as important wherever they occur. Declared `const` so the table
    // is not rebuilt on every call; each character is listed once.
    const IMPORTANT_CHARS: &[char] = &[
        '学', '智', '能', '技', '术', '法', '算', '理', '科', '研', '究', '发', '展', '系', '统', '模', '型', '方',
        '式', '过', '程', '结', '构', '功', '效', '应', '分', '析', '计', '数', '据', '信', '息', '处',
        '语', '言', '文', '生', '成', '产', '用', '作', '为', '变', '化', '转', '换', '提', '高', '网', '络',
        '神', '经', '机', '器', '人', '工', '自', '然', '复',
    ];

    let is_cjk = |c: char| ('\u{4E00}'..='\u{9FFF}').contains(&c);
    if !word.chars().any(is_cjk) {
        return false;
    }

    if word.chars().any(|c| IMPORTANT_CHARS.contains(&c)) {
        return true;
    }

    // Two-character words qualify when either character falls in one of these sub-ranges.
    word.chars().count() == 2
        && word.chars().any(|c| {
            matches!(
                c,
                '\u{4E00}'..='\u{51FF}' | '\u{6700}'..='\u{68FF}' | '\u{7500}'..='\u{76FF}'
            )
        })
}
|
|
216
|
+
}
|
|
217
|
+
|
|
218
|
+
#[cfg(test)]
mod tests {
    use super::*;

    /// A single repeated character must score below 1.0 and below a varied string.
    #[test]
    fn test_calculate_char_entropy() {
        let entropy_of = TextAnalyzer::calculate_char_entropy;

        let low_entropy = entropy_of("aaaaaaa");
        assert!(low_entropy < 1.0);

        let high_entropy = entropy_of("abcdefg123");
        assert!(high_entropy > low_entropy);
    }

    /// All-caps, digit-bearing, mixed-case and long words are all flagged as important.
    #[test]
    fn test_important_word_characteristics() {
        let is_important = TextAnalyzer::has_important_characteristics;

        assert!(is_important("IMPORTANT"));
        assert!(is_important("COVID-19"));
        assert!(is_important("PyTorch"));
        assert!(is_important("verylongword123"));
    }
}
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
use once_cell::sync::Lazy;
|
|
2
|
+
use regex::Regex;
|
|
3
|
+
use std::borrow::Cow;
|
|
4
|
+
|
|
5
|
+
/// Matches a run of two or more consecutive `!` characters. Compiled on first use.
static REPEATED_EXCLAMATION: Lazy<Regex> =
|
|
6
|
+
Lazy::new(|| Regex::new(r"[!]{2,}").expect("Repeated exclamation regex pattern is valid and should compile"));
|
|
7
|
+
/// Matches a run of two or more consecutive `?` characters. Compiled on first use.
static REPEATED_QUESTION: Lazy<Regex> =
|
|
8
|
+
Lazy::new(|| Regex::new(r"[?]{2,}").expect("Repeated question regex pattern is valid and should compile"));
|
|
9
|
+
/// Matches a run of two or more consecutive `,` characters. Compiled on first use.
static REPEATED_COMMA: Lazy<Regex> =
|
|
10
|
+
Lazy::new(|| Regex::new(r"[,]{2,}").expect("Repeated comma regex pattern is valid and should compile"));
|
|
11
|
+
|
|
12
|
+
/// Handles punctuation cleaning and normalization.
|
|
13
|
+
pub struct PunctuationCleaner;
|
|
14
|
+
|
|
15
|
+
impl PunctuationCleaner {
|
|
16
|
+
/// Cleans excessive punctuation from text using optimized Cow pattern.
|
|
17
|
+
pub fn clean_punctuation_optimized(text: &str) -> String {
|
|
18
|
+
let mut result = Cow::Borrowed(text);
|
|
19
|
+
|
|
20
|
+
if REPEATED_EXCLAMATION.is_match(&result) {
|
|
21
|
+
result = Cow::Owned(REPEATED_EXCLAMATION.replace_all(&result, "!").into_owned());
|
|
22
|
+
}
|
|
23
|
+
if REPEATED_QUESTION.is_match(&result) {
|
|
24
|
+
result = Cow::Owned(REPEATED_QUESTION.replace_all(&result, "?").into_owned());
|
|
25
|
+
}
|
|
26
|
+
if REPEATED_COMMA.is_match(&result) {
|
|
27
|
+
result = Cow::Owned(REPEATED_COMMA.replace_all(&result, ",").into_owned());
|
|
28
|
+
}
|
|
29
|
+
|
|
30
|
+
result.into_owned()
|
|
31
|
+
}
|
|
32
|
+
}
|
|
33
|
+
|
|
34
|
+
#[cfg(test)]
mod tests {
    use super::*;

    /// None of the original long punctuation runs may survive cleaning.
    #[test]
    fn test_punctuation_normalization() {
        let cleaned = PunctuationCleaner::clean_punctuation_optimized(
            "Text!!!!!! with????? excessive,,,,,, punctuation",
        );

        assert!(!cleaned.contains("!!!!!!"));
        assert!(!cleaned.contains("?????"));
        assert!(!cleaned.contains(",,,,,,"));
    }

    /// Text without repeated punctuation comes back unchanged.
    #[test]
    fn test_punctuation_no_change() {
        let untouched = "Text with normal punctuation!";
        assert_eq!(PunctuationCleaner::clean_punctuation_optimized(untouched), untouched);
    }
}
|