kreuzberg 4.0.8 → 4.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
- data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
- data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
- data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
- data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
- data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
- data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
- data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
- data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
- data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
- data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
- data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
- data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
- data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
- data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
- data/ext/kreuzberg_rb/native/src/result.rs +326 -0
- data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
- data/lib/kreuzberg/config.rb +66 -0
- data/lib/kreuzberg/result.rb +107 -2
- data/lib/kreuzberg/types.rb +104 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +0 -4
- data/sig/kreuzberg.rbs +105 -1
- data/vendor/Cargo.toml +3 -3
- data/vendor/kreuzberg/Cargo.toml +4 -3
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/api/config.rs +69 -0
- data/vendor/kreuzberg/src/api/handlers.rs +99 -2
- data/vendor/kreuzberg/src/api/mod.rs +14 -7
- data/vendor/kreuzberg/src/api/router.rs +214 -0
- data/vendor/kreuzberg/src/api/startup.rs +243 -0
- data/vendor/kreuzberg/src/api/types.rs +78 -0
- data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
- data/vendor/kreuzberg/src/cache/core.rs +428 -0
- data/vendor/kreuzberg/src/cache/mod.rs +21 -843
- data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
- data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
- data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
- data/vendor/kreuzberg/src/chunking/config.rs +52 -0
- data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
- data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
- data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
- data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
- data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
- data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
- data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
- data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
- data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
- data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
- data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
- data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
- data/vendor/kreuzberg/src/core/config/page.rs +57 -0
- data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
- data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
- data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
- data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
- data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
- data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
- data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
- data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
- data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
- data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
- data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
- data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
- data/vendor/kreuzberg/src/core/mod.rs +4 -1
- data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
- data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
- data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
- data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
- data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
- data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
- data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
- data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
- data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
- data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
- data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
- data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
- data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
- data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
- data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
- data/vendor/kreuzberg/src/embeddings.rs +136 -13
- data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
- data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
- data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
- data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
- data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
- data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
- data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
- data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
- data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
- data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
- data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
- data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
- data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
- data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
- data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
- data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
- data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
- data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
- data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
- data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
- data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
- data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
- data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
- data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
- data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
- data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
- data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
- data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
- data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
- data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
- data/vendor/kreuzberg/src/extractors/email.rs +2 -0
- data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
- data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
- data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
- data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
- data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
- data/vendor/kreuzberg/src/extractors/html.rs +80 -8
- data/vendor/kreuzberg/src/extractors/image.rs +8 -1
- data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
- data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
- data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
- data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
- data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
- data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
- data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
- data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
- data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
- data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
- data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
- data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
- data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
- data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
- data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
- data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
- data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
- data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
- data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
- data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
- data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
- data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
- data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
- data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
- data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
- data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
- data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
- data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
- data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
- data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
- data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
- data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
- data/vendor/kreuzberg/src/extractors/text.rs +4 -0
- data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
- data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
- data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
- data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
- data/vendor/kreuzberg/src/lib.rs +2 -2
- data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
- data/vendor/kreuzberg/src/mcp/format.rs +211 -0
- data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
- data/vendor/kreuzberg/src/mcp/params.rs +196 -0
- data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
- data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
- data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
- data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
- data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
- data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
- data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
- data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
- data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
- data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
- data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
- data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
- data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
- data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
- data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
- data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
- data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
- data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
- data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
- data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
- data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
- data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
- data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
- data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
- data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
- data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
- data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
- data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
- data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
- data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
- data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
- data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
- data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
- data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
- data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
- data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
- data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
- data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
- data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
- data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
- data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
- data/vendor/kreuzberg/src/text/quality.rs +1 -1
- data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
- data/vendor/kreuzberg/src/types/djot.rs +209 -0
- data/vendor/kreuzberg/src/types/extraction.rs +301 -0
- data/vendor/kreuzberg/src/types/formats.rs +443 -0
- data/vendor/kreuzberg/src/types/metadata.rs +560 -0
- data/vendor/kreuzberg/src/types/mod.rs +281 -0
- data/vendor/kreuzberg/src/types/page.rs +182 -0
- data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
- data/vendor/kreuzberg/src/types/tables.rs +39 -0
- data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
- data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
- data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
- data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
- data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
- data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
- data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
- data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
- data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
- data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
- data/vendor/kreuzberg/tests/api_embed.rs +6 -9
- data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
- data/vendor/kreuzberg/tests/core_integration.rs +1 -0
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
- data/vendor/kreuzberg/tests/format_integration.rs +2 -0
- data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
- data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
- data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
- data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
- data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
- data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
- data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
- data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
- data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
- data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
- data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
- data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
- data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
- data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
- data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
- data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
- data/vendor/kreuzberg-ffi/src/error.rs +46 -14
- data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
- data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
- data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
- data/vendor/kreuzberg-ffi/src/result.rs +148 -122
- data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
- data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
- metadata +200 -28
- data/vendor/kreuzberg/src/api/server.rs +0 -518
- data/vendor/kreuzberg/src/core/config.rs +0 -1914
- data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
- data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
- data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
- data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
- data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
- data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
- data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
- data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
- data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
- data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
- data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
- data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
- data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
- data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
- data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
- data/vendor/kreuzberg/src/types.rs +0 -1713
- data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
- data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
|
@@ -1,27 +1,29 @@
|
|
|
1
1
|
use crate::error::{KreuzbergError, Result};
|
|
2
2
|
use crate::stopwords::STOPWORDS;
|
|
3
3
|
use crate::text::token_reduction::config::TokenReductionConfig;
|
|
4
|
-
use crate::text::utf8_validation;
|
|
5
4
|
use ahash::{AHashMap, AHashSet};
|
|
6
|
-
use once_cell::sync::Lazy;
|
|
7
5
|
use regex::Regex;
|
|
8
6
|
use std::sync::Arc;
|
|
9
7
|
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
8
|
+
// Import filter modules
|
|
9
|
+
mod general;
|
|
10
|
+
mod html;
|
|
11
|
+
mod markdown;
|
|
12
|
+
|
|
13
|
+
// Re-export all filter functions for backward compatibility
|
|
14
|
+
pub use general::{normalize_newlines, normalize_spaces, remove_stopwords};
|
|
15
|
+
pub use html::remove_html_comments;
|
|
16
|
+
pub use markdown::{
|
|
17
|
+
extract_and_preserve_code, is_markdown_header, is_markdown_list, is_markdown_table, preserve_markdown_structure,
|
|
18
|
+
restore_preserved_blocks,
|
|
19
|
+
};
|
|
20
|
+
|
|
21
|
+
/// Main filter pipeline orchestrator that coordinates various text filtering operations.
|
|
22
|
+
///
|
|
23
|
+
/// The `FilterPipeline` provides a high-level interface for applying different levels
|
|
24
|
+
/// of text filtering, from light cleaning (HTML comments, whitespace) to moderate
|
|
25
|
+
/// filtering (stopword removal) while respecting preservation rules for code,
|
|
26
|
+
/// markdown, and custom patterns.
|
|
25
27
|
pub struct FilterPipeline {
|
|
26
28
|
config: Arc<TokenReductionConfig>,
|
|
27
29
|
stopwords: AHashSet<String>,
|
|
@@ -30,6 +32,17 @@ pub struct FilterPipeline {
|
|
|
30
32
|
}
|
|
31
33
|
|
|
32
34
|
impl FilterPipeline {
|
|
35
|
+
/// Creates a new `FilterPipeline` with the specified configuration and language.
|
|
36
|
+
///
|
|
37
|
+
/// # Arguments
|
|
38
|
+
/// * `config` - Token reduction configuration
|
|
39
|
+
/// * `language` - Language code for stopword selection (e.g., "en", "es", "de")
|
|
40
|
+
///
|
|
41
|
+
/// # Returns
|
|
42
|
+
/// A `Result` containing the new `FilterPipeline` or an error if regex patterns are invalid
|
|
43
|
+
///
|
|
44
|
+
/// # Errors
|
|
45
|
+
/// Returns a `KreuzbergError::Validation` if any preserve patterns are invalid regex
|
|
33
46
|
pub fn new(config: &Arc<TokenReductionConfig>, language: &str) -> Result<Self> {
|
|
34
47
|
let mut stopwords = STOPWORDS.get(language).cloned().unwrap_or_else(|| {
|
|
35
48
|
STOPWORDS
|
|
@@ -63,256 +76,145 @@ impl FilterPipeline {
|
|
|
63
76
|
})
|
|
64
77
|
}
|
|
65
78
|
|
|
79
|
+
/// Applies light filtering to text, removing HTML comments and normalizing whitespace.
|
|
80
|
+
///
|
|
81
|
+
/// Light filters include:
|
|
82
|
+
/// - HTML comment removal
|
|
83
|
+
/// - Multiple space normalization
|
|
84
|
+
/// - Excessive newline reduction
|
|
85
|
+
/// - Markdown structure preservation (if enabled)
|
|
86
|
+
/// - Code preservation (if enabled)
|
|
87
|
+
///
|
|
88
|
+
/// # Arguments
|
|
89
|
+
/// * `text` - The input text to filter
|
|
90
|
+
///
|
|
91
|
+
/// # Returns
|
|
92
|
+
/// A new `String` with light filters applied
|
|
66
93
|
pub fn apply_light_filters(&self, text: &str) -> String {
|
|
67
94
|
use std::borrow::Cow;
|
|
68
95
|
|
|
69
96
|
let mut result = Cow::Borrowed(text);
|
|
70
97
|
|
|
98
|
+
// Preserve markdown code blocks if configured
|
|
71
99
|
let mut preserved_blocks: Option<AHashMap<String, String>> = None;
|
|
72
100
|
if self.config.preserve_markdown {
|
|
73
101
|
let mut blocks = AHashMap::new();
|
|
74
|
-
result = Cow::Owned(
|
|
102
|
+
result = Cow::Owned(extract_and_preserve_code(result.as_ref(), &mut blocks));
|
|
75
103
|
preserved_blocks = Some(blocks);
|
|
76
104
|
}
|
|
77
105
|
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
}
|
|
81
|
-
|
|
82
|
-
if MULTIPLE_SPACES_REGEX.is_match(&result) {
|
|
83
|
-
result = Cow::Owned(MULTIPLE_SPACES_REGEX.replace_all(&result, " ").into_owned());
|
|
84
|
-
}
|
|
106
|
+
// Remove HTML comments
|
|
107
|
+
result = Cow::Owned(remove_html_comments(&result));
|
|
85
108
|
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
109
|
+
// Normalize whitespace
|
|
110
|
+
result = Cow::Owned(normalize_spaces(&result));
|
|
111
|
+
result = Cow::Owned(normalize_newlines(&result));
|
|
89
112
|
|
|
113
|
+
// Preserve markdown structure if configured
|
|
90
114
|
if self.config.preserve_markdown {
|
|
91
|
-
result = Cow::Owned(
|
|
115
|
+
result = Cow::Owned(preserve_markdown_structure(&result));
|
|
92
116
|
}
|
|
93
117
|
|
|
118
|
+
// Restore preserved code blocks
|
|
94
119
|
if let Some(blocks) = &preserved_blocks {
|
|
95
|
-
result = Cow::Owned(
|
|
120
|
+
result = Cow::Owned(restore_preserved_blocks(&result, blocks));
|
|
96
121
|
}
|
|
97
122
|
|
|
98
123
|
result.into_owned()
|
|
99
124
|
}
|
|
100
125
|
|
|
126
|
+
/// Applies moderate filtering to text, including stopword removal.
|
|
127
|
+
///
|
|
128
|
+
/// Moderate filters include all light filters plus:
|
|
129
|
+
/// - Stopword removal (with markdown awareness if enabled)
|
|
130
|
+
/// - Code preservation during stopword removal
|
|
131
|
+
///
|
|
132
|
+
/// # Arguments
|
|
133
|
+
/// * `text` - The input text to filter
|
|
134
|
+
///
|
|
135
|
+
/// # Returns
|
|
136
|
+
/// A new `String` with moderate filters applied
|
|
101
137
|
pub fn apply_moderate_filters(&self, text: &str) -> String {
|
|
102
138
|
let mut result = self.apply_light_filters(text);
|
|
103
139
|
|
|
140
|
+
// Preserve code blocks during stopword removal if configured
|
|
104
141
|
let mut preserved_blocks: Option<AHashMap<String, String>> = None;
|
|
105
142
|
if self.config.preserve_code {
|
|
106
143
|
let mut blocks = AHashMap::new();
|
|
107
|
-
result =
|
|
144
|
+
result = extract_and_preserve_code(&result, &mut blocks);
|
|
108
145
|
preserved_blocks = Some(blocks);
|
|
109
146
|
}
|
|
110
147
|
|
|
148
|
+
// Remove stopwords with markdown awareness if configured
|
|
111
149
|
if self.config.preserve_markdown {
|
|
112
150
|
result = self.remove_stopwords_preserving_markdown(&result);
|
|
113
151
|
} else {
|
|
114
|
-
result =
|
|
152
|
+
result = remove_stopwords(&result, &self.stopwords, &self.preserve_patterns);
|
|
115
153
|
}
|
|
116
154
|
|
|
155
|
+
// Restore preserved code blocks
|
|
117
156
|
if let Some(blocks) = &preserved_blocks {
|
|
118
|
-
result =
|
|
157
|
+
result = restore_preserved_blocks(&result, blocks);
|
|
119
158
|
}
|
|
120
159
|
|
|
121
160
|
result
|
|
122
161
|
}
|
|
123
162
|
|
|
163
|
+
/// Removes stopwords while preserving markdown structural elements.
|
|
164
|
+
///
|
|
165
|
+
/// This function processes text line-by-line, preserving:
|
|
166
|
+
/// - Markdown headers
|
|
167
|
+
/// - List items
|
|
168
|
+
/// - Table rows
|
|
169
|
+
///
|
|
170
|
+
/// # Arguments
|
|
171
|
+
/// * `text` - The input text to filter
|
|
172
|
+
///
|
|
173
|
+
/// # Returns
|
|
174
|
+
/// A new `String` with stopwords removed but markdown structure preserved
|
|
124
175
|
fn remove_stopwords_preserving_markdown(&self, text: &str) -> String {
|
|
125
176
|
let lines: Vec<&str> = text.lines().collect();
|
|
126
177
|
let mut processed_lines = Vec::with_capacity(lines.len());
|
|
127
178
|
|
|
128
179
|
for line in lines {
|
|
129
|
-
|
|
180
|
+
// Preserve markdown headers
|
|
181
|
+
if is_markdown_header(line) {
|
|
130
182
|
processed_lines.push(line.to_string());
|
|
131
183
|
continue;
|
|
132
184
|
}
|
|
133
185
|
|
|
134
|
-
|
|
186
|
+
// Preserve markdown list items
|
|
187
|
+
if is_markdown_list(line) {
|
|
135
188
|
processed_lines.push(line.to_string());
|
|
136
189
|
continue;
|
|
137
190
|
}
|
|
138
191
|
|
|
139
|
-
|
|
192
|
+
// Preserve markdown table rows
|
|
193
|
+
if is_markdown_table(line) {
|
|
140
194
|
processed_lines.push(line.to_string());
|
|
141
195
|
continue;
|
|
142
196
|
}
|
|
143
197
|
|
|
144
|
-
|
|
198
|
+
// Apply stopword removal to regular text lines
|
|
199
|
+
let processed_line = remove_stopwords(line, &self.stopwords, &self.preserve_patterns);
|
|
145
200
|
processed_lines.push(processed_line);
|
|
146
201
|
}
|
|
147
202
|
|
|
148
203
|
processed_lines.join("\n")
|
|
149
204
|
}
|
|
150
205
|
|
|
151
|
-
|
|
152
|
-
let words: Vec<&str> = text.split_whitespace().collect();
|
|
153
|
-
let mut filtered_words = Vec::with_capacity((words.len() as f32 * 0.7).ceil() as usize);
|
|
154
|
-
|
|
155
|
-
for word in words {
|
|
156
|
-
if word.is_empty() {
|
|
157
|
-
continue;
|
|
158
|
-
}
|
|
159
|
-
|
|
160
|
-
if self.should_preserve_word(word) {
|
|
161
|
-
filtered_words.push(word);
|
|
162
|
-
continue;
|
|
163
|
-
}
|
|
164
|
-
|
|
165
|
-
if word.len() > 1 && word.bytes().all(|b| b.is_ascii_uppercase() || !b.is_ascii_alphabetic()) {
|
|
166
|
-
filtered_words.push(word);
|
|
167
|
-
continue;
|
|
168
|
-
}
|
|
169
|
-
|
|
170
|
-
if word.bytes().any(|b| b.is_ascii_digit()) {
|
|
171
|
-
filtered_words.push(word);
|
|
172
|
-
continue;
|
|
173
|
-
}
|
|
174
|
-
|
|
175
|
-
let clean_word = if word.is_ascii() {
|
|
176
|
-
let clean_bytes: Vec<u8> = word
|
|
177
|
-
.bytes()
|
|
178
|
-
.filter(|&b| b.is_ascii_alphabetic())
|
|
179
|
-
.map(|b| b.to_ascii_lowercase())
|
|
180
|
-
.collect();
|
|
181
|
-
utf8_validation::string_from_utf8(clean_bytes).unwrap_or_else(|_| {
|
|
182
|
-
word.chars()
|
|
183
|
-
.filter(|c| c.is_alphabetic())
|
|
184
|
-
.collect::<String>()
|
|
185
|
-
.to_lowercase()
|
|
186
|
-
})
|
|
187
|
-
} else {
|
|
188
|
-
word.chars()
|
|
189
|
-
.filter(|c| c.is_alphabetic())
|
|
190
|
-
.collect::<String>()
|
|
191
|
-
.to_lowercase()
|
|
192
|
-
};
|
|
193
|
-
|
|
194
|
-
if clean_word.is_empty() {
|
|
195
|
-
filtered_words.push(word);
|
|
196
|
-
continue;
|
|
197
|
-
}
|
|
198
|
-
|
|
199
|
-
if clean_word.len() <= 1 {
|
|
200
|
-
filtered_words.push(word);
|
|
201
|
-
continue;
|
|
202
|
-
}
|
|
203
|
-
|
|
204
|
-
if !self.stopwords.contains(&clean_word) {
|
|
205
|
-
filtered_words.push(word);
|
|
206
|
-
}
|
|
207
|
-
}
|
|
208
|
-
|
|
209
|
-
filtered_words.join(" ")
|
|
210
|
-
}
|
|
211
|
-
|
|
212
|
-
/// Get the language code for this filter pipeline.
|
|
206
|
+
/// Gets the language code for this filter pipeline.
|
|
213
207
|
///
|
|
214
208
|
/// Primarily useful for testing and debugging to verify language configuration.
|
|
215
209
|
#[cfg_attr(not(test), allow(dead_code))]
|
|
216
210
|
pub fn language(&self) -> &str {
|
|
217
211
|
&self.language
|
|
218
212
|
}
|
|
219
|
-
|
|
220
|
-
/// Check if a word should be preserved based on configured patterns.
|
|
221
|
-
fn should_preserve_word(&self, word: &str) -> bool {
|
|
222
|
-
self.preserve_patterns.iter().any(|pattern| pattern.is_match(word))
|
|
223
|
-
}
|
|
224
|
-
|
|
225
|
-
/// Split a word into prefix (non-alphanumeric), core (alphanumeric), and suffix (non-alphanumeric).
|
|
226
|
-
///
|
|
227
|
-
/// This is useful for handling punctuation-wrapped words like "(hello)" or "world!".
|
|
228
|
-
/// Currently used in tests; reserved for future word boundary-aware filtering.
|
|
229
|
-
#[cfg_attr(not(test), allow(dead_code))]
|
|
230
|
-
fn split_word_boundaries(&self, word: &str) -> (String, String, String) {
|
|
231
|
-
let chars: Vec<char> = word.chars().collect();
|
|
232
|
-
let mut start = 0;
|
|
233
|
-
let mut end = chars.len();
|
|
234
|
-
|
|
235
|
-
while start < chars.len() && !chars[start].is_alphanumeric() {
|
|
236
|
-
start += 1;
|
|
237
|
-
}
|
|
238
|
-
|
|
239
|
-
while end > start && !chars[end - 1].is_alphanumeric() {
|
|
240
|
-
end -= 1;
|
|
241
|
-
}
|
|
242
|
-
|
|
243
|
-
let prefix: String = chars[..start].iter().collect();
|
|
244
|
-
let core: String = chars[start..end].iter().collect();
|
|
245
|
-
let suffix: String = chars[end..].iter().collect();
|
|
246
|
-
|
|
247
|
-
(prefix, core, suffix)
|
|
248
|
-
}
|
|
249
|
-
|
|
250
|
-
fn preserve_markdown_structure(&self, text: &str) -> String {
|
|
251
|
-
let lines: Vec<&str> = text.lines().collect();
|
|
252
|
-
let mut processed_lines = Vec::with_capacity(lines.len());
|
|
253
|
-
|
|
254
|
-
for line in lines {
|
|
255
|
-
if MARKDOWN_HEADERS_REGEX.is_match(line) {
|
|
256
|
-
processed_lines.push(line);
|
|
257
|
-
continue;
|
|
258
|
-
}
|
|
259
|
-
|
|
260
|
-
if MARKDOWN_LISTS_REGEX.is_match(line) {
|
|
261
|
-
processed_lines.push(line);
|
|
262
|
-
continue;
|
|
263
|
-
}
|
|
264
|
-
|
|
265
|
-
processed_lines.push(line);
|
|
266
|
-
}
|
|
267
|
-
|
|
268
|
-
processed_lines.join("\n")
|
|
269
|
-
}
|
|
270
|
-
|
|
271
|
-
fn extract_and_preserve_code(&self, text: &str, preserved: &mut AHashMap<String, String>) -> String {
|
|
272
|
-
let mut result = text.to_string();
|
|
273
|
-
let mut code_block_id = 0;
|
|
274
|
-
let mut inline_code_id = 0;
|
|
275
|
-
|
|
276
|
-
result = MARKDOWN_CODE_BLOCK_REGEX
|
|
277
|
-
.replace_all(&result, |caps: &regex::Captures| {
|
|
278
|
-
let code_block = caps[0].to_string();
|
|
279
|
-
let placeholder = format!("__CODEBLOCK_{}__", code_block_id);
|
|
280
|
-
code_block_id += 1;
|
|
281
|
-
preserved.insert(placeholder.clone(), code_block);
|
|
282
|
-
placeholder
|
|
283
|
-
})
|
|
284
|
-
.to_string();
|
|
285
|
-
|
|
286
|
-
result = MARKDOWN_INLINE_CODE_REGEX
|
|
287
|
-
.replace_all(&result, |caps: &regex::Captures| {
|
|
288
|
-
let inline_code = caps[0].to_string();
|
|
289
|
-
let placeholder = format!("__INLINECODE_{}__", inline_code_id);
|
|
290
|
-
inline_code_id += 1;
|
|
291
|
-
preserved.insert(placeholder.clone(), inline_code);
|
|
292
|
-
placeholder
|
|
293
|
-
})
|
|
294
|
-
.to_string();
|
|
295
|
-
|
|
296
|
-
result
|
|
297
|
-
}
|
|
298
|
-
|
|
299
|
-
fn restore_preserved_blocks(&self, text: &str, preserved: &AHashMap<String, String>) -> String {
|
|
300
|
-
if preserved.is_empty() {
|
|
301
|
-
return text.to_string();
|
|
302
|
-
}
|
|
303
|
-
|
|
304
|
-
let mut result = text.to_string();
|
|
305
|
-
|
|
306
|
-
for (placeholder, original_content) in preserved {
|
|
307
|
-
result = result.replace(placeholder, original_content);
|
|
308
|
-
}
|
|
309
|
-
|
|
310
|
-
result
|
|
311
|
-
}
|
|
312
213
|
}
|
|
313
214
|
|
|
314
215
|
#[cfg(all(test, feature = "stopwords"))]
|
|
315
216
|
mod tests {
|
|
217
|
+
use super::general::split_word_boundaries;
|
|
316
218
|
use super::*;
|
|
317
219
|
|
|
318
220
|
#[test]
|
|
@@ -321,7 +223,7 @@ mod tests {
|
|
|
321
223
|
let pipeline = FilterPipeline::new(&config, "en").unwrap();
|
|
322
224
|
|
|
323
225
|
let input = "The quick brown fox is jumping over the lazy dog";
|
|
324
|
-
let result =
|
|
226
|
+
let result = remove_stopwords(input, &pipeline.stopwords, &pipeline.preserve_patterns);
|
|
325
227
|
|
|
326
228
|
assert!(!result.contains(" the "));
|
|
327
229
|
assert!(!result.contains(" is "));
|
|
@@ -341,7 +243,7 @@ mod tests {
|
|
|
341
243
|
let pipeline = FilterPipeline::new(&config, "en").unwrap();
|
|
342
244
|
|
|
343
245
|
let input = "The NASA mission is a success";
|
|
344
|
-
let result =
|
|
246
|
+
let result = remove_stopwords(input, &pipeline.stopwords, &pipeline.preserve_patterns);
|
|
345
247
|
|
|
346
248
|
assert!(result.contains("NASA"));
|
|
347
249
|
assert!(result.contains("mission"));
|
|
@@ -411,7 +313,7 @@ mod tests {
|
|
|
411
313
|
let pipeline = FilterPipeline::new(&config, "en").unwrap();
|
|
412
314
|
|
|
413
315
|
let input = "The API is working WITH the SDK";
|
|
414
|
-
let result =
|
|
316
|
+
let result = remove_stopwords(input, &pipeline.stopwords, &pipeline.preserve_patterns);
|
|
415
317
|
|
|
416
318
|
assert!(result.contains("API"));
|
|
417
319
|
assert!(result.contains("SDK"));
|
|
@@ -426,7 +328,7 @@ mod tests {
|
|
|
426
328
|
let pipeline = FilterPipeline::new(&config, "en").unwrap();
|
|
427
329
|
|
|
428
330
|
let input = "The version is 3.14 and the count is 42";
|
|
429
|
-
let result =
|
|
331
|
+
let result = remove_stopwords(input, &pipeline.stopwords, &pipeline.preserve_patterns);
|
|
430
332
|
|
|
431
333
|
assert!(result.contains("3.14"));
|
|
432
334
|
assert!(result.contains("42"));
|
|
@@ -441,7 +343,7 @@ mod tests {
|
|
|
441
343
|
let pipeline = FilterPipeline::new(&config, "en").unwrap();
|
|
442
344
|
|
|
443
345
|
let input = "Hello, the world! This is great.";
|
|
444
|
-
let result =
|
|
346
|
+
let result = remove_stopwords(input, &pipeline.stopwords, &pipeline.preserve_patterns);
|
|
445
347
|
|
|
446
348
|
assert!(result.contains("Hello,"));
|
|
447
349
|
assert!(result.contains("world!"));
|
|
@@ -465,7 +367,7 @@ mod tests {
|
|
|
465
367
|
let pipeline = FilterPipeline::new(&config, "en").unwrap();
|
|
466
368
|
|
|
467
369
|
let input = "This is a custom word test";
|
|
468
|
-
let result =
|
|
370
|
+
let result = remove_stopwords(input, &pipeline.stopwords, &pipeline.preserve_patterns);
|
|
469
371
|
|
|
470
372
|
assert!(!result.contains("custom"));
|
|
471
373
|
assert!(!result.contains("word"));
|
|
@@ -478,7 +380,7 @@ mod tests {
|
|
|
478
380
|
let pipeline = FilterPipeline::new(&config, "es").unwrap();
|
|
479
381
|
|
|
480
382
|
let input = "El perro grande bonito tiene";
|
|
481
|
-
let result =
|
|
383
|
+
let result = remove_stopwords(input, &pipeline.stopwords, &pipeline.preserve_patterns);
|
|
482
384
|
|
|
483
385
|
assert!(result.contains("perro"));
|
|
484
386
|
assert!(result.contains("grande"));
|
|
@@ -495,7 +397,7 @@ mod tests {
|
|
|
495
397
|
let pipeline = FilterPipeline::new(&config, "unknown").unwrap();
|
|
496
398
|
|
|
497
399
|
let input = "The quick test with unknown language";
|
|
498
|
-
let result =
|
|
400
|
+
let result = remove_stopwords(input, &pipeline.stopwords, &pipeline.preserve_patterns);
|
|
499
401
|
|
|
500
402
|
assert!(!result.contains("The "));
|
|
501
403
|
assert!(result.contains("quick"));
|
|
@@ -561,11 +463,11 @@ mod tests {
|
|
|
561
463
|
preserve_code: true,
|
|
562
464
|
..Default::default()
|
|
563
465
|
});
|
|
564
|
-
let
|
|
466
|
+
let _pipeline = FilterPipeline::new(&config, "en").unwrap();
|
|
565
467
|
|
|
566
468
|
let mut preserved = AHashMap::new();
|
|
567
469
|
let input = "Text before\n```rust\nfn main() {}\n```\nText after";
|
|
568
|
-
let result =
|
|
470
|
+
let result = extract_and_preserve_code(input, &mut preserved);
|
|
569
471
|
|
|
570
472
|
assert_eq!(preserved.len(), 1);
|
|
571
473
|
assert!(preserved.values().any(|v| v.contains("fn main()")));
|
|
@@ -578,11 +480,11 @@ mod tests {
|
|
|
578
480
|
preserve_code: true,
|
|
579
481
|
..Default::default()
|
|
580
482
|
});
|
|
581
|
-
let
|
|
483
|
+
let _pipeline = FilterPipeline::new(&config, "en").unwrap();
|
|
582
484
|
|
|
583
485
|
let mut preserved = AHashMap::new();
|
|
584
486
|
let input = "Use the `println!` macro";
|
|
585
|
-
let result =
|
|
487
|
+
let result = extract_and_preserve_code(input, &mut preserved);
|
|
586
488
|
|
|
587
489
|
assert_eq!(preserved.len(), 1);
|
|
588
490
|
assert!(preserved.values().any(|v| v == "`println!`"));
|
|
@@ -592,13 +494,13 @@ mod tests {
|
|
|
592
494
|
#[test]
|
|
593
495
|
fn test_restore_preserved_blocks() {
|
|
594
496
|
let config = Arc::new(TokenReductionConfig::default());
|
|
595
|
-
let
|
|
497
|
+
let _pipeline = FilterPipeline::new(&config, "en").unwrap();
|
|
596
498
|
|
|
597
499
|
let mut preserved = AHashMap::new();
|
|
598
500
|
preserved.insert("__CODEBLOCK_0__".to_string(), "```code```".to_string());
|
|
599
501
|
preserved.insert("__INLINECODE_0__".to_string(), "`inline`".to_string());
|
|
600
502
|
let input = "Text __CODEBLOCK_0__ and __INLINECODE_0__ here";
|
|
601
|
-
let result =
|
|
503
|
+
let result = restore_preserved_blocks(input, &preserved);
|
|
602
504
|
|
|
603
505
|
assert!(result.contains("```code```"));
|
|
604
506
|
assert!(result.contains("`inline`"));
|
|
@@ -654,7 +556,7 @@ mod tests {
|
|
|
654
556
|
let pipeline = FilterPipeline::new(&config, "en").unwrap();
|
|
655
557
|
|
|
656
558
|
let input = "I a x test";
|
|
657
|
-
let result =
|
|
559
|
+
let result = remove_stopwords(input, &pipeline.stopwords, &pipeline.preserve_patterns);
|
|
658
560
|
|
|
659
561
|
assert!(result.contains("I"));
|
|
660
562
|
assert!(result.contains("x"));
|
|
@@ -667,7 +569,7 @@ mod tests {
|
|
|
667
569
|
let pipeline = FilterPipeline::new(&config, "en").unwrap();
|
|
668
570
|
|
|
669
571
|
let input = "The Test Is Working";
|
|
670
|
-
let result =
|
|
572
|
+
let result = remove_stopwords(input, &pipeline.stopwords, &pipeline.preserve_patterns);
|
|
671
573
|
|
|
672
574
|
assert!(!result.contains("The"));
|
|
673
575
|
assert!(!result.contains("Is"));
|
|
@@ -675,29 +577,18 @@ mod tests {
|
|
|
675
577
|
assert!(result.contains("Working"));
|
|
676
578
|
}
|
|
677
579
|
|
|
678
|
-
#[test]
|
|
679
|
-
fn test_lazy_regex_initialization() {
|
|
680
|
-
let _ = &*HTML_COMMENT_REGEX;
|
|
681
|
-
let _ = &*EXCESSIVE_NEWLINES_REGEX;
|
|
682
|
-
let _ = &*MULTIPLE_SPACES_REGEX;
|
|
683
|
-
let _ = &*MARKDOWN_CODE_BLOCK_REGEX;
|
|
684
|
-
let _ = &*MARKDOWN_INLINE_CODE_REGEX;
|
|
685
|
-
let _ = &*MARKDOWN_HEADERS_REGEX;
|
|
686
|
-
let _ = &*MARKDOWN_LISTS_REGEX;
|
|
687
|
-
}
|
|
688
|
-
|
|
689
580
|
#[test]
|
|
690
581
|
fn test_multiple_code_blocks_hashmap_approach() {
|
|
691
582
|
let config = Arc::new(TokenReductionConfig {
|
|
692
583
|
preserve_code: true,
|
|
693
584
|
..Default::default()
|
|
694
585
|
});
|
|
695
|
-
let
|
|
586
|
+
let _pipeline = FilterPipeline::new(&config, "en").unwrap();
|
|
696
587
|
|
|
697
588
|
let input =
|
|
698
589
|
"Start ```rust\nlet x = 1;\n``` middle `inline1` text ```python\nprint('hi')\n``` and `inline2` end";
|
|
699
590
|
let mut preserved = AHashMap::new();
|
|
700
|
-
let result =
|
|
591
|
+
let result = extract_and_preserve_code(input, &mut preserved);
|
|
701
592
|
|
|
702
593
|
assert_eq!(preserved.len(), 4);
|
|
703
594
|
assert!(preserved.contains_key("__CODEBLOCK_0__"));
|
|
@@ -710,7 +601,7 @@ mod tests {
|
|
|
710
601
|
assert_eq!(preserved.get("__INLINECODE_0__").unwrap(), "`inline1`");
|
|
711
602
|
assert_eq!(preserved.get("__INLINECODE_1__").unwrap(), "`inline2`");
|
|
712
603
|
|
|
713
|
-
let restored =
|
|
604
|
+
let restored = restore_preserved_blocks(&result, &preserved);
|
|
714
605
|
assert!(restored.contains("```rust\nlet x = 1;\n```"));
|
|
715
606
|
assert!(restored.contains("```python\nprint('hi')\n```"));
|
|
716
607
|
assert!(restored.contains("`inline1`"));
|
|
@@ -725,14 +616,14 @@ mod tests {
|
|
|
725
616
|
preserve_code: true,
|
|
726
617
|
..Default::default()
|
|
727
618
|
});
|
|
728
|
-
let
|
|
619
|
+
let _pipeline = FilterPipeline::new(&config, "en").unwrap();
|
|
729
620
|
|
|
730
621
|
let input = "Text `a` and `b` and `c` here";
|
|
731
622
|
let mut preserved = AHashMap::new();
|
|
732
|
-
let result =
|
|
623
|
+
let result = extract_and_preserve_code(input, &mut preserved);
|
|
733
624
|
|
|
734
625
|
assert_eq!(preserved.len(), 3);
|
|
735
|
-
let restored =
|
|
626
|
+
let restored = restore_preserved_blocks(&result, &preserved);
|
|
736
627
|
|
|
737
628
|
assert!(restored.contains("`a`"));
|
|
738
629
|
assert!(restored.contains("`b`"));
|
|
@@ -755,7 +646,7 @@ mod tests {
|
|
|
755
646
|
let pipeline = FilterPipeline::new(&config, "en").unwrap();
|
|
756
647
|
|
|
757
648
|
let input = "The NASA and HTTP protocols version 1.2.3 by @john";
|
|
758
|
-
let result =
|
|
649
|
+
let result = remove_stopwords(input, &pipeline.stopwords, &pipeline.preserve_patterns);
|
|
759
650
|
|
|
760
651
|
assert!(result.contains("NASA"));
|
|
761
652
|
assert!(result.contains("HTTP"));
|
|
@@ -774,7 +665,7 @@ mod tests {
|
|
|
774
665
|
assert_eq!(pipeline_en.language(), "en");
|
|
775
666
|
|
|
776
667
|
let input_en = "the quick brown fox";
|
|
777
|
-
let result_en =
|
|
668
|
+
let result_en = remove_stopwords(input_en, &pipeline_en.stopwords, &pipeline_en.preserve_patterns);
|
|
778
669
|
assert!(!result_en.contains(" the "));
|
|
779
670
|
|
|
780
671
|
let config_de = Arc::new(TokenReductionConfig::default());
|
|
@@ -782,7 +673,7 @@ mod tests {
|
|
|
782
673
|
assert_eq!(pipeline_de.language(), "de");
|
|
783
674
|
|
|
784
675
|
let input_de = "der schnelle braune fuchs";
|
|
785
|
-
let result_de =
|
|
676
|
+
let result_de = remove_stopwords(input_de, &pipeline_de.stopwords, &pipeline_de.preserve_patterns);
|
|
786
677
|
assert!(!result_de.contains(" der "));
|
|
787
678
|
assert!(result_de.contains("schnelle"));
|
|
788
679
|
}
|
|
@@ -795,7 +686,7 @@ mod tests {
|
|
|
795
686
|
assert_eq!(pipeline.language(), "unsupported_lang");
|
|
796
687
|
|
|
797
688
|
let input = "the quick brown fox";
|
|
798
|
-
let result =
|
|
689
|
+
let result = remove_stopwords(input, &pipeline.stopwords, &pipeline.preserve_patterns);
|
|
799
690
|
|
|
800
691
|
assert!(!result.contains(" the "));
|
|
801
692
|
assert!(result.contains("quick"));
|
|
@@ -803,30 +694,27 @@ mod tests {
|
|
|
803
694
|
|
|
804
695
|
#[test]
|
|
805
696
|
fn test_split_word_boundaries() {
|
|
806
|
-
let
|
|
807
|
-
let pipeline = FilterPipeline::new(&config, "en").unwrap();
|
|
808
|
-
|
|
809
|
-
let (prefix, core, suffix) = pipeline.split_word_boundaries("(hello)");
|
|
697
|
+
let (prefix, core, suffix) = split_word_boundaries("(hello)");
|
|
810
698
|
assert_eq!(prefix, "(");
|
|
811
699
|
assert_eq!(core, "hello");
|
|
812
700
|
assert_eq!(suffix, ")");
|
|
813
701
|
|
|
814
|
-
let (prefix2, core2, suffix2) =
|
|
702
|
+
let (prefix2, core2, suffix2) = split_word_boundaries("world!");
|
|
815
703
|
assert_eq!(prefix2, "");
|
|
816
704
|
assert_eq!(core2, "world");
|
|
817
705
|
assert_eq!(suffix2, "!");
|
|
818
706
|
|
|
819
|
-
let (prefix3, core3, suffix3) =
|
|
707
|
+
let (prefix3, core3, suffix3) = split_word_boundaries("'test");
|
|
820
708
|
assert_eq!(prefix3, "'");
|
|
821
709
|
assert_eq!(core3, "test");
|
|
822
710
|
assert_eq!(suffix3, "");
|
|
823
711
|
|
|
824
|
-
let (prefix4, core4, suffix4) =
|
|
712
|
+
let (prefix4, core4, suffix4) = split_word_boundaries("simple");
|
|
825
713
|
assert_eq!(prefix4, "");
|
|
826
714
|
assert_eq!(core4, "simple");
|
|
827
715
|
assert_eq!(suffix4, "");
|
|
828
716
|
|
|
829
|
-
let (prefix5, core5, suffix5) =
|
|
717
|
+
let (prefix5, core5, suffix5) = split_word_boundaries("\"example!!!\"");
|
|
830
718
|
assert_eq!(prefix5, "\"");
|
|
831
719
|
assert_eq!(core5, "example");
|
|
832
720
|
assert_eq!(suffix5, "!!!\"");
|
|
@@ -834,25 +722,22 @@ mod tests {
|
|
|
834
722
|
|
|
835
723
|
#[test]
|
|
836
724
|
fn test_split_word_boundaries_edge_cases() {
|
|
837
|
-
let
|
|
838
|
-
let pipeline = FilterPipeline::new(&config, "en").unwrap();
|
|
839
|
-
|
|
840
|
-
let (prefix, core, suffix) = pipeline.split_word_boundaries("!!!");
|
|
725
|
+
let (prefix, core, suffix) = split_word_boundaries("!!!");
|
|
841
726
|
assert_eq!(prefix, "!!!");
|
|
842
727
|
assert_eq!(core, "");
|
|
843
728
|
assert_eq!(suffix, "");
|
|
844
729
|
|
|
845
|
-
let (prefix2, core2, suffix2) =
|
|
730
|
+
let (prefix2, core2, suffix2) = split_word_boundaries("");
|
|
846
731
|
assert_eq!(prefix2, "");
|
|
847
732
|
assert_eq!(core2, "");
|
|
848
733
|
assert_eq!(suffix2, "");
|
|
849
734
|
|
|
850
|
-
let (prefix3, core3, suffix3) =
|
|
735
|
+
let (prefix3, core3, suffix3) = split_word_boundaries("a");
|
|
851
736
|
assert_eq!(prefix3, "");
|
|
852
737
|
assert_eq!(core3, "a");
|
|
853
738
|
assert_eq!(suffix3, "");
|
|
854
739
|
|
|
855
|
-
let (prefix4, core4, suffix4) =
|
|
740
|
+
let (prefix4, core4, suffix4) = split_word_boundaries("(café)");
|
|
856
741
|
assert_eq!(prefix4, "(");
|
|
857
742
|
assert_eq!(core4, "café");
|
|
858
743
|
assert_eq!(suffix4, ")");
|
|
@@ -874,7 +759,7 @@ mod tests {
|
|
|
874
759
|
let pipeline = FilterPipeline::new(&config, "en").unwrap();
|
|
875
760
|
|
|
876
761
|
let input = "this is a custom stopword test";
|
|
877
|
-
let result =
|
|
762
|
+
let result = remove_stopwords(input, &pipeline.stopwords, &pipeline.preserve_patterns);
|
|
878
763
|
|
|
879
764
|
assert!(!result.contains(" custom "));
|
|
880
765
|
assert!(!result.contains(" stopword "));
|
|
@@ -894,7 +779,7 @@ mod tests {
|
|
|
894
779
|
let pipeline = FilterPipeline::new(&config, "en").unwrap();
|
|
895
780
|
|
|
896
781
|
let input = "The quick brown fox";
|
|
897
|
-
let result =
|
|
782
|
+
let result = remove_stopwords(input, &pipeline.stopwords, &pipeline.preserve_patterns);
|
|
898
783
|
|
|
899
784
|
assert!(!result.contains(" The "));
|
|
900
785
|
assert!(result.contains("quick"));
|