kreuzberg 4.0.8 → 4.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
- data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
- data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
- data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
- data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
- data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
- data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
- data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
- data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
- data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
- data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
- data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
- data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
- data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
- data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
- data/ext/kreuzberg_rb/native/src/result.rs +326 -0
- data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
- data/lib/kreuzberg/config.rb +66 -0
- data/lib/kreuzberg/result.rb +107 -2
- data/lib/kreuzberg/types.rb +104 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +0 -4
- data/sig/kreuzberg.rbs +105 -1
- data/vendor/Cargo.toml +3 -3
- data/vendor/kreuzberg/Cargo.toml +4 -3
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/api/config.rs +69 -0
- data/vendor/kreuzberg/src/api/handlers.rs +99 -2
- data/vendor/kreuzberg/src/api/mod.rs +14 -7
- data/vendor/kreuzberg/src/api/router.rs +214 -0
- data/vendor/kreuzberg/src/api/startup.rs +243 -0
- data/vendor/kreuzberg/src/api/types.rs +78 -0
- data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
- data/vendor/kreuzberg/src/cache/core.rs +428 -0
- data/vendor/kreuzberg/src/cache/mod.rs +21 -843
- data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
- data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
- data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
- data/vendor/kreuzberg/src/chunking/config.rs +52 -0
- data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
- data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
- data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
- data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
- data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
- data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
- data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
- data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
- data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
- data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
- data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
- data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
- data/vendor/kreuzberg/src/core/config/page.rs +57 -0
- data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
- data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
- data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
- data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
- data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
- data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
- data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
- data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
- data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
- data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
- data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
- data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
- data/vendor/kreuzberg/src/core/mod.rs +4 -1
- data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
- data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
- data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
- data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
- data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
- data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
- data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
- data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
- data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
- data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
- data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
- data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
- data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
- data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
- data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
- data/vendor/kreuzberg/src/embeddings.rs +136 -13
- data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
- data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
- data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
- data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
- data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
- data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
- data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
- data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
- data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
- data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
- data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
- data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
- data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
- data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
- data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
- data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
- data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
- data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
- data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
- data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
- data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
- data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
- data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
- data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
- data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
- data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
- data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
- data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
- data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
- data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
- data/vendor/kreuzberg/src/extractors/email.rs +2 -0
- data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
- data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
- data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
- data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
- data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
- data/vendor/kreuzberg/src/extractors/html.rs +80 -8
- data/vendor/kreuzberg/src/extractors/image.rs +8 -1
- data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
- data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
- data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
- data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
- data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
- data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
- data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
- data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
- data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
- data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
- data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
- data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
- data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
- data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
- data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
- data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
- data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
- data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
- data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
- data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
- data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
- data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
- data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
- data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
- data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
- data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
- data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
- data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
- data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
- data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
- data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
- data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
- data/vendor/kreuzberg/src/extractors/text.rs +4 -0
- data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
- data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
- data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
- data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
- data/vendor/kreuzberg/src/lib.rs +2 -2
- data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
- data/vendor/kreuzberg/src/mcp/format.rs +211 -0
- data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
- data/vendor/kreuzberg/src/mcp/params.rs +196 -0
- data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
- data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
- data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
- data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
- data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
- data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
- data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
- data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
- data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
- data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
- data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
- data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
- data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
- data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
- data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
- data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
- data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
- data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
- data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
- data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
- data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
- data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
- data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
- data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
- data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
- data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
- data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
- data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
- data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
- data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
- data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
- data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
- data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
- data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
- data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
- data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
- data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
- data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
- data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
- data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
- data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
- data/vendor/kreuzberg/src/text/quality.rs +1 -1
- data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
- data/vendor/kreuzberg/src/types/djot.rs +209 -0
- data/vendor/kreuzberg/src/types/extraction.rs +301 -0
- data/vendor/kreuzberg/src/types/formats.rs +443 -0
- data/vendor/kreuzberg/src/types/metadata.rs +560 -0
- data/vendor/kreuzberg/src/types/mod.rs +281 -0
- data/vendor/kreuzberg/src/types/page.rs +182 -0
- data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
- data/vendor/kreuzberg/src/types/tables.rs +39 -0
- data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
- data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
- data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
- data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
- data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
- data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
- data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
- data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
- data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
- data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
- data/vendor/kreuzberg/tests/api_embed.rs +6 -9
- data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
- data/vendor/kreuzberg/tests/core_integration.rs +1 -0
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
- data/vendor/kreuzberg/tests/format_integration.rs +2 -0
- data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
- data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
- data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
- data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
- data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
- data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
- data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
- data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
- data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
- data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
- data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
- data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
- data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
- data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
- data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
- data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
- data/vendor/kreuzberg-ffi/src/error.rs +46 -14
- data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
- data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
- data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
- data/vendor/kreuzberg-ffi/src/result.rs +148 -122
- data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
- data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
- metadata +200 -28
- data/vendor/kreuzberg/src/api/server.rs +0 -518
- data/vendor/kreuzberg/src/core/config.rs +0 -1914
- data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
- data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
- data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
- data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
- data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
- data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
- data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
- data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
- data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
- data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
- data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
- data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
- data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
- data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
- data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
- data/vendor/kreuzberg/src/types.rs +0 -1713
- data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
- data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
|
@@ -0,0 +1,377 @@
|
|
|
1
|
+
use crate::text::utf8_validation;
|
|
2
|
+
use ahash::AHashSet;
|
|
3
|
+
use once_cell::sync::Lazy;
|
|
4
|
+
use regex::Regex;
|
|
5
|
+
|
|
6
|
+
/// Regular expression for matching excessive newlines (3 or more consecutive newlines).
|
|
7
|
+
static EXCESSIVE_NEWLINES_REGEX: Lazy<Regex> =
|
|
8
|
+
Lazy::new(|| Regex::new(r"\n{3,}").expect("Excessive newlines regex pattern is valid and should compile"));
|
|
9
|
+
|
|
10
|
+
/// Regular expression for matching multiple consecutive spaces (2 or more).
|
|
11
|
+
static MULTIPLE_SPACES_REGEX: Lazy<Regex> =
|
|
12
|
+
Lazy::new(|| Regex::new(r" {2,}").expect("Multiple spaces regex pattern is valid and should compile"));
|
|
13
|
+
|
|
14
|
+
/// Normalizes whitespace in text by collapsing multiple spaces into a single space.
|
|
15
|
+
///
|
|
16
|
+
/// # Arguments
|
|
17
|
+
/// * `text` - The input text with potentially multiple consecutive spaces
|
|
18
|
+
///
|
|
19
|
+
/// # Returns
|
|
20
|
+
/// A new `String` with multiple spaces collapsed to single spaces
|
|
21
|
+
pub fn normalize_spaces(text: &str) -> String {
|
|
22
|
+
if MULTIPLE_SPACES_REGEX.is_match(text) {
|
|
23
|
+
MULTIPLE_SPACES_REGEX.replace_all(text, " ").into_owned()
|
|
24
|
+
} else {
|
|
25
|
+
text.to_string()
|
|
26
|
+
}
|
|
27
|
+
}
|
|
28
|
+
|
|
29
|
+
/// Reduces excessive newlines in text by collapsing 3+ consecutive newlines into 2.
|
|
30
|
+
///
|
|
31
|
+
/// # Arguments
|
|
32
|
+
/// * `text` - The input text with potentially excessive newlines
|
|
33
|
+
///
|
|
34
|
+
/// # Returns
|
|
35
|
+
/// A new `String` with excessive newlines normalized to at most 2 consecutive newlines
|
|
36
|
+
pub fn normalize_newlines(text: &str) -> String {
|
|
37
|
+
if EXCESSIVE_NEWLINES_REGEX.is_match(text) {
|
|
38
|
+
EXCESSIVE_NEWLINES_REGEX.replace_all(text, "\n\n").into_owned()
|
|
39
|
+
} else {
|
|
40
|
+
text.to_string()
|
|
41
|
+
}
|
|
42
|
+
}
|
|
43
|
+
|
|
44
|
+
/// Removes stopwords from text while preserving important patterns.
|
|
45
|
+
///
|
|
46
|
+
/// This function intelligently filters out common stopwords while preserving:
|
|
47
|
+
/// - All-uppercase words (acronyms)
|
|
48
|
+
/// - Words containing digits
|
|
49
|
+
/// - Words matching custom preserve patterns
|
|
50
|
+
/// - Single-letter words
|
|
51
|
+
/// - Words with non-alphabetic characters
|
|
52
|
+
///
|
|
53
|
+
/// # Arguments
|
|
54
|
+
/// * `text` - The input text to filter
|
|
55
|
+
/// * `stopwords` - Set of stopwords to remove (should be lowercase)
|
|
56
|
+
/// * `preserve_patterns` - Regex patterns for words that should never be removed
|
|
57
|
+
///
|
|
58
|
+
/// # Returns
|
|
59
|
+
/// A new `String` with stopwords removed
|
|
60
|
+
pub fn remove_stopwords(text: &str, stopwords: &AHashSet<String>, preserve_patterns: &[Regex]) -> String {
|
|
61
|
+
let words: Vec<&str> = text.split_whitespace().collect();
|
|
62
|
+
let mut filtered_words = Vec::with_capacity((words.len() as f32 * 0.7).ceil() as usize);
|
|
63
|
+
|
|
64
|
+
for word in words {
|
|
65
|
+
if word.is_empty() {
|
|
66
|
+
continue;
|
|
67
|
+
}
|
|
68
|
+
|
|
69
|
+
// Check preserve patterns first
|
|
70
|
+
if should_preserve_word(word, preserve_patterns) {
|
|
71
|
+
filtered_words.push(word);
|
|
72
|
+
continue;
|
|
73
|
+
}
|
|
74
|
+
|
|
75
|
+
// Preserve all-uppercase words (acronyms like API, SDK, HTTP)
|
|
76
|
+
if word.len() > 1 && word.bytes().all(|b| b.is_ascii_uppercase() || !b.is_ascii_alphabetic()) {
|
|
77
|
+
filtered_words.push(word);
|
|
78
|
+
continue;
|
|
79
|
+
}
|
|
80
|
+
|
|
81
|
+
// Preserve words containing digits (version numbers, counts, etc.)
|
|
82
|
+
if word.bytes().any(|b| b.is_ascii_digit()) {
|
|
83
|
+
filtered_words.push(word);
|
|
84
|
+
continue;
|
|
85
|
+
}
|
|
86
|
+
|
|
87
|
+
// Extract the alphabetic core of the word for stopword matching
|
|
88
|
+
let clean_word = if word.is_ascii() {
|
|
89
|
+
let clean_bytes: Vec<u8> = word
|
|
90
|
+
.bytes()
|
|
91
|
+
.filter(|&b| b.is_ascii_alphabetic())
|
|
92
|
+
.map(|b| b.to_ascii_lowercase())
|
|
93
|
+
.collect();
|
|
94
|
+
utf8_validation::string_from_utf8(clean_bytes).unwrap_or_else(|_| {
|
|
95
|
+
word.chars()
|
|
96
|
+
.filter(|c| c.is_alphabetic())
|
|
97
|
+
.collect::<String>()
|
|
98
|
+
.to_lowercase()
|
|
99
|
+
})
|
|
100
|
+
} else {
|
|
101
|
+
word.chars()
|
|
102
|
+
.filter(|c| c.is_alphabetic())
|
|
103
|
+
.collect::<String>()
|
|
104
|
+
.to_lowercase()
|
|
105
|
+
};
|
|
106
|
+
|
|
107
|
+
// If the clean word is empty (word was all punctuation), preserve it
|
|
108
|
+
if clean_word.is_empty() {
|
|
109
|
+
filtered_words.push(word);
|
|
110
|
+
continue;
|
|
111
|
+
}
|
|
112
|
+
|
|
113
|
+
// Preserve single-letter words
|
|
114
|
+
if clean_word.len() <= 1 {
|
|
115
|
+
filtered_words.push(word);
|
|
116
|
+
continue;
|
|
117
|
+
}
|
|
118
|
+
|
|
119
|
+
// Check if the clean word is a stopword
|
|
120
|
+
if !stopwords.contains(&clean_word) {
|
|
121
|
+
filtered_words.push(word);
|
|
122
|
+
}
|
|
123
|
+
}
|
|
124
|
+
|
|
125
|
+
filtered_words.join(" ")
|
|
126
|
+
}
|
|
127
|
+
|
|
128
|
+
/// Checks if a word should be preserved based on configured patterns.
|
|
129
|
+
///
|
|
130
|
+
/// # Arguments
|
|
131
|
+
/// * `word` - The word to check
|
|
132
|
+
/// * `preserve_patterns` - Regex patterns for words that should be preserved
|
|
133
|
+
///
|
|
134
|
+
/// # Returns
|
|
135
|
+
/// `true` if the word matches any preserve pattern, `false` otherwise
|
|
136
|
+
#[inline]
|
|
137
|
+
pub fn should_preserve_word(word: &str, preserve_patterns: &[Regex]) -> bool {
|
|
138
|
+
preserve_patterns.iter().any(|pattern| pattern.is_match(word))
|
|
139
|
+
}
|
|
140
|
+
|
|
141
|
+
/// Breaks a word into leading non-alphanumeric characters, an alphanumeric-bounded
/// core, and trailing non-alphanumeric characters.
///
/// The core spans from the first alphanumeric character to the last one, so any
/// punctuation between them stays inside the core. Handy for punctuation-wrapped
/// words such as "(hello)" or "world!". Only compiled for tests at the moment.
///
/// # Arguments
/// * `word` - The word to split
///
/// # Returns
/// A tuple of (prefix, core, suffix) strings. When `word` contains no alphanumeric
/// character, the whole input is returned as the prefix and the other two parts
/// are empty.
#[cfg(test)]
pub fn split_word_boundaries(word: &str) -> (String, String, String) {
    // Byte offset of the first alphanumeric character, if there is one.
    let core_start = match word.find(char::is_alphanumeric) {
        Some(offset) => offset,
        None => return (word.to_owned(), String::new(), String::new()),
    };

    // Byte offset just past the last alphanumeric character.
    let core_end = word
        .char_indices()
        .rev()
        .find(|&(_, ch)| ch.is_alphanumeric())
        .map_or(core_start, |(offset, ch)| offset + ch.len_utf8());

    let (prefix, remainder) = word.split_at(core_start);
    let (core, suffix) = remainder.split_at(core_end - core_start);

    (prefix.to_owned(), core.to_owned(), suffix.to_owned())
}
|
|
173
|
+
|
|
174
|
+
#[cfg(all(test, feature = "stopwords"))]
mod tests {
    use super::*;

    /// Builds the small English stopword set shared by the tests below.
    fn create_test_stopwords() -> AHashSet<String> {
        let mut set = AHashSet::new();
        for word in ["the", "is", "a", "and", "with", "by"] {
            set.insert(word.to_string());
        }
        set
    }

    /// Owned (prefix, core, suffix) tuple for comparing against `split_word_boundaries`.
    fn parts(prefix: &str, core: &str, suffix: &str) -> (String, String, String) {
        (prefix.to_string(), core.to_string(), suffix.to_string())
    }

    #[test]
    fn test_normalize_spaces() {
        // Runs of two, three and four spaces must each collapse to one.
        let input = "Text  with   multiple    spaces";
        let result = normalize_spaces(input);
        assert!(!result.contains("  "));
        assert_eq!(result, "Text with multiple spaces");
    }

    #[test]
    fn test_normalize_spaces_no_change() {
        let input = "Text with single spaces";
        let result = normalize_spaces(input);
        assert_eq!(result, input);
    }

    #[test]
    fn test_normalize_newlines() {
        let input = "Paragraph 1\n\n\n\n\nParagraph 2";
        let result = normalize_newlines(input);
        assert!(!result.contains("\n\n\n"));
        assert_eq!(result, "Paragraph 1\n\nParagraph 2");
    }

    #[test]
    fn test_normalize_newlines_no_change() {
        let input = "Paragraph 1\n\nParagraph 2";
        let result = normalize_newlines(input);
        assert_eq!(result, input);
    }

    #[test]
    fn test_remove_stopwords() {
        let stopwords = create_test_stopwords();

        let input = "The quick brown fox is jumping over the lazy dog";
        let result = remove_stopwords(input, &stopwords, &[]);

        // Stopwords are matched case-insensitively, so the leading "The" goes too.
        assert_eq!(result, "quick brown fox jumping over lazy dog");
    }

    #[test]
    fn test_remove_stopwords_preserves_uppercase() {
        let stopwords = create_test_stopwords();

        let input = "The API is working WITH the SDK";
        let result = remove_stopwords(input, &stopwords, &[]);

        // "WITH" is a stopword but survives because it is written in all caps.
        assert_eq!(result, "API working WITH SDK");
    }

    #[test]
    fn test_remove_stopwords_preserves_numbers() {
        let stopwords = create_test_stopwords();

        let input = "The version is 3.14 and the count is 42";
        let result = remove_stopwords(input, &stopwords, &[]);

        assert_eq!(result, "version 3.14 count 42");
    }

    #[cfg_attr(coverage, ignore = "coverage instrumentation disables SIMD stopword paths")]
    #[test]
    fn test_remove_stopwords_handles_punctuation() {
        let stopwords = create_test_stopwords();

        let input = "Hello, the world! This is great.";
        let result = remove_stopwords(input, &stopwords, &[]);

        // Attached punctuation stays on the surviving tokens.
        assert_eq!(result, "Hello, world! This great.");
    }

    #[test]
    fn test_remove_stopwords_single_letter() {
        let stopwords = create_test_stopwords();

        let input = "I a x test";
        let result = remove_stopwords(input, &stopwords, &[]);

        // Single-letter tokens are kept even when listed as stopwords ("a").
        assert_eq!(result, "I a x test");
    }

    #[test]
    fn test_preserve_patterns() {
        let stopwords = create_test_stopwords();
        let preserve_patterns = vec![
            Regex::new(r"\b[A-Z]{2,}\b").unwrap(),
            Regex::new(r"\b\d+\.\d+\.\d+\b").unwrap(),
            Regex::new(r"@\w+").unwrap(),
        ];

        let input = "The NASA and HTTP protocols version 1.2.3 by @john";
        let result = remove_stopwords(input, &stopwords, &preserve_patterns);

        for kept in ["NASA", "HTTP", "1.2.3", "@john"] {
            assert!(result.contains(kept), "expected {kept:?} to be preserved in {result:?}");
        }
        assert_eq!(result, "NASA HTTP protocols version 1.2.3 @john");
    }

    #[test]
    fn test_should_preserve_word() {
        let patterns = vec![Regex::new(r"\b[A-Z]{2,}\b").unwrap()];

        assert!(should_preserve_word("NASA", &patterns));
        assert!(should_preserve_word("HTTP", &patterns));
        assert!(!should_preserve_word("hello", &patterns));
    }

    #[test]
    fn test_split_word_boundaries() {
        assert_eq!(split_word_boundaries("(hello)"), parts("(", "hello", ")"));
        assert_eq!(split_word_boundaries("world!"), parts("", "world", "!"));
        assert_eq!(split_word_boundaries("'test"), parts("'", "test", ""));
        assert_eq!(split_word_boundaries("simple"), parts("", "simple", ""));
        assert_eq!(split_word_boundaries("\"example!!!\""), parts("\"", "example", "!!!\""));
    }

    #[test]
    fn test_split_word_boundaries_edge_cases() {
        // Without any alphanumeric character the whole input ends up in the prefix.
        assert_eq!(split_word_boundaries("!!!"), parts("!!!", "", ""));
        assert_eq!(split_word_boundaries(""), parts("", "", ""));
        assert_eq!(split_word_boundaries("a"), parts("", "a", ""));
        assert_eq!(split_word_boundaries("(café)"), parts("(", "café", ")"));
    }

    #[test]
    fn test_lazy_regex_initialization() {
        // Dereferencing forces compilation; a bad pattern would panic here.
        let _ = &*EXCESSIVE_NEWLINES_REGEX;
        let _ = &*MULTIPLE_SPACES_REGEX;
    }
}
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
use once_cell::sync::Lazy;
|
|
2
|
+
use regex::Regex;
|
|
3
|
+
|
|
4
|
+
/// Regular expression for matching HTML comments.
|
|
5
|
+
/// Matches the pattern `<!-- ... -->` for removing HTML comments from text.
|
|
6
|
+
static HTML_COMMENT_REGEX: Lazy<Regex> =
|
|
7
|
+
Lazy::new(|| Regex::new(r"<!--.*?-->").expect("HTML comment regex pattern is valid and should compile"));
|
|
8
|
+
|
|
9
|
+
/// Removes HTML comments from the input text.
|
|
10
|
+
///
|
|
11
|
+
/// This function uses a regex to strip out all HTML comment blocks (`<!-- ... -->`).
|
|
12
|
+
///
|
|
13
|
+
/// # Arguments
|
|
14
|
+
/// * `text` - The input text that may contain HTML comments
|
|
15
|
+
///
|
|
16
|
+
/// # Returns
|
|
17
|
+
/// A new `String` with all HTML comments removed
|
|
18
|
+
pub fn remove_html_comments(text: &str) -> String {
|
|
19
|
+
if HTML_COMMENT_REGEX.is_match(text) {
|
|
20
|
+
HTML_COMMENT_REGEX.replace_all(text, "").into_owned()
|
|
21
|
+
} else {
|
|
22
|
+
text.to_string()
|
|
23
|
+
}
|
|
24
|
+
}
|
|
25
|
+
|
|
26
|
+
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_remove_html_comments() {
        let cleaned = remove_html_comments("Text before <!-- comment --> text after");

        // The comment itself must be gone while the text on both sides survives.
        assert!(!cleaned.contains("<!-- comment -->"));
        for expected in ["Text before", "text after"] {
            assert!(cleaned.contains(expected));
        }
    }

    #[test]
    fn test_no_html_comments() {
        let plain = "Text without comments";
        assert_eq!(remove_html_comments(plain), plain);
    }

    #[test]
    fn test_lazy_regex_initialization() {
        // Dereferencing forces compilation; a bad pattern would panic here.
        let _ = &*HTML_COMMENT_REGEX;
    }
}
|