RubyGems - kreuzberg - Versions diffs - 4.0.8 → 4.1.0 - Mend

kreuzberg 4.0.8 → 4.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (308)

checksums.yaml +4 -4
data/Gemfile.lock +2 -2
data/README.md +1 -1
data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
data/ext/kreuzberg_rb/native/src/result.rs +326 -0
data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
data/lib/kreuzberg/config.rb +66 -0
data/lib/kreuzberg/result.rb +107 -2
data/lib/kreuzberg/types.rb +104 -0
data/lib/kreuzberg/version.rb +1 -1
data/lib/kreuzberg.rb +0 -4
data/sig/kreuzberg.rbs +105 -1
data/vendor/Cargo.toml +3 -3
data/vendor/kreuzberg/Cargo.toml +4 -3
data/vendor/kreuzberg/README.md +1 -1
data/vendor/kreuzberg/src/api/config.rs +69 -0
data/vendor/kreuzberg/src/api/handlers.rs +99 -2
data/vendor/kreuzberg/src/api/mod.rs +14 -7
data/vendor/kreuzberg/src/api/router.rs +214 -0
data/vendor/kreuzberg/src/api/startup.rs +243 -0
data/vendor/kreuzberg/src/api/types.rs +78 -0
data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
data/vendor/kreuzberg/src/cache/core.rs +428 -0
data/vendor/kreuzberg/src/cache/mod.rs +21 -843
data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
data/vendor/kreuzberg/src/chunking/config.rs +52 -0
data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
data/vendor/kreuzberg/src/core/config/page.rs +57 -0
data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
data/vendor/kreuzberg/src/core/mod.rs +4 -1
data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
data/vendor/kreuzberg/src/embeddings.rs +136 -13
data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
data/vendor/kreuzberg/src/extractors/email.rs +2 -0
data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
data/vendor/kreuzberg/src/extractors/html.rs +80 -8
data/vendor/kreuzberg/src/extractors/image.rs +8 -1
data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
data/vendor/kreuzberg/src/extractors/text.rs +4 -0
data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
data/vendor/kreuzberg/src/lib.rs +2 -2
data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
data/vendor/kreuzberg/src/mcp/format.rs +211 -0
data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
data/vendor/kreuzberg/src/mcp/params.rs +196 -0
data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
data/vendor/kreuzberg/src/text/quality.rs +1 -1
data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
data/vendor/kreuzberg/src/types/djot.rs +209 -0
data/vendor/kreuzberg/src/types/extraction.rs +301 -0
data/vendor/kreuzberg/src/types/formats.rs +443 -0
data/vendor/kreuzberg/src/types/metadata.rs +560 -0
data/vendor/kreuzberg/src/types/mod.rs +281 -0
data/vendor/kreuzberg/src/types/page.rs +182 -0
data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
data/vendor/kreuzberg/src/types/tables.rs +39 -0
data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
data/vendor/kreuzberg/tests/api_embed.rs +6 -9
data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
data/vendor/kreuzberg/tests/core_integration.rs +1 -0
data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
data/vendor/kreuzberg/tests/format_integration.rs +2 -0
data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
data/vendor/kreuzberg-ffi/src/error.rs +46 -14
data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
data/vendor/kreuzberg-ffi/src/result.rs +148 -122
data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
metadata +200 -28
data/vendor/kreuzberg/src/api/server.rs +0 -518
data/vendor/kreuzberg/src/core/config.rs +0 -1914
data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
data/vendor/kreuzberg/src/types.rs +0 -1713
data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
data/vendor/kreuzberg-ffi/src/config.rs +0 -1341

data/vendor/kreuzberg/src/stopwords/languages/germanic.rs ADDED

@@ -0,0 +1,36 @@
+//! Germanic language stopwords.
+//!
+//! Includes: English (en), German (de), Dutch (nl), Swedish (sv),
+//! Norwegian (no), Danish (da), Afrikaans (af).
+use ahash::{AHashMap, AHashSet};
+/// Macro to generate embedded stopwords for Germanic languages.
+macro_rules! embed_stopwords {
+    ($map:expr, $($lang:literal),* $(,)?) => {
+        $(
+            {
+                const JSON: &str = include_str!(concat!("../../../stopwords/", $lang, "_stopwords.json"));
+                match serde_json::from_str::<Vec<String>>(JSON) {
+                    Ok(words) => {
+                        let set: AHashSet<String> = words.into_iter().collect();
+                        $map.insert($lang.to_string(), set);
+                    }
+                    Err(e) => {
+                        panic!(
+                            "Failed to parse embedded stopwords for language '{}': {}. \
+                            This indicates corrupted or malformed JSON in the embedded stopwords data. \
+                            Please report this issue at https://github.com/kreuzberg-dev/kreuzberg/issues",
+                            $lang, e
+                        );
+                    }
+                }
+            }
+        )*
+    };
+}
+/// Load Germanic language stopwords into the provided map.
+pub(in crate::stopwords) fn load_stopwords(map: &mut AHashMap<String, AHashSet<String>>) {
+    embed_stopwords!(map, "en", "de", "nl", "sv", "no", "da", "af");
+}

data/vendor/kreuzberg/src/stopwords/languages/mod.rs ADDED

@@ -0,0 +1,10 @@
+//! Language family modules for stopword loading.
+//!
+//! Stopwords are organized by language family for easier maintenance
+//! and navigation. Each module handles loading stopwords for related languages.
+pub(super) mod asian;
+pub(super) mod germanic;
+pub(super) mod other;
+pub(super) mod romance;
+pub(super) mod slavic;

data/vendor/kreuzberg/src/stopwords/languages/other.rs ADDED

@@ -0,0 +1,44 @@
+//! Other language stopwords.
+//!
+//! Includes: Arabic (ar), Hebrew (he), Turkish (tr), Persian (fa),
+//! Kurdish (ku), Armenian (hy), Estonian (et), Basque (eu),
+//! Breton (br), Esperanto (eo), Finnish (fi), Irish (ga),
+//! Hungarian (hu), Indonesian (id), Latin (la), Lithuanian (lt),
+//! Latvian (lv), Malay (ms), Tagalog (tl), Greek (el),
+//! Hausa (ha), Swahili (sw), Yoruba (yo), Zulu (zu),
+//! Somali (so), Sesotho (st).
+use ahash::{AHashMap, AHashSet};
+/// Macro to generate embedded stopwords for other languages.
+macro_rules! embed_stopwords {
+    ($map:expr, $($lang:literal),* $(,)?) => {
+        $(
+            {
+                const JSON: &str = include_str!(concat!("../../../stopwords/", $lang, "_stopwords.json"));
+                match serde_json::from_str::<Vec<String>>(JSON) {
+                    Ok(words) => {
+                        let set: AHashSet<String> = words.into_iter().collect();
+                        $map.insert($lang.to_string(), set);
+                    }
+                    Err(e) => {
+                        panic!(
+                            "Failed to parse embedded stopwords for language '{}': {}. \
+                            This indicates corrupted or malformed JSON in the embedded stopwords data. \
+                            Please report this issue at https://github.com/kreuzberg-dev/kreuzberg/issues",
+                            $lang, e
+                        );
+                    }
+                }
+            }
+        )*
+    };
+}
+/// Load other language stopwords into the provided map.
+pub(in crate::stopwords) fn load_stopwords(map: &mut AHashMap<String, AHashSet<String>>) {
+    embed_stopwords!(
+        map, "ar", "he", "tr", "fa", "ku", "hy", "et", "eu", "br", "eo", "fi", "ga", "hu", "id", "la", "lt", "lv",
+        "ms", "tl", "el", "ha", "sw", "yo", "zu", "so", "st"
+    );
+}

data/vendor/kreuzberg/src/stopwords/languages/romance.rs ADDED

@@ -0,0 +1,36 @@
+//! Romance language stopwords.
+//!
+//! Includes: French (fr), Spanish (es), Italian (it), Portuguese (pt),
+//! Romanian (ro), Catalan (ca), Galician (gl).
+use ahash::{AHashMap, AHashSet};
+/// Macro to generate embedded stopwords for Romance languages.
+macro_rules! embed_stopwords {
+    ($map:expr, $($lang:literal),* $(,)?) => {
+        $(
+            {
+                const JSON: &str = include_str!(concat!("../../../stopwords/", $lang, "_stopwords.json"));
+                match serde_json::from_str::<Vec<String>>(JSON) {
+                    Ok(words) => {
+                        let set: AHashSet<String> = words.into_iter().collect();
+                        $map.insert($lang.to_string(), set);
+                    }
+                    Err(e) => {
+                        panic!(
+                            "Failed to parse embedded stopwords for language '{}': {}. \
+                            This indicates corrupted or malformed JSON in the embedded stopwords data. \
+                            Please report this issue at https://github.com/kreuzberg-dev/kreuzberg/issues",
+                            $lang, e
+                        );
+                    }
+                }
+            }
+        )*
+    };
+}
+/// Load Romance language stopwords into the provided map.
+pub(in crate::stopwords) fn load_stopwords(map: &mut AHashMap<String, AHashSet<String>>) {
+    embed_stopwords!(map, "fr", "es", "it", "pt", "ro", "ca", "gl");
+}

data/vendor/kreuzberg/src/stopwords/languages/slavic.rs ADDED

@@ -0,0 +1,36 @@
+//! Slavic language stopwords.
+//!
+//! Includes: Russian (ru), Polish (pl), Czech (cs), Ukrainian (uk),
+//! Bulgarian (bg), Slovak (sk), Croatian (hr), Slovenian (sl).
+use ahash::{AHashMap, AHashSet};
+/// Macro to generate embedded stopwords for Slavic languages.
+macro_rules! embed_stopwords {
+    ($map:expr, $($lang:literal),* $(,)?) => {
+        $(
+            {
+                const JSON: &str = include_str!(concat!("../../../stopwords/", $lang, "_stopwords.json"));
+                match serde_json::from_str::<Vec<String>>(JSON) {
+                    Ok(words) => {
+                        let set: AHashSet<String> = words.into_iter().collect();
+                        $map.insert($lang.to_string(), set);
+                    }
+                    Err(e) => {
+                        panic!(
+                            "Failed to parse embedded stopwords for language '{}': {}. \
+                            This indicates corrupted or malformed JSON in the embedded stopwords data. \
+                            Please report this issue at https://github.com/kreuzberg-dev/kreuzberg/issues",
+                            $lang, e
+                        );
+                    }
+                }
+            }
+        )*
+    };
+}
+/// Load Slavic language stopwords into the provided map.
+pub(in crate::stopwords) fn load_stopwords(map: &mut AHashMap<String, AHashSet<String>>) {
+    embed_stopwords!(map, "ru", "pl", "cs", "uk", "bg", "sk", "hr", "sl");
+}

data/vendor/kreuzberg/src/stopwords/mod.rs CHANGED

@@ -82,33 +82,7 @@
 use ahash::{AHashMap, AHashSet};
 use once_cell::sync::Lazy;
-/// Macro to generate embedded stopwords for all languages.
-///
-/// This macro embeds the JSON files at compile time using `include_str!()` and
-/// generates code to parse and insert them into the stopwords map.
-macro_rules! embed_stopwords {
-    ($map:expr, $($lang:literal),* $(,)?) => {
-        $(
-            {
-                const JSON: &str = include_str!(concat!("../../stopwords/", $lang, "_stopwords.json"));
-                match serde_json::from_str::<Vec<String>>(JSON) {
-                    Ok(words) => {
-                        let set: AHashSet<String> = words.into_iter().collect();
-                        $map.insert($lang.to_string(), set);
-                    }
-                    Err(e) => {
-                        panic!(
-                            "Failed to parse embedded stopwords for language '{}': {}. \
-                            This indicates corrupted or malformed JSON in the embedded stopwords data. \
-                            Please report this issue at https://github.com/kreuzberg-dev/kreuzberg/issues",
-                            $lang, e
-                        );
-                    }
-                }
-            }
-        )*
-    };
-}
+mod languages;
 /// Global stopwords registry.
 ///
@@ -146,12 +120,12 @@ macro_rules! embed_stopwords {
 pub static STOPWORDS: Lazy<AHashMap<String, AHashSet<String>>> = Lazy::new(|| {
     let mut map = AHashMap::new();
-    embed_stopwords!(
-        map, "af", "ar", "bg", "bn", "br", "ca", "cs", "da", "de", "el", "en", "eo", "es", "et", "eu", "fa", "fi",
-        "fr", "ga", "gl", "gu", "ha", "he", "hi", "hr", "hu", "hy", "id", "it", "ja", "kn", "ko", "ku", "la", "lt",
-        "lv", "ml", "mr", "ms", "ne", "nl", "no", "pl", "pt", "ro", "ru", "si", "sk", "sl", "so", "st", "sv", "sw",
-        "ta", "te", "th", "tl", "tr", "uk", "ur", "vi", "yo", "zh", "zu",
-    );
+    // Load stopwords by language family
+    languages::germanic::load_stopwords(&mut map);
+    languages::romance::load_stopwords(&mut map);
+    languages::slavic::load_stopwords(&mut map);
+    languages::asian::load_stopwords(&mut map);
+    languages::other::load_stopwords(&mut map);
     apply_stopword_whitelist(&mut map);

data/vendor/kreuzberg/src/text/quality.rs CHANGED

@@ -677,7 +677,7 @@ mod tests {
     fn test_normalize_whitespace_cow_no_changes() {
         let text = Cow::Borrowed("normaltext");
         let result = normalize_whitespace_cow(text);
-        assert_eq!(result.as_ref(), "normaltext");
+        assert_eq!(&*result, "normaltext");
     }
     #[test]

data/vendor/kreuzberg/src/text/quality_processor.rs CHANGED

@@ -123,6 +123,8 @@ mod tests {
 	            chunks: None,
 	            images: None,
 	            pages: None,
+	            elements: None,
+	            djot_content: None,
 	        };
         processor.process(&mut result, &config).await.unwrap();
@@ -148,7 +150,9 @@ mod tests {
             detected_languages: None,
             chunks: None,
             images: None,
+            djot_content: None,
             pages: None,
+            elements: None,
         };
         processor.process(&mut result, &config).await.unwrap();
@@ -181,7 +185,9 @@ mod tests {
             detected_languages: None,
             chunks: None,
             images: None,
+            djot_content: None,
             pages: None,
+            elements: None,
         };
         let config_with_quality = ExtractionConfig {
@@ -209,7 +215,9 @@ mod tests {
             detected_languages: None,
             chunks: None,
             images: None,
+            djot_content: None,
             pages: None,
+            elements: None,
         };
         let long_result = ExtractionResult {
@@ -220,7 +228,9 @@ mod tests {
             detected_languages: None,
             chunks: None,
             images: None,
+            djot_content: None,
             pages: None,
+            elements: None,
         };
         let short_duration = processor.estimated_duration_ms(&short_result);

data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs ADDED

@@ -0,0 +1,238 @@
+use ahash::AHashMap;
+/// Bonus added for sentences at the beginning or end of the document
+/// (sentence position `0` or the last sentence index).
+const SENTENCE_EDGE_POSITION_BONUS: f32 = 0.3;
+/// Bonus added for sentences with ideal word count (neither too short nor too long),
+/// i.e. a whitespace-separated word count inside the inclusive range
+/// `MIN_IDEAL_WORD_COUNT..=MAX_IDEAL_WORD_COUNT`.
+const IDEAL_WORD_COUNT_BONUS: f32 = 0.2;
+/// Minimum word count for ideal sentence length (inclusive lower bound)
+const MIN_IDEAL_WORD_COUNT: usize = 3;
+/// Maximum word count for ideal sentence length (inclusive upper bound)
+const MAX_IDEAL_WORD_COUNT: usize = 25;
+/// Weight multiplier for numeric content density in sentences
+/// (fraction of words containing at least one numeric character).
+const NUMERIC_CONTENT_WEIGHT: f32 = 0.3;
+/// Weight multiplier for capitalized/acronym word density in sentences
+/// (fraction of words that consist solely of uppercase characters).
+const CAPS_ACRONYM_WEIGHT: f32 = 0.25;
+/// Weight multiplier for long word density in sentences
+const LONG_WORD_WEIGHT: f32 = 0.2;
+/// Minimum character length for a word to be considered "long".
+/// The comparison is strict (`word.len() > LONG_WORD_THRESHOLD`) and uses `str::len`,
+/// which counts UTF-8 bytes rather than characters.
+const LONG_WORD_THRESHOLD: usize = 8;
+/// Weight multiplier for punctuation density in sentences
+/// (ASCII punctuation characters divided by the sentence's byte length).
+const PUNCTUATION_DENSITY_WEIGHT: f32 = 0.15;
+/// Weight multiplier for word diversity ratio (unique words / total words)
+const DIVERSITY_RATIO_WEIGHT: f32 = 0.15;
+/// Weight multiplier for character entropy (measure of text randomness/information)
+const CHAR_ENTROPY_WEIGHT: f32 = 0.1;
+/// Analyzes text characteristics and scores content importance.
+///
+/// Stateless: every operation is an associated function, so no instance is required.
+pub struct TextAnalyzer;
+impl TextAnalyzer {
+    /// Scores the importance of a sentence based on various characteristics.
+    ///
+    /// The score is a sum of weighted signals: an edge-position bonus, an ideal-length bonus,
+    /// the share of words containing numeric characters, all-uppercase words and long words,
+    /// punctuation density, word diversity and character entropy.
+    ///
+    /// # Arguments
+    ///
+    /// * `sentence` - The sentence text; words are obtained by splitting on whitespace.
+    /// * `position` - Zero-based index of the sentence within its document.
+    /// * `total_sentences` - Number of sentences in the document. `0` is accepted.
+    ///
+    /// # Returns
+    ///
+    /// The accumulated score. A sentence without any words only receives the edge-position
+    /// bonus (when it applies).
+    pub fn score_sentence_importance(sentence: &str, position: usize, total_sentences: usize) -> f32 {
+        let mut score = 0.0;
+        // `checked_sub` avoids the underflow of `total_sentences - 1` for an empty document
+        // (a panic in debug builds, a wrap to `usize::MAX` in release builds).
+        let is_last = total_sentences.checked_sub(1) == Some(position);
+        if position == 0 || is_last {
+            score += SENTENCE_EDGE_POSITION_BONUS;
+        }
+        let words: Vec<&str> = sentence.split_whitespace().collect();
+        if words.is_empty() {
+            return score;
+        }
+        let word_total = words.len() as f32;
+        if (MIN_IDEAL_WORD_COUNT..=MAX_IDEAL_WORD_COUNT).contains(&words.len()) {
+            score += IDEAL_WORD_COUNT_BONUS;
+        }
+        let mut numeric_count = 0usize;
+        let mut caps_count = 0usize;
+        let mut long_word_count = 0usize;
+        let mut punct_count = 0usize;
+        for word in &words {
+            if word.chars().any(|c| c.is_numeric()) {
+                numeric_count += 1;
+            }
+            if word.len() > 1 && word.chars().all(|c| c.is_uppercase()) {
+                caps_count += 1;
+            }
+            // NOTE(review): `len()` is a byte length, so short non-ASCII words can count as long.
+            if word.len() > LONG_WORD_THRESHOLD {
+                long_word_count += 1;
+            }
+            punct_count += word.chars().filter(|c| c.is_ascii_punctuation()).count();
+        }
+        score += (numeric_count as f32 / word_total) * NUMERIC_CONTENT_WEIGHT;
+        score += (caps_count as f32 / word_total) * CAPS_ACRONYM_WEIGHT;
+        score += (long_word_count as f32 / word_total) * LONG_WORD_WEIGHT;
+        // `sentence` is non-empty here because it produced at least one word.
+        score += (punct_count as f32 / sentence.len() as f32) * PUNCTUATION_DENSITY_WEIGHT;
+        // The distinct-word count is deliberately capped at ceil(60% of the words): counting stops
+        // as soon as the cap is reached, so the diversity ratio never exceeds roughly 0.6.
+        let unique_cap = (words.len() as f32 * 0.6).ceil() as usize;
+        let mut unique_words: ahash::AHashSet<String> = ahash::AHashSet::with_capacity(unique_cap.max(10));
+        for w in &words {
+            // Words are compared by their lowercased alphabetic characters only; a word without
+            // any alphabetic character contributes the empty string.
+            let clean = w
+                .chars()
+                .filter(|c| c.is_alphabetic())
+                .collect::<String>()
+                .to_lowercase();
+            unique_words.insert(clean);
+            if unique_words.len() >= unique_cap {
+                break;
+            }
+        }
+        // If the loop finished without hitting the cap, every word has already been inserted,
+        // so the set size is the final count in both cases.
+        let diversity_ratio = unique_words.len() as f32 / word_total;
+        score += diversity_ratio * DIVERSITY_RATIO_WEIGHT;
+        let char_entropy = Self::calculate_char_entropy(sentence);
+        score += char_entropy * CHAR_ENTROPY_WEIGHT;
+        score
+    }
+    /// Calculates character entropy (measure of text randomness/information content).
+    ///
+    /// Characters are lowercased before counting (only the first character of a multi-character
+    /// lowercase mapping is used). The result is the sum of `-p * log2(p)` over the relative
+    /// frequency `p` of each distinct character, capped at `5.0`.
+    ///
+    /// # Returns
+    ///
+    /// `0.0` for empty text, otherwise a value in `0.0..=5.0`.
+    pub fn calculate_char_entropy(text: &str) -> f32 {
+        let total = text.chars().count();
+        if total == 0 {
+            return 0.0;
+        }
+        // Capacity hint only: assume about 10% distinct characters, but at least 26 slots.
+        let estimated_unique = (total as f32 * 0.1).ceil() as usize;
+        let mut char_freq: AHashMap<char, u32> = AHashMap::with_capacity(estimated_unique.max(26));
+        for ch in text.chars() {
+            let lowercase_ch = ch
+                .to_lowercase()
+                .next()
+                .expect("to_lowercase() must yield at least one character for valid Unicode");
+            *char_freq.entry(lowercase_ch).or_insert(0) += 1;
+        }
+        let total_chars = total as f32;
+        char_freq
+            .values()
+            .map(|&freq| {
+                let p = freq as f32 / total_chars;
+                if p > 0.0 { -p * p.log2() } else { 0.0 }
+            })
+            .sum::<f32>()
+            .min(5.0)
+    }
+    /// Checks if a word has important characteristics that should be preserved.
+    ///
+    /// Returns `true` when any of the following holds, checked in this order:
+    /// the word is longer than one byte and entirely uppercase; it contains a numeric
+    /// character; it is longer than 10 bytes; it contains more than one uppercase character
+    /// (mixed case such as `PyTorch`); or [`Self::has_cjk_importance`] accepts it.
+    pub fn has_important_characteristics(word: &str) -> bool {
+        if word.len() > 1 && word.chars().all(|c| c.is_uppercase()) {
+            return true;
+        }
+        if word.chars().any(|c| c.is_numeric()) {
+            return true;
+        }
+        // NOTE(review): byte length, so e.g. a four-character CJK word (12 bytes) passes - confirm intent.
+        if word.len() > 10 {
+            return true;
+        }
+        let uppercase_count = word.chars().filter(|c| c.is_uppercase()).count();
+        if uppercase_count > 1 && uppercase_count < word.len() {
+            return true;
+        }
+        Self::has_cjk_importance(word)
+    }
+    /// Checks if a CJK word has important characteristics.
+    ///
+    /// A word qualifies only if it contains at least one character in `U+4E00..=U+9FFF`.
+    /// It is then considered important when it contains any character from a fixed list of
+    /// characters, or when it is exactly two characters long and one of them falls in
+    /// `U+4E00..=U+51FF`, `U+6700..=U+68FF` or `U+7500..=U+76FF`.
+    pub fn has_cjk_importance(word: &str) -> bool {
+        let chars: Vec<char> = word.chars().collect();
+        let has_cjk = chars.iter().any(|c| ('\u{4E00}'..='\u{9FFF}').contains(c));
+        if !has_cjk {
+            return false;
+        }
+        // The list contains a few repeated entries; they do not affect the membership test.
+        let important_radicals = [
+            '学', '智', '能', '技', '术', '法', '算', '理', '科', '研', '究', '发', '展', '系', '统', '模', '型', '方',
+            '式', '过', '程', '结', '构', '功', '效', '应', '分', '析', '计', '算', '数', '据', '信', '息', '处', '理',
+            '语', '言', '文', '生', '成', '产', '用', '作', '为', '成', '变', '化', '转', '换', '提', '高', '网', '络',
+            '神', '经', '机', '器', '人', '工', '智', '能', '自', '然', '复',
+        ];
+        if chars.iter().any(|ch| important_radicals.contains(ch)) {
+            return true;
+        }
+        // Two-character words get a second chance based on code-point ranges.
+        chars.len() == 2
+            && chars
+                .iter()
+                .any(|&c| matches!(c as u32, 0x4E00..=0x51FF | 0x6700..=0x68FF | 0x7500..=0x76FF))
+    }
+}
+#[cfg(test)]
+mod tests {
+    use super::*;
+    /// A single repeated character must score below 1.0 and below a string of distinct characters.
+    #[test]
+    fn test_calculate_char_entropy() {
+        let uniform = TextAnalyzer::calculate_char_entropy("aaaaaaa");
+        let varied = TextAnalyzer::calculate_char_entropy("abcdefg123");
+        assert!(uniform < 1.0);
+        assert!(varied > uniform);
+    }
+    /// Acronyms, words with digits, mixed-case names and long words are all flagged as important.
+    #[test]
+    fn test_important_word_characteristics() {
+        let samples = ["IMPORTANT", "COVID-19", "PyTorch", "verylongword123"];
+        for word in samples {
+            assert!(TextAnalyzer::has_important_characteristics(word));
+        }
+    }
+}

data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs ADDED Viewed

@@ -0,0 +1,8 @@
+mod analysis;
+mod punctuation;
+mod reducer;
+mod sentence_selection;
+mod word_filtering;
+// Re-export the main public interface
+pub use reducer::TokenReducer;

data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs ADDED Viewed

@@ -0,0 +1,54 @@
+use once_cell::sync::Lazy;
+use regex::Regex;
+use std::borrow::Cow;
+/// Matches a run of two or more consecutive `!` characters. Built on first use via `Lazy`.
+static REPEATED_EXCLAMATION: Lazy<Regex> =
+    Lazy::new(|| Regex::new(r"[!]{2,}").expect("Repeated exclamation regex pattern is valid and should compile"));
+/// Matches a run of two or more consecutive `?` characters. Built on first use via `Lazy`.
+static REPEATED_QUESTION: Lazy<Regex> =
+    Lazy::new(|| Regex::new(r"[?]{2,}").expect("Repeated question regex pattern is valid and should compile"));
+/// Matches a run of two or more consecutive `,` characters. Built on first use via `Lazy`.
+static REPEATED_COMMA: Lazy<Regex> =
+    Lazy::new(|| Regex::new(r"[,]{2,}").expect("Repeated comma regex pattern is valid and should compile"));
+/// Normalizes runs of repeated punctuation.
+pub struct PunctuationCleaner;
+impl PunctuationCleaner {
+    /// Collapses repeated `!`, `?` and `,` runs (two or more in a row) into a single character.
+    ///
+    /// The rules are applied in that order. The input is only copied into a new buffer by a
+    /// rule that actually matches; the final result is always returned as an owned `String`.
+    pub fn clean_punctuation_optimized(text: &str) -> String {
+        let collapse_rules: [(&Regex, &str); 3] = [
+            (&*REPEATED_EXCLAMATION, "!"),
+            (&*REPEATED_QUESTION, "?"),
+            (&*REPEATED_COMMA, ","),
+        ];
+        let mut cleaned: Cow<'_, str> = Cow::Borrowed(text);
+        for (pattern, single) in collapse_rules {
+            // Skip the replacement entirely when the pattern does not occur.
+            if pattern.is_match(&cleaned) {
+                cleaned = Cow::Owned(pattern.replace_all(&cleaned, single).into_owned());
+            }
+        }
+        cleaned.into_owned()
+    }
+}
+#[cfg(test)]
+mod tests {
+    use super::*;
+    /// Long runs of `!`, `?` and `,` must no longer appear in the cleaned text.
+    #[test]
+    fn test_punctuation_normalization() {
+        let cleaned =
+            PunctuationCleaner::clean_punctuation_optimized("Text!!!!!! with????? excessive,,,,,, punctuation");
+        for run in ["!!!!!!", "?????", ",,,,,,"] {
+            assert!(!cleaned.contains(run));
+        }
+    }
+    /// Text without repeated punctuation passes through unchanged.
+    #[test]
+    fn test_punctuation_no_change() {
+        let original = "Text with normal punctuation!";
+        assert_eq!(PunctuationCleaner::clean_punctuation_optimized(original), original);
+    }
+}