kreuzberg 4.0.8 → 4.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
- data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
- data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
- data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
- data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
- data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
- data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
- data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
- data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
- data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
- data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
- data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
- data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
- data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
- data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
- data/ext/kreuzberg_rb/native/src/result.rs +326 -0
- data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
- data/lib/kreuzberg/config.rb +66 -0
- data/lib/kreuzberg/result.rb +107 -2
- data/lib/kreuzberg/types.rb +104 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +0 -4
- data/sig/kreuzberg.rbs +105 -1
- data/vendor/Cargo.toml +3 -3
- data/vendor/kreuzberg/Cargo.toml +4 -3
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/api/config.rs +69 -0
- data/vendor/kreuzberg/src/api/handlers.rs +99 -2
- data/vendor/kreuzberg/src/api/mod.rs +14 -7
- data/vendor/kreuzberg/src/api/router.rs +214 -0
- data/vendor/kreuzberg/src/api/startup.rs +243 -0
- data/vendor/kreuzberg/src/api/types.rs +78 -0
- data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
- data/vendor/kreuzberg/src/cache/core.rs +428 -0
- data/vendor/kreuzberg/src/cache/mod.rs +21 -843
- data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
- data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
- data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
- data/vendor/kreuzberg/src/chunking/config.rs +52 -0
- data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
- data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
- data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
- data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
- data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
- data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
- data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
- data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
- data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
- data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
- data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
- data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
- data/vendor/kreuzberg/src/core/config/page.rs +57 -0
- data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
- data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
- data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
- data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
- data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
- data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
- data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
- data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
- data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
- data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
- data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
- data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
- data/vendor/kreuzberg/src/core/mod.rs +4 -1
- data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
- data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
- data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
- data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
- data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
- data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
- data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
- data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
- data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
- data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
- data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
- data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
- data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
- data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
- data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
- data/vendor/kreuzberg/src/embeddings.rs +136 -13
- data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
- data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
- data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
- data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
- data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
- data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
- data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
- data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
- data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
- data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
- data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
- data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
- data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
- data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
- data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
- data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
- data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
- data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
- data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
- data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
- data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
- data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
- data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
- data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
- data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
- data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
- data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
- data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
- data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
- data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
- data/vendor/kreuzberg/src/extractors/email.rs +2 -0
- data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
- data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
- data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
- data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
- data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
- data/vendor/kreuzberg/src/extractors/html.rs +80 -8
- data/vendor/kreuzberg/src/extractors/image.rs +8 -1
- data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
- data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
- data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
- data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
- data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
- data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
- data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
- data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
- data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
- data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
- data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
- data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
- data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
- data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
- data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
- data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
- data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
- data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
- data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
- data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
- data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
- data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
- data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
- data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
- data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
- data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
- data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
- data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
- data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
- data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
- data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
- data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
- data/vendor/kreuzberg/src/extractors/text.rs +4 -0
- data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
- data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
- data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
- data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
- data/vendor/kreuzberg/src/lib.rs +2 -2
- data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
- data/vendor/kreuzberg/src/mcp/format.rs +211 -0
- data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
- data/vendor/kreuzberg/src/mcp/params.rs +196 -0
- data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
- data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
- data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
- data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
- data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
- data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
- data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
- data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
- data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
- data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
- data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
- data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
- data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
- data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
- data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
- data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
- data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
- data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
- data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
- data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
- data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
- data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
- data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
- data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
- data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
- data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
- data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
- data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
- data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
- data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
- data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
- data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
- data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
- data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
- data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
- data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
- data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
- data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
- data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
- data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
- data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
- data/vendor/kreuzberg/src/text/quality.rs +1 -1
- data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
- data/vendor/kreuzberg/src/types/djot.rs +209 -0
- data/vendor/kreuzberg/src/types/extraction.rs +301 -0
- data/vendor/kreuzberg/src/types/formats.rs +443 -0
- data/vendor/kreuzberg/src/types/metadata.rs +560 -0
- data/vendor/kreuzberg/src/types/mod.rs +281 -0
- data/vendor/kreuzberg/src/types/page.rs +182 -0
- data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
- data/vendor/kreuzberg/src/types/tables.rs +39 -0
- data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
- data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
- data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
- data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
- data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
- data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
- data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
- data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
- data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
- data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
- data/vendor/kreuzberg/tests/api_embed.rs +6 -9
- data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
- data/vendor/kreuzberg/tests/core_integration.rs +1 -0
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
- data/vendor/kreuzberg/tests/format_integration.rs +2 -0
- data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
- data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
- data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
- data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
- data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
- data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
- data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
- data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
- data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
- data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
- data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
- data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
- data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
- data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
- data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
- data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
- data/vendor/kreuzberg-ffi/src/error.rs +46 -14
- data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
- data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
- data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
- data/vendor/kreuzberg-ffi/src/result.rs +148 -122
- data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
- data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
- metadata +200 -28
- data/vendor/kreuzberg/src/api/server.rs +0 -518
- data/vendor/kreuzberg/src/core/config.rs +0 -1914
- data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
- data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
- data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
- data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
- data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
- data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
- data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
- data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
- data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
- data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
- data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
- data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
- data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
- data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
- data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
- data/vendor/kreuzberg/src/types.rs +0 -1713
- data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
- data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
|
@@ -0,0 +1,285 @@
|
|
|
1
|
+
use ahash::AHashMap;
|
|
2
|
+
use once_cell::sync::Lazy;
|
|
3
|
+
use regex::Regex;
|
|
4
|
+
|
|
5
|
+
/// Regular expression for matching Markdown code blocks.
|
|
6
|
+
/// Matches triple-backtick code blocks: ```...```
|
|
7
|
+
static MARKDOWN_CODE_BLOCK_REGEX: Lazy<Regex> =
|
|
8
|
+
Lazy::new(|| Regex::new(r"```[\s\S]*?```").expect("Markdown code block regex pattern is valid and should compile"));
|
|
9
|
+
|
|
10
|
+
/// Regular expression for matching Markdown inline code.
|
|
11
|
+
/// Matches single-backtick inline code: `code`
|
|
12
|
+
static MARKDOWN_INLINE_CODE_REGEX: Lazy<Regex> =
|
|
13
|
+
Lazy::new(|| Regex::new(r"`[^`\n]+`").expect("Markdown inline code regex pattern is valid and should compile"));
|
|
14
|
+
|
|
15
|
+
/// Regular expression for matching Markdown headers.
|
|
16
|
+
/// Matches headers like `# Header`, `## Header`, etc.
|
|
17
|
+
static MARKDOWN_HEADERS_REGEX: Lazy<Regex> =
|
|
18
|
+
Lazy::new(|| Regex::new(r"^#{1,6}\s+").expect("Markdown headers regex pattern is valid and should compile"));
|
|
19
|
+
|
|
20
|
+
/// Regular expression for matching Markdown list items.
|
|
21
|
+
/// Matches list markers: `- `, `* `, `+ ` at the start of lines
|
|
22
|
+
static MARKDOWN_LISTS_REGEX: Lazy<Regex> =
|
|
23
|
+
Lazy::new(|| Regex::new(r"^[ \t]*[-*+]\s+").expect("Markdown lists regex pattern is valid and should compile"));
|
|
24
|
+
|
|
25
|
+
/// Extracts and preserves Markdown code blocks and inline code by replacing them with placeholders.
|
|
26
|
+
///
|
|
27
|
+
/// This function scans the input text for Markdown code blocks (``` ... ```) and inline code (` ... `),
|
|
28
|
+
/// replaces them with unique placeholders, and stores the original content in a hashmap.
|
|
29
|
+
///
|
|
30
|
+
/// # Arguments
|
|
31
|
+
/// * `text` - The input text containing Markdown code
|
|
32
|
+
/// * `preserved` - A mutable hashmap to store the preserved code blocks
|
|
33
|
+
///
|
|
34
|
+
/// # Returns
|
|
35
|
+
/// A new `String` with code blocks replaced by placeholders
|
|
36
|
+
pub fn extract_and_preserve_code(text: &str, preserved: &mut AHashMap<String, String>) -> String {
|
|
37
|
+
let mut result = text.to_string();
|
|
38
|
+
let mut code_block_id = 0;
|
|
39
|
+
let mut inline_code_id = 0;
|
|
40
|
+
|
|
41
|
+
// Extract code blocks first
|
|
42
|
+
result = MARKDOWN_CODE_BLOCK_REGEX
|
|
43
|
+
.replace_all(&result, |caps: ®ex::Captures| {
|
|
44
|
+
let code_block = caps[0].to_string();
|
|
45
|
+
let placeholder = format!("__CODEBLOCK_{}__", code_block_id);
|
|
46
|
+
code_block_id += 1;
|
|
47
|
+
preserved.insert(placeholder.clone(), code_block);
|
|
48
|
+
placeholder
|
|
49
|
+
})
|
|
50
|
+
.to_string();
|
|
51
|
+
|
|
52
|
+
// Extract inline code
|
|
53
|
+
result = MARKDOWN_INLINE_CODE_REGEX
|
|
54
|
+
.replace_all(&result, |caps: ®ex::Captures| {
|
|
55
|
+
let inline_code = caps[0].to_string();
|
|
56
|
+
let placeholder = format!("__INLINECODE_{}__", inline_code_id);
|
|
57
|
+
inline_code_id += 1;
|
|
58
|
+
preserved.insert(placeholder.clone(), inline_code);
|
|
59
|
+
placeholder
|
|
60
|
+
})
|
|
61
|
+
.to_string();
|
|
62
|
+
|
|
63
|
+
result
|
|
64
|
+
}
|
|
65
|
+
|
|
66
|
+
/// Restores preserved code blocks by replacing placeholders with their original content.
|
|
67
|
+
///
|
|
68
|
+
/// # Arguments
|
|
69
|
+
/// * `text` - The text containing placeholders
|
|
70
|
+
/// * `preserved` - The hashmap containing the original code blocks
|
|
71
|
+
///
|
|
72
|
+
/// # Returns
|
|
73
|
+
/// A new `String` with placeholders replaced by their original content
|
|
74
|
+
pub fn restore_preserved_blocks(text: &str, preserved: &AHashMap<String, String>) -> String {
|
|
75
|
+
if preserved.is_empty() {
|
|
76
|
+
return text.to_string();
|
|
77
|
+
}
|
|
78
|
+
|
|
79
|
+
let mut result = text.to_string();
|
|
80
|
+
|
|
81
|
+
for (placeholder, original_content) in preserved {
|
|
82
|
+
result = result.replace(placeholder, original_content);
|
|
83
|
+
}
|
|
84
|
+
|
|
85
|
+
result
|
|
86
|
+
}
|
|
87
|
+
|
|
88
|
+
/// Returns `text` with every line kept unchanged and re-joined with `\n`.
///
/// Headers, list items and ordinary lines are all passed through as they are,
/// so no per-line classification is needed to produce the result. The text is
/// split with [`str::lines`], which has two visible effects: `\r\n` line endings
/// come back as `\n`, and a trailing line ending is dropped.
///
/// # Arguments
/// * `text` - The input text with Markdown structure
///
/// # Returns
/// A new `String` containing the lines of `text` joined with `\n`
pub fn preserve_markdown_structure(text: &str) -> String {
    text.lines().collect::<Vec<_>>().join("\n")
}
|
|
120
|
+
|
|
121
|
+
/// Checks if a line is a Markdown header.
|
|
122
|
+
///
|
|
123
|
+
/// # Arguments
|
|
124
|
+
/// * `line` - The line to check
|
|
125
|
+
///
|
|
126
|
+
/// # Returns
|
|
127
|
+
/// `true` if the line is a Markdown header, `false` otherwise
|
|
128
|
+
#[inline]
|
|
129
|
+
pub fn is_markdown_header(line: &str) -> bool {
|
|
130
|
+
MARKDOWN_HEADERS_REGEX.is_match(line)
|
|
131
|
+
}
|
|
132
|
+
|
|
133
|
+
/// Checks if a line is a Markdown list item.
|
|
134
|
+
///
|
|
135
|
+
/// # Arguments
|
|
136
|
+
/// * `line` - The line to check
|
|
137
|
+
///
|
|
138
|
+
/// # Returns
|
|
139
|
+
/// `true` if the line is a Markdown list item, `false` otherwise
|
|
140
|
+
#[inline]
|
|
141
|
+
pub fn is_markdown_list(line: &str) -> bool {
|
|
142
|
+
MARKDOWN_LISTS_REGEX.is_match(line)
|
|
143
|
+
}
|
|
144
|
+
|
|
145
|
+
/// Reports whether `line` looks like a Markdown table row.
///
/// Surrounding whitespace is ignored; what remains must both start and end with
/// `|`. The check is purely positional, so a line consisting of a single `|`
/// also qualifies.
///
/// # Arguments
/// * `line` - A single line of text
///
/// # Returns
/// `true` when the trimmed line starts and ends with `|`, `false` otherwise
#[inline]
pub fn is_markdown_table(line: &str) -> bool {
    let row = line.trim();
    let first = row.chars().next();
    let last = row.chars().next_back();
    first == Some('|') && last == Some('|')
}
|
|
157
|
+
|
|
158
|
+
#[cfg(test)]
mod tests {
    use super::*;

    /// A fenced block is moved into the map and replaced by the first block placeholder.
    #[test]
    fn test_extract_code_block() {
        let source = "Text before\n```rust\nfn main() {}\n```\nText after";
        let mut store = AHashMap::new();

        let extracted = extract_and_preserve_code(source, &mut store);

        assert!(extracted.contains("__CODEBLOCK_0__"));
        assert_eq!(store.len(), 1);
        assert!(store.values().any(|code| code.contains("fn main()")));
    }

    /// An inline span is stored with its backticks and replaced by the first inline placeholder.
    #[test]
    fn test_extract_inline_code() {
        let source = "Use the `println!` macro";
        let mut store = AHashMap::new();

        let extracted = extract_and_preserve_code(source, &mut store);

        assert!(extracted.contains("__INLINECODE_0__"));
        assert_eq!(store.len(), 1);
        assert!(store.values().any(|code| code.as_str() == "`println!`"));
    }

    /// Blocks and inline spans are numbered independently and survive a round trip.
    #[test]
    fn test_multiple_code_blocks() {
        let source =
            "Start ```rust\nlet x = 1;\n``` middle `inline1` text ```python\nprint('hi')\n``` and `inline2` end";
        let expected_entries = [
            ("__CODEBLOCK_0__", "```rust\nlet x = 1;\n```"),
            ("__CODEBLOCK_1__", "```python\nprint('hi')\n```"),
            ("__INLINECODE_0__", "`inline1`"),
            ("__INLINECODE_1__", "`inline2`"),
        ];

        let mut store = AHashMap::new();
        let extracted = extract_and_preserve_code(source, &mut store);

        assert_eq!(store.len(), 4);
        for (placeholder, code) in expected_entries.iter() {
            assert!(store.contains_key(*placeholder));
            assert_eq!(store.get(*placeholder).map(String::as_str), Some(*code));
        }

        let round_trip = restore_preserved_blocks(&extracted, &store);
        for (_, code) in expected_entries.iter() {
            assert!(round_trip.contains(*code));
        }
        assert!(!round_trip.contains("__CODEBLOCK_"));
        assert!(!round_trip.contains("__INLINECODE_"));
    }

    /// Placeholders present in the text are swapped back for their stored content.
    #[test]
    fn test_restore_preserved_blocks() {
        let mut store = AHashMap::new();
        store.insert("__CODEBLOCK_0__".to_string(), "```code```".to_string());
        store.insert("__INLINECODE_0__".to_string(), "`inline`".to_string());

        let restored = restore_preserved_blocks("Text __CODEBLOCK_0__ and __INLINECODE_0__ here", &store);

        assert!(restored.contains("```code```"));
        assert!(restored.contains("`inline`"));
        assert!(!restored.contains("__CODEBLOCK_0__"));
        assert!(!restored.contains("__INLINECODE_0__"));
    }

    /// The round trip reproduces the input regardless of map iteration order.
    #[test]
    fn test_hashmap_order_independence() {
        let source = "Text `a` and `b` and `c` here";
        let mut store = AHashMap::new();

        let extracted = extract_and_preserve_code(source, &mut store);
        assert_eq!(store.len(), 3);

        let round_trip = restore_preserved_blocks(&extracted, &store);
        for span in ["`a`", "`b`", "`c`"].iter() {
            assert!(round_trip.contains(*span));
        }
        assert_eq!(round_trip, "Text `a` and `b` and `c` here");
    }

    /// Header lines come through structure preservation untouched.
    #[test]
    fn test_preserve_markdown_structure() {
        let output = preserve_markdown_structure("# Header 1\n## Header 2\n### Header 3\nRegular text");

        for header in ["# Header 1", "## Header 2", "### Header 3"].iter() {
            assert!(output.contains(*header));
        }
    }

    /// Only lines starting with a `#` marker count as headers.
    #[test]
    fn test_is_markdown_header() {
        for header in ["# Header 1", "## Header 2", "### Header 3"].iter() {
            assert!(is_markdown_header(header));
        }
        for other in ["Regular text", "- List item"].iter() {
            assert!(!is_markdown_header(other));
        }
    }

    /// All three bullet markers are recognised, with or without indentation.
    #[test]
    fn test_is_markdown_list() {
        for item in ["- Item 1", "* Item 2", "+ Item 3", " - Indented item"].iter() {
            assert!(is_markdown_list(item));
        }
        for other in ["Regular text", "# Header"].iter() {
            assert!(!is_markdown_list(other));
        }
    }

    /// Rows delimited by pipes on both sides count as table rows.
    #[test]
    fn test_is_markdown_table() {
        for row in ["| Header 1 | Header 2 |", "|----------|----------|", "| Cell 1 | Cell 2 |"].iter() {
            assert!(is_markdown_table(row));
        }
        for other in ["Regular text", "- List item"].iter() {
            assert!(!is_markdown_table(other));
        }
    }

    /// Dereferencing each static forces compilation, so an invalid pattern would panic here.
    #[test]
    fn test_lazy_regex_initialization() {
        let _patterns: [&Regex; 4] = [
            &*MARKDOWN_CODE_BLOCK_REGEX,
            &*MARKDOWN_INLINE_CODE_REGEX,
            &*MARKDOWN_HEADERS_REGEX,
            &*MARKDOWN_LISTS_REGEX,
        ];
    }
}
|