kreuzberg 4.0.8 → 4.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
- data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
- data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
- data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
- data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
- data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
- data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
- data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
- data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
- data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
- data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
- data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
- data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
- data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
- data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
- data/ext/kreuzberg_rb/native/src/result.rs +326 -0
- data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
- data/lib/kreuzberg/config.rb +66 -0
- data/lib/kreuzberg/result.rb +107 -2
- data/lib/kreuzberg/types.rb +104 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +0 -4
- data/sig/kreuzberg.rbs +105 -1
- data/vendor/Cargo.toml +3 -3
- data/vendor/kreuzberg/Cargo.toml +4 -3
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/api/config.rs +69 -0
- data/vendor/kreuzberg/src/api/handlers.rs +99 -2
- data/vendor/kreuzberg/src/api/mod.rs +14 -7
- data/vendor/kreuzberg/src/api/router.rs +214 -0
- data/vendor/kreuzberg/src/api/startup.rs +243 -0
- data/vendor/kreuzberg/src/api/types.rs +78 -0
- data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
- data/vendor/kreuzberg/src/cache/core.rs +428 -0
- data/vendor/kreuzberg/src/cache/mod.rs +21 -843
- data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
- data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
- data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
- data/vendor/kreuzberg/src/chunking/config.rs +52 -0
- data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
- data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
- data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
- data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
- data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
- data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
- data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
- data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
- data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
- data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
- data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
- data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
- data/vendor/kreuzberg/src/core/config/page.rs +57 -0
- data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
- data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
- data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
- data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
- data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
- data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
- data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
- data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
- data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
- data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
- data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
- data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
- data/vendor/kreuzberg/src/core/mod.rs +4 -1
- data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
- data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
- data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
- data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
- data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
- data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
- data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
- data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
- data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
- data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
- data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
- data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
- data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
- data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
- data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
- data/vendor/kreuzberg/src/embeddings.rs +136 -13
- data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
- data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
- data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
- data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
- data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
- data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
- data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
- data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
- data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
- data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
- data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
- data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
- data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
- data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
- data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
- data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
- data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
- data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
- data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
- data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
- data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
- data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
- data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
- data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
- data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
- data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
- data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
- data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
- data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
- data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
- data/vendor/kreuzberg/src/extractors/email.rs +2 -0
- data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
- data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
- data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
- data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
- data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
- data/vendor/kreuzberg/src/extractors/html.rs +80 -8
- data/vendor/kreuzberg/src/extractors/image.rs +8 -1
- data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
- data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
- data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
- data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
- data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
- data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
- data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
- data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
- data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
- data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
- data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
- data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
- data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
- data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
- data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
- data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
- data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
- data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
- data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
- data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
- data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
- data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
- data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
- data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
- data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
- data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
- data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
- data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
- data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
- data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
- data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
- data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
- data/vendor/kreuzberg/src/extractors/text.rs +4 -0
- data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
- data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
- data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
- data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
- data/vendor/kreuzberg/src/lib.rs +2 -2
- data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
- data/vendor/kreuzberg/src/mcp/format.rs +211 -0
- data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
- data/vendor/kreuzberg/src/mcp/params.rs +196 -0
- data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
- data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
- data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
- data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
- data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
- data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
- data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
- data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
- data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
- data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
- data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
- data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
- data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
- data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
- data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
- data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
- data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
- data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
- data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
- data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
- data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
- data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
- data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
- data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
- data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
- data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
- data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
- data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
- data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
- data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
- data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
- data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
- data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
- data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
- data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
- data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
- data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
- data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
- data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
- data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
- data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
- data/vendor/kreuzberg/src/text/quality.rs +1 -1
- data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
- data/vendor/kreuzberg/src/types/djot.rs +209 -0
- data/vendor/kreuzberg/src/types/extraction.rs +301 -0
- data/vendor/kreuzberg/src/types/formats.rs +443 -0
- data/vendor/kreuzberg/src/types/metadata.rs +560 -0
- data/vendor/kreuzberg/src/types/mod.rs +281 -0
- data/vendor/kreuzberg/src/types/page.rs +182 -0
- data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
- data/vendor/kreuzberg/src/types/tables.rs +39 -0
- data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
- data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
- data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
- data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
- data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
- data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
- data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
- data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
- data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
- data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
- data/vendor/kreuzberg/tests/api_embed.rs +6 -9
- data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
- data/vendor/kreuzberg/tests/core_integration.rs +1 -0
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
- data/vendor/kreuzberg/tests/format_integration.rs +2 -0
- data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
- data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
- data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
- data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
- data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
- data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
- data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
- data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
- data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
- data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
- data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
- data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
- data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
- data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
- data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
- data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
- data/vendor/kreuzberg-ffi/src/error.rs +46 -14
- data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
- data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
- data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
- data/vendor/kreuzberg-ffi/src/result.rs +148 -122
- data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
- data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
- metadata +200 -28
- data/vendor/kreuzberg/src/api/server.rs +0 -518
- data/vendor/kreuzberg/src/core/config.rs +0 -1914
- data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
- data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
- data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
- data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
- data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
- data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
- data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
- data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
- data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
- data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
- data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
- data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
- data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
- data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
- data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
- data/vendor/kreuzberg/src/types.rs +0 -1713
- data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
- data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
|
@@ -0,0 +1,259 @@
|
|
|
1
|
+
//! Core RTF parsing logic.
|
|
2
|
+
|
|
3
|
+
use crate::extractors::rtf::encoding::{decode_windows_1252, parse_hex_byte, parse_rtf_control_word};
|
|
4
|
+
use crate::extractors::rtf::formatting::normalize_whitespace;
|
|
5
|
+
use crate::extractors::rtf::images::extract_image_metadata;
|
|
6
|
+
use crate::extractors::rtf::tables::TableState;
|
|
7
|
+
use crate::types::Table;
|
|
8
|
+
|
|
9
|
+
/// Extract text and image metadata from RTF document.
|
|
10
|
+
///
|
|
11
|
+
/// This function extracts plain text from an RTF document by:
|
|
12
|
+
/// 1. Tokenizing control sequences and text
|
|
13
|
+
/// 2. Converting encoded characters to Unicode
|
|
14
|
+
/// 3. Extracting text while skipping formatting groups
|
|
15
|
+
/// 4. Detecting and extracting image metadata (\pict sections)
|
|
16
|
+
/// 5. Normalizing whitespace
|
|
17
|
+
pub fn extract_text_from_rtf(content: &str) -> (String, Vec<Table>) {
|
|
18
|
+
let mut result = String::new();
|
|
19
|
+
let mut chars = content.chars().peekable();
|
|
20
|
+
let mut tables: Vec<Table> = Vec::new();
|
|
21
|
+
let mut table_state: Option<TableState> = None;
|
|
22
|
+
|
|
23
|
+
let ensure_table = |table_state: &mut Option<TableState>| {
|
|
24
|
+
if table_state.is_none() {
|
|
25
|
+
*table_state = Some(TableState::new());
|
|
26
|
+
}
|
|
27
|
+
};
|
|
28
|
+
|
|
29
|
+
let finalize_table = |state_opt: &mut Option<TableState>, tables: &mut Vec<Table>| {
|
|
30
|
+
if let Some(state) = state_opt.take()
|
|
31
|
+
&& let Some(table) = state.finalize()
|
|
32
|
+
{
|
|
33
|
+
tables.push(table);
|
|
34
|
+
}
|
|
35
|
+
};
|
|
36
|
+
|
|
37
|
+
while let Some(ch) = chars.next() {
|
|
38
|
+
match ch {
|
|
39
|
+
'\\' => {
|
|
40
|
+
if let Some(&next_ch) = chars.peek() {
|
|
41
|
+
match next_ch {
|
|
42
|
+
'\\' | '{' | '}' => {
|
|
43
|
+
chars.next();
|
|
44
|
+
result.push(next_ch);
|
|
45
|
+
}
|
|
46
|
+
'\'' => {
|
|
47
|
+
chars.next();
|
|
48
|
+
let hex1 = chars.next();
|
|
49
|
+
let hex2 = chars.next();
|
|
50
|
+
if let (Some(h1), Some(h2)) = (hex1, hex2)
|
|
51
|
+
&& let Some(byte) = parse_hex_byte(h1, h2)
|
|
52
|
+
{
|
|
53
|
+
let decoded = decode_windows_1252(byte);
|
|
54
|
+
result.push(decoded);
|
|
55
|
+
if let Some(state) = table_state.as_mut()
|
|
56
|
+
&& state.in_row
|
|
57
|
+
{
|
|
58
|
+
state.current_cell.push(decoded);
|
|
59
|
+
}
|
|
60
|
+
}
|
|
61
|
+
}
|
|
62
|
+
'u' => {
|
|
63
|
+
chars.next();
|
|
64
|
+
let mut num_str = String::new();
|
|
65
|
+
while let Some(&c) = chars.peek() {
|
|
66
|
+
if c.is_ascii_digit() || c == '-' {
|
|
67
|
+
num_str.push(c);
|
|
68
|
+
chars.next();
|
|
69
|
+
} else {
|
|
70
|
+
break;
|
|
71
|
+
}
|
|
72
|
+
}
|
|
73
|
+
if let Ok(code_num) = num_str.parse::<i32>() {
|
|
74
|
+
let code_u = if code_num < 0 {
|
|
75
|
+
(code_num + 65536) as u32
|
|
76
|
+
} else {
|
|
77
|
+
code_num as u32
|
|
78
|
+
};
|
|
79
|
+
if let Some(c) = char::from_u32(code_u) {
|
|
80
|
+
result.push(c);
|
|
81
|
+
if let Some(state) = table_state.as_mut()
|
|
82
|
+
&& state.in_row
|
|
83
|
+
{
|
|
84
|
+
state.current_cell.push(c);
|
|
85
|
+
}
|
|
86
|
+
}
|
|
87
|
+
}
|
|
88
|
+
}
|
|
89
|
+
_ => {
|
|
90
|
+
let (control_word, _) = parse_rtf_control_word(&mut chars);
|
|
91
|
+
handle_control_word(
|
|
92
|
+
&control_word,
|
|
93
|
+
&mut chars,
|
|
94
|
+
&mut result,
|
|
95
|
+
&mut table_state,
|
|
96
|
+
&mut tables,
|
|
97
|
+
&ensure_table,
|
|
98
|
+
&finalize_table,
|
|
99
|
+
);
|
|
100
|
+
}
|
|
101
|
+
}
|
|
102
|
+
}
|
|
103
|
+
}
|
|
104
|
+
'{' | '}' => {
|
|
105
|
+
if !result.is_empty() && !result.ends_with(' ') {
|
|
106
|
+
result.push(' ');
|
|
107
|
+
}
|
|
108
|
+
}
|
|
109
|
+
' ' | '\t' | '\n' | '\r' => {
|
|
110
|
+
if !result.is_empty() && !result.ends_with(' ') {
|
|
111
|
+
result.push(' ');
|
|
112
|
+
}
|
|
113
|
+
if let Some(state) = table_state.as_mut()
|
|
114
|
+
&& state.in_row
|
|
115
|
+
&& !state.current_cell.ends_with(' ')
|
|
116
|
+
{
|
|
117
|
+
state.current_cell.push(' ');
|
|
118
|
+
}
|
|
119
|
+
}
|
|
120
|
+
_ => {
|
|
121
|
+
if let Some(state) = table_state.as_ref()
|
|
122
|
+
&& !state.in_row
|
|
123
|
+
&& !state.rows.is_empty()
|
|
124
|
+
{
|
|
125
|
+
finalize_table(&mut table_state, &mut tables);
|
|
126
|
+
}
|
|
127
|
+
result.push(ch);
|
|
128
|
+
if let Some(state) = table_state.as_mut()
|
|
129
|
+
&& state.in_row
|
|
130
|
+
{
|
|
131
|
+
state.current_cell.push(ch);
|
|
132
|
+
}
|
|
133
|
+
}
|
|
134
|
+
}
|
|
135
|
+
}
|
|
136
|
+
|
|
137
|
+
if table_state.is_some() {
|
|
138
|
+
finalize_table(&mut table_state, &mut tables);
|
|
139
|
+
}
|
|
140
|
+
|
|
141
|
+
(normalize_whitespace(&result), tables)
|
|
142
|
+
}
|
|
143
|
+
|
|
144
|
+
/// Handle an RTF control word during parsing.
|
|
145
|
+
#[allow(clippy::too_many_arguments)]
|
|
146
|
+
fn handle_control_word(
|
|
147
|
+
control_word: &str,
|
|
148
|
+
chars: &mut std::iter::Peekable<std::str::Chars>,
|
|
149
|
+
result: &mut String,
|
|
150
|
+
table_state: &mut Option<TableState>,
|
|
151
|
+
tables: &mut Vec<Table>,
|
|
152
|
+
ensure_table: &dyn Fn(&mut Option<TableState>),
|
|
153
|
+
finalize_table: &dyn Fn(&mut Option<TableState>, &mut Vec<Table>),
|
|
154
|
+
) {
|
|
155
|
+
match control_word {
|
|
156
|
+
"pict" => {
|
|
157
|
+
let image_metadata = extract_image_metadata(chars);
|
|
158
|
+
if !image_metadata.is_empty() {
|
|
159
|
+
result.push('!');
|
|
160
|
+
result.push('[');
|
|
161
|
+
result.push_str("image");
|
|
162
|
+
result.push(']');
|
|
163
|
+
result.push('(');
|
|
164
|
+
result.push_str(&image_metadata);
|
|
165
|
+
result.push(')');
|
|
166
|
+
result.push(' ');
|
|
167
|
+
if let Some(state) = table_state.as_mut()
|
|
168
|
+
&& state.in_row
|
|
169
|
+
{
|
|
170
|
+
state.current_cell.push('!');
|
|
171
|
+
state.current_cell.push('[');
|
|
172
|
+
state.current_cell.push_str("image");
|
|
173
|
+
state.current_cell.push(']');
|
|
174
|
+
state.current_cell.push('(');
|
|
175
|
+
state.current_cell.push_str(&image_metadata);
|
|
176
|
+
state.current_cell.push(')');
|
|
177
|
+
state.current_cell.push(' ');
|
|
178
|
+
}
|
|
179
|
+
}
|
|
180
|
+
}
|
|
181
|
+
"par" => {
|
|
182
|
+
if table_state.is_some() {
|
|
183
|
+
finalize_table(table_state, tables);
|
|
184
|
+
}
|
|
185
|
+
if !result.is_empty() && !result.ends_with('\n') {
|
|
186
|
+
result.push('\n');
|
|
187
|
+
result.push('\n');
|
|
188
|
+
}
|
|
189
|
+
}
|
|
190
|
+
"tab" => {
|
|
191
|
+
result.push('\t');
|
|
192
|
+
if let Some(state) = table_state.as_mut()
|
|
193
|
+
&& state.in_row
|
|
194
|
+
{
|
|
195
|
+
state.current_cell.push('\t');
|
|
196
|
+
}
|
|
197
|
+
}
|
|
198
|
+
"bullet" => {
|
|
199
|
+
result.push('•');
|
|
200
|
+
}
|
|
201
|
+
"lquote" => {
|
|
202
|
+
result.push('\u{2018}');
|
|
203
|
+
}
|
|
204
|
+
"rquote" => {
|
|
205
|
+
result.push('\u{2019}');
|
|
206
|
+
}
|
|
207
|
+
"ldblquote" => {
|
|
208
|
+
result.push('\u{201C}');
|
|
209
|
+
}
|
|
210
|
+
"rdblquote" => {
|
|
211
|
+
result.push('\u{201D}');
|
|
212
|
+
}
|
|
213
|
+
"endash" => {
|
|
214
|
+
result.push('\u{2013}');
|
|
215
|
+
}
|
|
216
|
+
"emdash" => {
|
|
217
|
+
result.push('\u{2014}');
|
|
218
|
+
}
|
|
219
|
+
"trowd" => {
|
|
220
|
+
ensure_table(table_state);
|
|
221
|
+
if let Some(state) = table_state.as_mut() {
|
|
222
|
+
state.start_row();
|
|
223
|
+
}
|
|
224
|
+
if !result.is_empty() && !result.ends_with('\n') {
|
|
225
|
+
result.push('\n');
|
|
226
|
+
}
|
|
227
|
+
if !result.ends_with('|') {
|
|
228
|
+
result.push('|');
|
|
229
|
+
result.push(' ');
|
|
230
|
+
}
|
|
231
|
+
}
|
|
232
|
+
"cell" => {
|
|
233
|
+
if !result.ends_with('|') {
|
|
234
|
+
if !result.ends_with(' ') && !result.is_empty() {
|
|
235
|
+
result.push(' ');
|
|
236
|
+
}
|
|
237
|
+
result.push('|');
|
|
238
|
+
}
|
|
239
|
+
if !result.ends_with(' ') {
|
|
240
|
+
result.push(' ');
|
|
241
|
+
}
|
|
242
|
+
}
|
|
243
|
+
"row" => {
|
|
244
|
+
ensure_table(table_state);
|
|
245
|
+
if let Some(state) = table_state.as_mut()
|
|
246
|
+
&& (state.in_row || !state.current_cell.is_empty())
|
|
247
|
+
{
|
|
248
|
+
state.push_row();
|
|
249
|
+
}
|
|
250
|
+
if !result.ends_with('|') {
|
|
251
|
+
result.push('|');
|
|
252
|
+
}
|
|
253
|
+
if !result.ends_with('\n') {
|
|
254
|
+
result.push('\n');
|
|
255
|
+
}
|
|
256
|
+
}
|
|
257
|
+
_ => {}
|
|
258
|
+
}
|
|
259
|
+
}
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
//! Table extraction and state management for RTF documents.
|
|
2
|
+
|
|
3
|
+
use crate::extraction::cells_to_markdown;
|
|
4
|
+
use crate::types::Table;
|
|
5
|
+
|
|
6
|
+
/// State machine for tracking table construction during RTF parsing.
|
|
7
|
+
pub struct TableState {
|
|
8
|
+
pub rows: Vec<Vec<String>>,
|
|
9
|
+
pub current_row: Vec<String>,
|
|
10
|
+
pub current_cell: String,
|
|
11
|
+
pub in_row: bool,
|
|
12
|
+
}
|
|
13
|
+
|
|
14
|
+
impl TableState {
|
|
15
|
+
/// Create a new empty table state.
|
|
16
|
+
pub fn new() -> Self {
|
|
17
|
+
Self {
|
|
18
|
+
rows: Vec::new(),
|
|
19
|
+
current_row: Vec::new(),
|
|
20
|
+
current_cell: String::new(),
|
|
21
|
+
in_row: false,
|
|
22
|
+
}
|
|
23
|
+
}
|
|
24
|
+
|
|
25
|
+
/// Push the current cell content to the current row.
|
|
26
|
+
pub fn push_cell(&mut self) {
|
|
27
|
+
let cell = self.current_cell.trim().to_string();
|
|
28
|
+
self.current_row.push(cell);
|
|
29
|
+
self.current_cell.clear();
|
|
30
|
+
}
|
|
31
|
+
|
|
32
|
+
/// Push the current row to the rows collection.
|
|
33
|
+
pub fn push_row(&mut self) {
|
|
34
|
+
if self.in_row || !self.current_cell.is_empty() {
|
|
35
|
+
self.push_cell();
|
|
36
|
+
self.in_row = false;
|
|
37
|
+
}
|
|
38
|
+
if !self.current_row.is_empty() {
|
|
39
|
+
self.rows.push(self.current_row.clone());
|
|
40
|
+
self.current_row.clear();
|
|
41
|
+
}
|
|
42
|
+
}
|
|
43
|
+
|
|
44
|
+
/// Start a new table row.
|
|
45
|
+
pub fn start_row(&mut self) {
|
|
46
|
+
if self.in_row {
|
|
47
|
+
self.push_row();
|
|
48
|
+
}
|
|
49
|
+
self.in_row = true;
|
|
50
|
+
self.current_cell.clear();
|
|
51
|
+
self.current_row.clear();
|
|
52
|
+
}
|
|
53
|
+
|
|
54
|
+
/// Check if this table has any content.
|
|
55
|
+
#[allow(dead_code)]
|
|
56
|
+
pub fn is_empty(&self) -> bool {
|
|
57
|
+
self.rows.is_empty() && self.current_row.is_empty() && self.current_cell.is_empty()
|
|
58
|
+
}
|
|
59
|
+
|
|
60
|
+
/// Finalize the table and convert it to a Table struct.
|
|
61
|
+
pub fn finalize(mut self) -> Option<Table> {
|
|
62
|
+
if self.in_row || !self.current_cell.is_empty() || !self.current_row.is_empty() {
|
|
63
|
+
self.push_row();
|
|
64
|
+
}
|
|
65
|
+
|
|
66
|
+
if self.rows.is_empty() {
|
|
67
|
+
return None;
|
|
68
|
+
}
|
|
69
|
+
|
|
70
|
+
let markdown = cells_to_markdown(&self.rows);
|
|
71
|
+
Some(Table {
|
|
72
|
+
cells: self.rows,
|
|
73
|
+
markdown,
|
|
74
|
+
page_number: 1,
|
|
75
|
+
})
|
|
76
|
+
}
|
|
77
|
+
}
|
|
78
|
+
|
|
79
|
+
impl Default for TableState {
|
|
80
|
+
fn default() -> Self {
|
|
81
|
+
Self::new()
|
|
82
|
+
}
|
|
83
|
+
}
|
|
@@ -91,6 +91,8 @@ impl DocumentExtractor for PlainTextExtractor {
|
|
|
91
91
|
detected_languages: None,
|
|
92
92
|
chunks: None,
|
|
93
93
|
images: None,
|
|
94
|
+
elements: None,
|
|
95
|
+
djot_content: None,
|
|
94
96
|
})
|
|
95
97
|
}
|
|
96
98
|
|
|
@@ -184,6 +186,8 @@ impl DocumentExtractor for MarkdownExtractor {
|
|
|
184
186
|
detected_languages: None,
|
|
185
187
|
chunks: None,
|
|
186
188
|
images: None,
|
|
189
|
+
elements: None,
|
|
190
|
+
djot_content: None,
|
|
187
191
|
})
|
|
188
192
|
}
|
|
189
193
|
|
|
@@ -114,6 +114,8 @@ machine learning that uses neural networks with multiple layers.
|
|
|
114
114
|
chunks: None,
|
|
115
115
|
images: None,
|
|
116
116
|
pages: None,
|
|
117
|
+
elements: None,
|
|
118
|
+
djot_content: None,
|
|
117
119
|
};
|
|
118
120
|
|
|
119
121
|
processor.process(&mut result, &config).await.unwrap();
|
|
@@ -143,6 +145,8 @@ machine learning that uses neural networks with multiple layers.
|
|
|
143
145
|
chunks: None,
|
|
144
146
|
images: None,
|
|
145
147
|
pages: None,
|
|
148
|
+
elements: None,
|
|
149
|
+
djot_content: None,
|
|
146
150
|
};
|
|
147
151
|
|
|
148
152
|
processor.process(&mut result, &config).await.unwrap();
|
|
@@ -168,6 +172,8 @@ machine learning that uses neural networks with multiple layers.
|
|
|
168
172
|
chunks: None,
|
|
169
173
|
images: None,
|
|
170
174
|
pages: None,
|
|
175
|
+
elements: None,
|
|
176
|
+
djot_content: None,
|
|
171
177
|
};
|
|
172
178
|
|
|
173
179
|
processor.process(&mut result, &config).await.unwrap();
|
|
@@ -193,6 +199,8 @@ machine learning that uses neural networks with multiple layers.
|
|
|
193
199
|
chunks: None,
|
|
194
200
|
images: None,
|
|
195
201
|
pages: None,
|
|
202
|
+
elements: None,
|
|
203
|
+
djot_content: None,
|
|
196
204
|
};
|
|
197
205
|
|
|
198
206
|
processor.process(&mut result, &config).await.unwrap();
|
|
@@ -229,6 +237,8 @@ machine learning that uses neural networks with multiple layers.
|
|
|
229
237
|
chunks: None,
|
|
230
238
|
images: None,
|
|
231
239
|
pages: None,
|
|
240
|
+
elements: None,
|
|
241
|
+
djot_content: None,
|
|
232
242
|
};
|
|
233
243
|
|
|
234
244
|
let config_with_keywords = ExtractionConfig {
|
|
@@ -254,6 +264,8 @@ machine learning that uses neural networks with multiple layers.
|
|
|
254
264
|
chunks: None,
|
|
255
265
|
images: None,
|
|
256
266
|
pages: None,
|
|
267
|
+
elements: None,
|
|
268
|
+
djot_content: None,
|
|
257
269
|
};
|
|
258
270
|
|
|
259
271
|
let long_result = ExtractionResult {
|
|
@@ -265,6 +277,8 @@ machine learning that uses neural networks with multiple layers.
|
|
|
265
277
|
chunks: None,
|
|
266
278
|
images: None,
|
|
267
279
|
pages: None,
|
|
280
|
+
elements: None,
|
|
281
|
+
djot_content: None,
|
|
268
282
|
};
|
|
269
283
|
|
|
270
284
|
let short_duration = processor.estimated_duration_ms(&short_result);
|
|
@@ -108,7 +108,9 @@ mod tests {
|
|
|
108
108
|
detected_languages: None,
|
|
109
109
|
chunks: None,
|
|
110
110
|
images: None,
|
|
111
|
+
djot_content: None,
|
|
111
112
|
pages: None,
|
|
113
|
+
elements: None,
|
|
112
114
|
};
|
|
113
115
|
|
|
114
116
|
processor.process(&mut result, &config).await.unwrap();
|
|
@@ -132,7 +134,9 @@ mod tests {
|
|
|
132
134
|
detected_languages: None,
|
|
133
135
|
chunks: None,
|
|
134
136
|
images: None,
|
|
137
|
+
djot_content: None,
|
|
135
138
|
pages: None,
|
|
139
|
+
elements: None,
|
|
136
140
|
};
|
|
137
141
|
|
|
138
142
|
processor.process(&mut result, &config).await.unwrap();
|
|
@@ -167,7 +171,9 @@ mod tests {
|
|
|
167
171
|
detected_languages: None,
|
|
168
172
|
chunks: None,
|
|
169
173
|
images: None,
|
|
174
|
+
djot_content: None,
|
|
170
175
|
pages: None,
|
|
176
|
+
elements: None,
|
|
171
177
|
};
|
|
172
178
|
|
|
173
179
|
let config_with_lang = ExtractionConfig {
|
|
@@ -196,7 +202,9 @@ mod tests {
|
|
|
196
202
|
detected_languages: None,
|
|
197
203
|
chunks: None,
|
|
198
204
|
images: None,
|
|
205
|
+
djot_content: None,
|
|
199
206
|
pages: None,
|
|
207
|
+
elements: None,
|
|
200
208
|
};
|
|
201
209
|
|
|
202
210
|
let long_result = ExtractionResult {
|
|
@@ -207,7 +215,9 @@ mod tests {
|
|
|
207
215
|
detected_languages: None,
|
|
208
216
|
chunks: None,
|
|
209
217
|
images: None,
|
|
218
|
+
djot_content: None,
|
|
210
219
|
pages: None,
|
|
220
|
+
elements: None,
|
|
211
221
|
};
|
|
212
222
|
|
|
213
223
|
let short_duration = processor.estimated_duration_ms(&short_result);
|
data/vendor/kreuzberg/src/lib.rs
CHANGED
|
@@ -89,14 +89,14 @@ pub use core::extractor::{batch_extract_file_sync, extract_file_sync};
|
|
|
89
89
|
|
|
90
90
|
pub use core::config::{
|
|
91
91
|
ChunkingConfig, EmbeddingConfig, EmbeddingModelType, ExtractionConfig, ImageExtractionConfig,
|
|
92
|
-
LanguageDetectionConfig, OcrConfig, PostProcessorConfig, TokenReductionConfig,
|
|
92
|
+
LanguageDetectionConfig, OcrConfig, OutputFormat, PageConfig, PostProcessorConfig, TokenReductionConfig,
|
|
93
93
|
};
|
|
94
94
|
|
|
95
95
|
#[cfg(feature = "api")]
|
|
96
96
|
pub use core::server_config::ServerConfig;
|
|
97
97
|
|
|
98
98
|
#[cfg(feature = "pdf")]
|
|
99
|
-
pub use core::config::PdfConfig;
|
|
99
|
+
pub use core::config::{HierarchyConfig, PdfConfig};
|
|
100
100
|
|
|
101
101
|
pub use core::mime::{
|
|
102
102
|
DOCX_MIME_TYPE, EXCEL_MIME_TYPE, HTML_MIME_TYPE, JSON_MIME_TYPE, MARKDOWN_MIME_TYPE, PDF_MIME_TYPE,
|