kreuzberg 4.0.8 → 4.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
- data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
- data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
- data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
- data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
- data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
- data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
- data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
- data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
- data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
- data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
- data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
- data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
- data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
- data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
- data/ext/kreuzberg_rb/native/src/result.rs +326 -0
- data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
- data/lib/kreuzberg/config.rb +66 -0
- data/lib/kreuzberg/result.rb +107 -2
- data/lib/kreuzberg/types.rb +104 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +0 -4
- data/sig/kreuzberg.rbs +105 -1
- data/vendor/Cargo.toml +3 -3
- data/vendor/kreuzberg/Cargo.toml +4 -3
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/api/config.rs +69 -0
- data/vendor/kreuzberg/src/api/handlers.rs +99 -2
- data/vendor/kreuzberg/src/api/mod.rs +14 -7
- data/vendor/kreuzberg/src/api/router.rs +214 -0
- data/vendor/kreuzberg/src/api/startup.rs +243 -0
- data/vendor/kreuzberg/src/api/types.rs +78 -0
- data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
- data/vendor/kreuzberg/src/cache/core.rs +428 -0
- data/vendor/kreuzberg/src/cache/mod.rs +21 -843
- data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
- data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
- data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
- data/vendor/kreuzberg/src/chunking/config.rs +52 -0
- data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
- data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
- data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
- data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
- data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
- data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
- data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
- data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
- data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
- data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
- data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
- data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
- data/vendor/kreuzberg/src/core/config/page.rs +57 -0
- data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
- data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
- data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
- data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
- data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
- data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
- data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
- data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
- data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
- data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
- data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
- data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
- data/vendor/kreuzberg/src/core/mod.rs +4 -1
- data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
- data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
- data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
- data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
- data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
- data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
- data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
- data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
- data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
- data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
- data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
- data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
- data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
- data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
- data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
- data/vendor/kreuzberg/src/embeddings.rs +136 -13
- data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
- data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
- data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
- data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
- data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
- data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
- data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
- data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
- data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
- data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
- data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
- data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
- data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
- data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
- data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
- data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
- data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
- data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
- data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
- data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
- data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
- data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
- data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
- data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
- data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
- data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
- data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
- data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
- data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
- data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
- data/vendor/kreuzberg/src/extractors/email.rs +2 -0
- data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
- data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
- data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
- data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
- data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
- data/vendor/kreuzberg/src/extractors/html.rs +80 -8
- data/vendor/kreuzberg/src/extractors/image.rs +8 -1
- data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
- data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
- data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
- data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
- data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
- data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
- data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
- data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
- data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
- data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
- data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
- data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
- data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
- data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
- data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
- data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
- data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
- data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
- data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
- data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
- data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
- data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
- data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
- data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
- data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
- data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
- data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
- data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
- data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
- data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
- data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
- data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
- data/vendor/kreuzberg/src/extractors/text.rs +4 -0
- data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
- data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
- data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
- data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
- data/vendor/kreuzberg/src/lib.rs +2 -2
- data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
- data/vendor/kreuzberg/src/mcp/format.rs +211 -0
- data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
- data/vendor/kreuzberg/src/mcp/params.rs +196 -0
- data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
- data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
- data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
- data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
- data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
- data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
- data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
- data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
- data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
- data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
- data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
- data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
- data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
- data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
- data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
- data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
- data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
- data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
- data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
- data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
- data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
- data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
- data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
- data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
- data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
- data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
- data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
- data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
- data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
- data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
- data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
- data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
- data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
- data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
- data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
- data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
- data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
- data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
- data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
- data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
- data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
- data/vendor/kreuzberg/src/text/quality.rs +1 -1
- data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
- data/vendor/kreuzberg/src/types/djot.rs +209 -0
- data/vendor/kreuzberg/src/types/extraction.rs +301 -0
- data/vendor/kreuzberg/src/types/formats.rs +443 -0
- data/vendor/kreuzberg/src/types/metadata.rs +560 -0
- data/vendor/kreuzberg/src/types/mod.rs +281 -0
- data/vendor/kreuzberg/src/types/page.rs +182 -0
- data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
- data/vendor/kreuzberg/src/types/tables.rs +39 -0
- data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
- data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
- data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
- data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
- data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
- data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
- data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
- data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
- data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
- data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
- data/vendor/kreuzberg/tests/api_embed.rs +6 -9
- data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
- data/vendor/kreuzberg/tests/core_integration.rs +1 -0
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
- data/vendor/kreuzberg/tests/format_integration.rs +2 -0
- data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
- data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
- data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
- data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
- data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
- data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
- data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
- data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
- data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
- data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
- data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
- data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
- data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
- data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
- data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
- data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
- data/vendor/kreuzberg-ffi/src/error.rs +46 -14
- data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
- data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
- data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
- data/vendor/kreuzberg-ffi/src/result.rs +148 -122
- data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
- data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
- metadata +200 -28
- data/vendor/kreuzberg/src/api/server.rs +0 -518
- data/vendor/kreuzberg/src/core/config.rs +0 -1914
- data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
- data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
- data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
- data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
- data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
- data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
- data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
- data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
- data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
- data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
- data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
- data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
- data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
- data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
- data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
- data/vendor/kreuzberg/src/types.rs +0 -1713
- data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
- data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
|
@@ -0,0 +1,443 @@
|
|
|
1
|
+
//! Format-specific extraction results and OCR configuration types.
|
|
2
|
+
|
|
3
|
+
use serde::{Deserialize, Serialize};
|
|
4
|
+
use std::collections::HashMap;
|
|
5
|
+
|
|
6
|
+
use super::extraction::ExtractedImage;
|
|
7
|
+
use super::metadata::PptxMetadata;
|
|
8
|
+
use super::page::{PageContent, PageStructure};
|
|
9
|
+
|
|
10
|
+
/// Excel workbook representation.
|
|
11
|
+
///
|
|
12
|
+
/// Contains all sheets from an Excel file (.xlsx, .xls, etc.) with
|
|
13
|
+
/// extracted content and metadata.
|
|
14
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
15
|
+
pub struct ExcelWorkbook {
|
|
16
|
+
/// All sheets in the workbook
|
|
17
|
+
pub sheets: Vec<ExcelSheet>,
|
|
18
|
+
/// Workbook-level metadata (author, creation date, etc.)
|
|
19
|
+
pub metadata: HashMap<String, String>,
|
|
20
|
+
}
|
|
21
|
+
|
|
22
|
+
/// Single Excel worksheet.
|
|
23
|
+
///
|
|
24
|
+
/// Represents one sheet from an Excel workbook with its content
|
|
25
|
+
/// converted to Markdown format and dimensional statistics.
|
|
26
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
27
|
+
pub struct ExcelSheet {
|
|
28
|
+
/// Sheet name as it appears in Excel
|
|
29
|
+
pub name: String,
|
|
30
|
+
/// Sheet content converted to Markdown tables
|
|
31
|
+
pub markdown: String,
|
|
32
|
+
/// Number of rows
|
|
33
|
+
pub row_count: usize,
|
|
34
|
+
/// Number of columns
|
|
35
|
+
pub col_count: usize,
|
|
36
|
+
/// Total number of non-empty cells
|
|
37
|
+
pub cell_count: usize,
|
|
38
|
+
/// Pre-extracted table cells (2D vector of cell values)
|
|
39
|
+
/// Populated during markdown generation to avoid re-parsing markdown.
|
|
40
|
+
/// None for empty sheets.
|
|
41
|
+
#[serde(skip)]
|
|
42
|
+
pub table_cells: Option<Vec<Vec<String>>>,
|
|
43
|
+
}
|
|
44
|
+
|
|
45
|
+
/// XML extraction result.
|
|
46
|
+
///
|
|
47
|
+
/// Contains extracted text content from XML files along with
|
|
48
|
+
/// structural statistics about the XML document.
|
|
49
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
50
|
+
pub struct XmlExtractionResult {
|
|
51
|
+
/// Extracted text content (XML structure filtered out)
|
|
52
|
+
pub content: String,
|
|
53
|
+
/// Total number of XML elements processed
|
|
54
|
+
pub element_count: usize,
|
|
55
|
+
/// List of unique element names found (sorted)
|
|
56
|
+
pub unique_elements: Vec<String>,
|
|
57
|
+
}
|
|
58
|
+
|
|
59
|
+
/// Plain text and Markdown extraction result.
|
|
60
|
+
///
|
|
61
|
+
/// Contains the extracted text along with statistics and,
|
|
62
|
+
/// for Markdown files, structural elements like headers and links.
|
|
63
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
64
|
+
pub struct TextExtractionResult {
|
|
65
|
+
/// Extracted text content
|
|
66
|
+
pub content: String,
|
|
67
|
+
/// Number of lines
|
|
68
|
+
pub line_count: usize,
|
|
69
|
+
/// Number of words
|
|
70
|
+
pub word_count: usize,
|
|
71
|
+
/// Number of characters
|
|
72
|
+
pub character_count: usize,
|
|
73
|
+
/// Markdown headers (text only, Markdown files only)
|
|
74
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
75
|
+
pub headers: Option<Vec<String>>,
|
|
76
|
+
/// Markdown links as (text, URL) tuples (Markdown files only)
|
|
77
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
78
|
+
pub links: Option<Vec<(String, String)>>,
|
|
79
|
+
/// Code blocks as (language, code) tuples (Markdown files only)
|
|
80
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
81
|
+
pub code_blocks: Option<Vec<(String, String)>>,
|
|
82
|
+
}
|
|
83
|
+
|
|
84
|
+
/// PowerPoint (PPTX) extraction result.
|
|
85
|
+
///
|
|
86
|
+
/// Contains extracted slide content, metadata, and embedded images/tables.
|
|
87
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
88
|
+
pub struct PptxExtractionResult {
|
|
89
|
+
/// Extracted text content from all slides
|
|
90
|
+
pub content: String,
|
|
91
|
+
/// Presentation metadata
|
|
92
|
+
pub metadata: PptxMetadata,
|
|
93
|
+
/// Total number of slides
|
|
94
|
+
pub slide_count: usize,
|
|
95
|
+
/// Total number of embedded images
|
|
96
|
+
pub image_count: usize,
|
|
97
|
+
/// Total number of tables
|
|
98
|
+
pub table_count: usize,
|
|
99
|
+
/// Extracted images from the presentation
|
|
100
|
+
pub images: Vec<ExtractedImage>,
|
|
101
|
+
/// Slide structure with boundaries (when page tracking is enabled)
|
|
102
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
103
|
+
pub page_structure: Option<PageStructure>,
|
|
104
|
+
/// Per-slide content (when page tracking is enabled)
|
|
105
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
106
|
+
pub page_contents: Option<Vec<PageContent>>,
|
|
107
|
+
}
|
|
108
|
+
|
|
109
|
+
/// Email extraction result.
|
|
110
|
+
///
|
|
111
|
+
/// Complete representation of an extracted email message (.eml or .msg)
|
|
112
|
+
/// including headers, body content, and attachments.
|
|
113
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
114
|
+
pub struct EmailExtractionResult {
|
|
115
|
+
/// Email subject line
|
|
116
|
+
pub subject: Option<String>,
|
|
117
|
+
/// Sender email address
|
|
118
|
+
pub from_email: Option<String>,
|
|
119
|
+
/// Primary recipient email addresses
|
|
120
|
+
pub to_emails: Vec<String>,
|
|
121
|
+
/// CC recipient email addresses
|
|
122
|
+
pub cc_emails: Vec<String>,
|
|
123
|
+
/// BCC recipient email addresses
|
|
124
|
+
pub bcc_emails: Vec<String>,
|
|
125
|
+
/// Email date/timestamp
|
|
126
|
+
pub date: Option<String>,
|
|
127
|
+
/// Message-ID header value
|
|
128
|
+
pub message_id: Option<String>,
|
|
129
|
+
/// Plain text version of the email body
|
|
130
|
+
pub plain_text: Option<String>,
|
|
131
|
+
/// HTML version of the email body
|
|
132
|
+
pub html_content: Option<String>,
|
|
133
|
+
/// Cleaned/processed text content
|
|
134
|
+
pub cleaned_text: String,
|
|
135
|
+
/// List of email attachments
|
|
136
|
+
pub attachments: Vec<EmailAttachment>,
|
|
137
|
+
/// Additional email headers and metadata
|
|
138
|
+
pub metadata: HashMap<String, String>,
|
|
139
|
+
}
|
|
140
|
+
|
|
141
|
+
/// Email attachment representation.
|
|
142
|
+
///
|
|
143
|
+
/// Contains metadata and optionally the content of an email attachment.
|
|
144
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
145
|
+
pub struct EmailAttachment {
|
|
146
|
+
/// Attachment name (from Content-Disposition header)
|
|
147
|
+
pub name: Option<String>,
|
|
148
|
+
/// Filename of the attachment
|
|
149
|
+
pub filename: Option<String>,
|
|
150
|
+
/// MIME type of the attachment
|
|
151
|
+
pub mime_type: Option<String>,
|
|
152
|
+
/// Size in bytes
|
|
153
|
+
pub size: Option<usize>,
|
|
154
|
+
/// Whether this attachment is an image
|
|
155
|
+
pub is_image: bool,
|
|
156
|
+
/// Attachment data (if extracted)
|
|
157
|
+
pub data: Option<Vec<u8>>,
|
|
158
|
+
}
|
|
159
|
+
|
|
160
|
+
/// OCR extraction result.
|
|
161
|
+
///
|
|
162
|
+
/// Result of performing OCR on an image or scanned document,
|
|
163
|
+
/// including recognized text and detected tables.
|
|
164
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
165
|
+
pub struct OcrExtractionResult {
|
|
166
|
+
/// Recognized text content
|
|
167
|
+
pub content: String,
|
|
168
|
+
/// Original MIME type of the processed image
|
|
169
|
+
pub mime_type: String,
|
|
170
|
+
/// OCR processing metadata (confidence scores, language, etc.)
|
|
171
|
+
pub metadata: HashMap<String, serde_json::Value>,
|
|
172
|
+
/// Tables detected and extracted via OCR
|
|
173
|
+
pub tables: Vec<OcrTable>,
|
|
174
|
+
}
|
|
175
|
+
|
|
176
|
+
/// Table detected via OCR.
|
|
177
|
+
///
|
|
178
|
+
/// Represents a table structure recognized during OCR processing.
|
|
179
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
180
|
+
pub struct OcrTable {
|
|
181
|
+
/// Table cells as a 2D vector (rows × columns)
|
|
182
|
+
pub cells: Vec<Vec<String>>,
|
|
183
|
+
/// Markdown representation of the table
|
|
184
|
+
pub markdown: String,
|
|
185
|
+
/// Page number where the table was found (1-indexed)
|
|
186
|
+
pub page_number: usize,
|
|
187
|
+
}
|
|
188
|
+
|
|
189
|
+
/// Image preprocessing configuration for OCR.
|
|
190
|
+
///
|
|
191
|
+
/// These settings control how images are preprocessed before OCR to improve
|
|
192
|
+
/// text recognition quality. Different preprocessing strategies work better
|
|
193
|
+
/// for different document types.
|
|
194
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
195
|
+
#[serde(default)]
|
|
196
|
+
pub struct ImagePreprocessingConfig {
|
|
197
|
+
/// Target DPI for the image (300 is standard, 600 for small text).
|
|
198
|
+
pub target_dpi: i32,
|
|
199
|
+
|
|
200
|
+
/// Auto-detect and correct image rotation.
|
|
201
|
+
pub auto_rotate: bool,
|
|
202
|
+
|
|
203
|
+
/// Correct skew (tilted images).
|
|
204
|
+
pub deskew: bool,
|
|
205
|
+
|
|
206
|
+
/// Remove noise from the image.
|
|
207
|
+
pub denoise: bool,
|
|
208
|
+
|
|
209
|
+
/// Enhance contrast for better text visibility.
|
|
210
|
+
pub contrast_enhance: bool,
|
|
211
|
+
|
|
212
|
+
/// Binarization method: "otsu", "sauvola", "adaptive".
|
|
213
|
+
pub binarization_method: String,
|
|
214
|
+
|
|
215
|
+
/// Invert colors (white text on black → black on white).
|
|
216
|
+
pub invert_colors: bool,
|
|
217
|
+
}
|
|
218
|
+
|
|
219
|
+
impl Default for ImagePreprocessingConfig {
|
|
220
|
+
fn default() -> Self {
|
|
221
|
+
Self {
|
|
222
|
+
target_dpi: 300,
|
|
223
|
+
auto_rotate: true,
|
|
224
|
+
deskew: true,
|
|
225
|
+
denoise: false,
|
|
226
|
+
contrast_enhance: false,
|
|
227
|
+
binarization_method: "otsu".to_string(),
|
|
228
|
+
invert_colors: false,
|
|
229
|
+
}
|
|
230
|
+
}
|
|
231
|
+
}
|
|
232
|
+
|
|
233
|
+
/// Tesseract OCR configuration.
|
|
234
|
+
///
|
|
235
|
+
/// Provides fine-grained control over Tesseract OCR engine parameters.
|
|
236
|
+
/// Most users can use the defaults, but these settings allow optimization
|
|
237
|
+
/// for specific document types (invoices, handwriting, etc.).
|
|
238
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
239
|
+
#[serde(default)]
|
|
240
|
+
pub struct TesseractConfig {
|
|
241
|
+
/// Language code (e.g., "eng", "deu", "fra")
|
|
242
|
+
pub language: String,
|
|
243
|
+
|
|
244
|
+
/// Page Segmentation Mode (0-13).
|
|
245
|
+
///
|
|
246
|
+
/// Common values:
|
|
247
|
+
/// - 3: Fully automatic page segmentation (default)
|
|
248
|
+
/// - 6: Assume a single uniform block of text
|
|
249
|
+
/// - 11: Sparse text with no particular order
|
|
250
|
+
pub psm: i32,
|
|
251
|
+
|
|
252
|
+
/// Output format ("text" or "markdown")
|
|
253
|
+
pub output_format: String,
|
|
254
|
+
|
|
255
|
+
/// OCR Engine Mode (0-3).
|
|
256
|
+
///
|
|
257
|
+
/// - 0: Legacy engine only
|
|
258
|
+
/// - 1: Neural nets (LSTM) only (usually best)
|
|
259
|
+
/// - 2: Legacy + LSTM
|
|
260
|
+
/// - 3: Default (based on what's available)
|
|
261
|
+
pub oem: i32,
|
|
262
|
+
|
|
263
|
+
/// Minimum confidence threshold (0.0-100.0).
|
|
264
|
+
///
|
|
265
|
+
/// Words with confidence below this threshold may be rejected or flagged.
|
|
266
|
+
pub min_confidence: f64,
|
|
267
|
+
|
|
268
|
+
/// Image preprocessing configuration.
|
|
269
|
+
///
|
|
270
|
+
/// Controls how images are preprocessed before OCR. Can significantly
|
|
271
|
+
/// improve quality for scanned documents or low-quality images.
|
|
272
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
273
|
+
pub preprocessing: Option<ImagePreprocessingConfig>,
|
|
274
|
+
|
|
275
|
+
/// Enable automatic table detection and reconstruction
|
|
276
|
+
pub enable_table_detection: bool,
|
|
277
|
+
|
|
278
|
+
/// Minimum confidence threshold for table detection (0.0-1.0)
|
|
279
|
+
pub table_min_confidence: f64,
|
|
280
|
+
|
|
281
|
+
/// Column threshold for table detection (pixels)
|
|
282
|
+
pub table_column_threshold: i32,
|
|
283
|
+
|
|
284
|
+
/// Row threshold ratio for table detection (0.0-1.0)
|
|
285
|
+
pub table_row_threshold_ratio: f64,
|
|
286
|
+
|
|
287
|
+
/// Enable OCR result caching
|
|
288
|
+
pub use_cache: bool,
|
|
289
|
+
|
|
290
|
+
/// Use pre-adapted templates for character classification
|
|
291
|
+
pub classify_use_pre_adapted_templates: bool,
|
|
292
|
+
|
|
293
|
+
/// Enable N-gram language model
|
|
294
|
+
pub language_model_ngram_on: bool,
|
|
295
|
+
|
|
296
|
+
/// Don't reject good words during block-level processing
|
|
297
|
+
pub tessedit_dont_blkrej_good_wds: bool,
|
|
298
|
+
|
|
299
|
+
/// Don't reject good words during row-level processing
|
|
300
|
+
pub tessedit_dont_rowrej_good_wds: bool,
|
|
301
|
+
|
|
302
|
+
/// Enable dictionary correction
|
|
303
|
+
pub tessedit_enable_dict_correction: bool,
|
|
304
|
+
|
|
305
|
+
/// Whitelist of allowed characters (empty = all allowed)
|
|
306
|
+
pub tessedit_char_whitelist: String,
|
|
307
|
+
|
|
308
|
+
/// Blacklist of forbidden characters (empty = none forbidden)
|
|
309
|
+
pub tessedit_char_blacklist: String,
|
|
310
|
+
|
|
311
|
+
/// Use primary language params model
|
|
312
|
+
pub tessedit_use_primary_params_model: bool,
|
|
313
|
+
|
|
314
|
+
/// Variable-width space detection
|
|
315
|
+
pub textord_space_size_is_variable: bool,
|
|
316
|
+
|
|
317
|
+
/// Use adaptive thresholding method
|
|
318
|
+
pub thresholding_method: bool,
|
|
319
|
+
}
|
|
320
|
+
|
|
321
|
+
impl Default for TesseractConfig {
|
|
322
|
+
fn default() -> Self {
|
|
323
|
+
Self {
|
|
324
|
+
language: "eng".to_string(),
|
|
325
|
+
psm: 3,
|
|
326
|
+
output_format: "markdown".to_string(),
|
|
327
|
+
oem: 3,
|
|
328
|
+
min_confidence: 0.0,
|
|
329
|
+
preprocessing: None,
|
|
330
|
+
enable_table_detection: true,
|
|
331
|
+
table_min_confidence: 0.0,
|
|
332
|
+
table_column_threshold: 50,
|
|
333
|
+
table_row_threshold_ratio: 0.5,
|
|
334
|
+
use_cache: true,
|
|
335
|
+
classify_use_pre_adapted_templates: true,
|
|
336
|
+
language_model_ngram_on: false,
|
|
337
|
+
tessedit_dont_blkrej_good_wds: true,
|
|
338
|
+
tessedit_dont_rowrej_good_wds: true,
|
|
339
|
+
tessedit_enable_dict_correction: true,
|
|
340
|
+
tessedit_char_whitelist: String::new(),
|
|
341
|
+
tessedit_char_blacklist: String::new(),
|
|
342
|
+
tessedit_use_primary_params_model: true,
|
|
343
|
+
textord_space_size_is_variable: true,
|
|
344
|
+
thresholding_method: false,
|
|
345
|
+
}
|
|
346
|
+
}
|
|
347
|
+
}
|
|
348
|
+
|
|
349
|
+
/// Image preprocessing metadata.
///
/// Tracks the transformations applied to an image during OCR preprocessing,
/// including DPI normalization, resizing, and resampling.
///
/// The type derives `Serialize`/`Deserialize`, so renaming or reordering
/// fields changes what serde reads and writes for this struct.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ImagePreprocessingMetadata {
    /// Original image dimensions as `(width, height)`, in pixels.
    pub original_dimensions: (usize, usize),

    /// Original image DPI as `(horizontal, vertical)`.
    pub original_dpi: (f64, f64),

    /// Target DPI taken from the configuration.
    pub target_dpi: i32,

    /// Scaling factor applied to the image.
    pub scale_factor: f64,

    /// Whether the DPI was auto-adjusted based on content.
    pub auto_adjusted: bool,

    /// Final DPI after processing.
    pub final_dpi: i32,

    /// New dimensions after resizing; only populated if the image was resized.
    pub new_dimensions: Option<(usize, usize)>,

    /// Name of the resampling algorithm used (e.g. `"LANCZOS3"`, `"CATMULLROM"`).
    pub resample_method: String,

    /// Whether the dimensions were clamped to `max_image_dimension`.
    pub dimension_clamped: bool,

    /// Calculated optimal DPI; only populated if `auto_adjust_dpi` is enabled.
    pub calculated_dpi: Option<i32>,

    /// Whether the resize step was skipped because the dimensions were
    /// already optimal.
    pub skipped_resize: bool,

    /// Error message if the resize failed.
    pub resize_error: Option<String>,
}
|
|
380
|
+
|
|
381
|
+
/// Image extraction configuration (internal use).
///
/// **Note:** This is an internal type used for image preprocessing.
/// For the main extraction configuration, see [`crate::core::config::ExtractionConfig`].
///
/// All numeric fields are `i32`. The `Default` implementation supplies
/// `target_dpi = 300`, `max_image_dimension = 4096`,
/// `auto_adjust_dpi = true`, `min_dpi = 72` and `max_dpi = 600`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ExtractionConfig {
    /// Target DPI for image normalization (default: `300`).
    pub target_dpi: i32,

    /// Maximum image dimension, applied to width or height (default: `4096`).
    // NOTE(review): presumably measured in pixels — confirm against the resize code.
    pub max_image_dimension: i32,

    /// Whether to auto-adjust DPI based on content (default: `true`).
    pub auto_adjust_dpi: bool,

    /// Minimum DPI threshold (default: `72`).
    pub min_dpi: i32,

    /// Maximum DPI threshold (default: `600`).
    pub max_dpi: i32,
}
|
|
398
|
+
|
|
399
|
+
impl Default for ExtractionConfig {
|
|
400
|
+
fn default() -> Self {
|
|
401
|
+
Self {
|
|
402
|
+
target_dpi: 300,
|
|
403
|
+
max_image_dimension: 4096,
|
|
404
|
+
auto_adjust_dpi: true,
|
|
405
|
+
min_dpi: 72,
|
|
406
|
+
max_dpi: 600,
|
|
407
|
+
}
|
|
408
|
+
}
|
|
409
|
+
}
|
|
410
|
+
|
|
411
|
+
/// Cache statistics.
///
/// Provides information about the extraction result cache,
/// including size, file count, and age distribution.
///
/// This is a plain serde-serializable snapshot; it carries no behaviour of
/// its own.
// NOTE(review): the sizes are documented as "megabytes"; whether that means
// 10^6 or 2^20 bytes is decided by the code that fills this struct — confirm.
// NOTE(review): the values reported for an empty cache (e.g. the two age
// fields) are not visible from this definition — confirm against the producer.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CacheStats {
    /// Total number of cached files.
    pub total_files: usize,

    /// Total cache size, in megabytes.
    pub total_size_mb: f64,

    /// Available disk space, in megabytes.
    pub available_space_mb: f64,

    /// Age of the oldest cached file, in days.
    pub oldest_file_age_days: f64,

    /// Age of the newest cached file, in days.
    pub newest_file_age_days: f64,
}
|
|
428
|
+
|
|
429
|
+
/// LibreOffice conversion result.
///
/// Result of converting a legacy office document (e.g., .doc, .ppt)
/// to a modern format using LibreOffice.
///
/// The converted document is held fully in memory in `converted_bytes`;
/// the remaining fields describe what was converted and into what.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LibreOfficeConversionResult {
    /// Raw bytes of the converted file.
    pub converted_bytes: Vec<u8>,

    /// Identifier of the original format.
    // NOTE(review): presumably a file-extension-like string such as "doc" — confirm with the producer.
    pub original_format: String,

    /// Identifier of the target format.
    pub target_format: String,

    /// MIME type of the converted output.
    pub target_mime: String,
}
|