kreuzberg 4.0.8 → 4.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
- data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
- data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
- data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
- data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
- data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
- data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
- data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
- data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
- data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
- data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
- data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
- data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
- data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
- data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
- data/ext/kreuzberg_rb/native/src/result.rs +326 -0
- data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
- data/lib/kreuzberg/config.rb +66 -0
- data/lib/kreuzberg/result.rb +107 -2
- data/lib/kreuzberg/types.rb +104 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +0 -4
- data/sig/kreuzberg.rbs +105 -1
- data/vendor/Cargo.toml +3 -3
- data/vendor/kreuzberg/Cargo.toml +4 -3
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/api/config.rs +69 -0
- data/vendor/kreuzberg/src/api/handlers.rs +99 -2
- data/vendor/kreuzberg/src/api/mod.rs +14 -7
- data/vendor/kreuzberg/src/api/router.rs +214 -0
- data/vendor/kreuzberg/src/api/startup.rs +243 -0
- data/vendor/kreuzberg/src/api/types.rs +78 -0
- data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
- data/vendor/kreuzberg/src/cache/core.rs +428 -0
- data/vendor/kreuzberg/src/cache/mod.rs +21 -843
- data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
- data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
- data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
- data/vendor/kreuzberg/src/chunking/config.rs +52 -0
- data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
- data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
- data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
- data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
- data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
- data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
- data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
- data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
- data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
- data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
- data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
- data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
- data/vendor/kreuzberg/src/core/config/page.rs +57 -0
- data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
- data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
- data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
- data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
- data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
- data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
- data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
- data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
- data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
- data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
- data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
- data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
- data/vendor/kreuzberg/src/core/mod.rs +4 -1
- data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
- data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
- data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
- data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
- data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
- data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
- data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
- data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
- data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
- data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
- data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
- data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
- data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
- data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
- data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
- data/vendor/kreuzberg/src/embeddings.rs +136 -13
- data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
- data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
- data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
- data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
- data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
- data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
- data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
- data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
- data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
- data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
- data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
- data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
- data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
- data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
- data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
- data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
- data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
- data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
- data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
- data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
- data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
- data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
- data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
- data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
- data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
- data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
- data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
- data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
- data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
- data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
- data/vendor/kreuzberg/src/extractors/email.rs +2 -0
- data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
- data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
- data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
- data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
- data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
- data/vendor/kreuzberg/src/extractors/html.rs +80 -8
- data/vendor/kreuzberg/src/extractors/image.rs +8 -1
- data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
- data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
- data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
- data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
- data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
- data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
- data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
- data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
- data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
- data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
- data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
- data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
- data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
- data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
- data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
- data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
- data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
- data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
- data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
- data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
- data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
- data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
- data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
- data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
- data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
- data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
- data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
- data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
- data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
- data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
- data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
- data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
- data/vendor/kreuzberg/src/extractors/text.rs +4 -0
- data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
- data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
- data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
- data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
- data/vendor/kreuzberg/src/lib.rs +2 -2
- data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
- data/vendor/kreuzberg/src/mcp/format.rs +211 -0
- data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
- data/vendor/kreuzberg/src/mcp/params.rs +196 -0
- data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
- data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
- data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
- data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
- data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
- data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
- data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
- data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
- data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
- data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
- data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
- data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
- data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
- data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
- data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
- data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
- data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
- data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
- data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
- data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
- data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
- data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
- data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
- data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
- data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
- data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
- data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
- data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
- data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
- data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
- data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
- data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
- data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
- data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
- data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
- data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
- data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
- data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
- data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
- data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
- data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
- data/vendor/kreuzberg/src/text/quality.rs +1 -1
- data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
- data/vendor/kreuzberg/src/types/djot.rs +209 -0
- data/vendor/kreuzberg/src/types/extraction.rs +301 -0
- data/vendor/kreuzberg/src/types/formats.rs +443 -0
- data/vendor/kreuzberg/src/types/metadata.rs +560 -0
- data/vendor/kreuzberg/src/types/mod.rs +281 -0
- data/vendor/kreuzberg/src/types/page.rs +182 -0
- data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
- data/vendor/kreuzberg/src/types/tables.rs +39 -0
- data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
- data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
- data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
- data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
- data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
- data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
- data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
- data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
- data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
- data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
- data/vendor/kreuzberg/tests/api_embed.rs +6 -9
- data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
- data/vendor/kreuzberg/tests/core_integration.rs +1 -0
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
- data/vendor/kreuzberg/tests/format_integration.rs +2 -0
- data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
- data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
- data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
- data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
- data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
- data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
- data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
- data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
- data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
- data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
- data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
- data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
- data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
- data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
- data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
- data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
- data/vendor/kreuzberg-ffi/src/error.rs +46 -14
- data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
- data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
- data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
- data/vendor/kreuzberg-ffi/src/result.rs +148 -122
- data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
- data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
- metadata +200 -28
- data/vendor/kreuzberg/src/api/server.rs +0 -518
- data/vendor/kreuzberg/src/core/config.rs +0 -1914
- data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
- data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
- data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
- data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
- data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
- data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
- data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
- data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
- data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
- data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
- data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
- data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
- data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
- data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
- data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
- data/vendor/kreuzberg/src/types.rs +0 -1713
- data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
- data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
|
@@ -1,173 +1,21 @@
|
|
|
1
|
-
//! PDF text hierarchy extraction
|
|
1
|
+
//! PDF text hierarchy extraction and text block analysis.
|
|
2
2
|
//!
|
|
3
3
|
//! This module provides functions for extracting character information from PDFs,
|
|
4
|
-
//!
|
|
5
|
-
//!
|
|
6
|
-
//! Note: Requires the "pdf" feature to be enabled.
|
|
4
|
+
//! merging characters into text blocks, and assigning hierarchy levels based on
|
|
5
|
+
//! font size analysis.
|
|
7
6
|
|
|
8
|
-
use super::
|
|
7
|
+
use super::bounding_box::BoundingBox;
|
|
8
|
+
use super::clustering::FontSizeCluster;
|
|
9
9
|
use crate::core::config::ExtractionConfig;
|
|
10
|
+
use crate::pdf::error::{PdfError, Result};
|
|
10
11
|
use pdfium_render::prelude::*;
|
|
11
12
|
|
|
12
13
|
// Magic number constants
|
|
13
14
|
const DEFAULT_FONT_SIZE: f32 = 12.0;
|
|
14
|
-
const WEIGHTED_DISTANCE_X_WEIGHT: f32 = 5.0;
|
|
15
|
-
const WEIGHTED_DISTANCE_Y_WEIGHT: f32 = 1.0;
|
|
16
|
-
const KMEANS_MAX_ITERATIONS: usize = 100;
|
|
17
|
-
const KMEANS_CONVERGENCE_THRESHOLD: f32 = 0.01;
|
|
18
15
|
const MERGE_INTERSECTION_THRESHOLD: f32 = 0.05;
|
|
19
16
|
const MERGE_X_THRESHOLD_MULTIPLIER: f32 = 2.0;
|
|
20
17
|
const MERGE_Y_THRESHOLD_MULTIPLIER: f32 = 1.5;
|
|
21
18
|
|
|
22
|
-
/// A bounding box for text or elements.
|
|
23
|
-
#[derive(Debug, Clone, Copy, PartialEq)]
|
|
24
|
-
pub struct BoundingBox {
|
|
25
|
-
/// Left x-coordinate
|
|
26
|
-
pub left: f32,
|
|
27
|
-
/// Top y-coordinate
|
|
28
|
-
pub top: f32,
|
|
29
|
-
/// Right x-coordinate
|
|
30
|
-
pub right: f32,
|
|
31
|
-
/// Bottom y-coordinate
|
|
32
|
-
pub bottom: f32,
|
|
33
|
-
}
|
|
34
|
-
|
|
35
|
-
impl BoundingBox {
|
|
36
|
-
/// Calculate the Intersection over Union (IOU) between this bounding box and another.
|
|
37
|
-
///
|
|
38
|
-
/// IOU = intersection_area / union_area
|
|
39
|
-
///
|
|
40
|
-
/// # Arguments
|
|
41
|
-
///
|
|
42
|
-
/// * `other` - The other bounding box to compare with
|
|
43
|
-
///
|
|
44
|
-
/// # Returns
|
|
45
|
-
///
|
|
46
|
-
/// The IOU value between 0.0 and 1.0
|
|
47
|
-
pub fn iou(&self, other: &BoundingBox) -> f32 {
|
|
48
|
-
let intersection_area = self.calculate_intersection_area(other);
|
|
49
|
-
let self_area = self.calculate_area();
|
|
50
|
-
let other_area = other.calculate_area();
|
|
51
|
-
let union_area = self_area + other_area - intersection_area;
|
|
52
|
-
|
|
53
|
-
if union_area <= 0.0 {
|
|
54
|
-
0.0
|
|
55
|
-
} else {
|
|
56
|
-
intersection_area / union_area
|
|
57
|
-
}
|
|
58
|
-
}
|
|
59
|
-
|
|
60
|
-
/// Calculate the weighted distance between the centers of two bounding boxes.
|
|
61
|
-
///
|
|
62
|
-
/// The distance is weighted with X-axis having weight 5.0 and Y-axis having weight 1.0.
|
|
63
|
-
/// This reflects the greater importance of horizontal distance in text layout.
|
|
64
|
-
///
|
|
65
|
-
/// # Arguments
|
|
66
|
-
///
|
|
67
|
-
/// * `other` - The other bounding box to compare with
|
|
68
|
-
///
|
|
69
|
-
/// # Returns
|
|
70
|
-
///
|
|
71
|
-
/// The weighted distance value
|
|
72
|
-
pub fn weighted_distance(&self, other: &BoundingBox) -> f32 {
|
|
73
|
-
let (self_center_x, self_center_y) = self.center();
|
|
74
|
-
let (other_center_x, other_center_y) = other.center();
|
|
75
|
-
|
|
76
|
-
let dx = (self_center_x - other_center_x).abs();
|
|
77
|
-
let dy = (self_center_y - other_center_y).abs();
|
|
78
|
-
|
|
79
|
-
dx * WEIGHTED_DISTANCE_X_WEIGHT + dy * WEIGHTED_DISTANCE_Y_WEIGHT
|
|
80
|
-
}
|
|
81
|
-
|
|
82
|
-
/// Calculate the intersection ratio relative to this bounding box's area.
|
|
83
|
-
///
|
|
84
|
-
/// intersection_ratio = intersection_area / self_area
|
|
85
|
-
///
|
|
86
|
-
/// # Arguments
|
|
87
|
-
///
|
|
88
|
-
/// * `other` - The other bounding box to compare with
|
|
89
|
-
///
|
|
90
|
-
/// # Returns
|
|
91
|
-
///
|
|
92
|
-
/// The intersection ratio between 0.0 and 1.0
|
|
93
|
-
pub fn intersection_ratio(&self, other: &BoundingBox) -> f32 {
|
|
94
|
-
let intersection_area = self.calculate_intersection_area(other);
|
|
95
|
-
let self_area = self.calculate_area();
|
|
96
|
-
|
|
97
|
-
if self_area <= 0.0 {
|
|
98
|
-
0.0
|
|
99
|
-
} else {
|
|
100
|
-
intersection_area / self_area
|
|
101
|
-
}
|
|
102
|
-
}
|
|
103
|
-
|
|
104
|
-
/// Check if this bounding box contains another bounding box.
|
|
105
|
-
pub fn contains(&self, other: &BoundingBox) -> bool {
|
|
106
|
-
other.left >= self.left && other.right <= self.right && other.top >= self.top && other.bottom <= self.bottom
|
|
107
|
-
}
|
|
108
|
-
|
|
109
|
-
/// Calculate the center coordinates of this bounding box.
|
|
110
|
-
pub fn center(&self) -> (f32, f32) {
|
|
111
|
-
((self.left + self.right) / 2.0, (self.top + self.bottom) / 2.0)
|
|
112
|
-
}
|
|
113
|
-
|
|
114
|
-
/// Merge this bounding box with another, creating a box that contains both.
|
|
115
|
-
pub fn merge(&self, other: &BoundingBox) -> BoundingBox {
|
|
116
|
-
BoundingBox {
|
|
117
|
-
left: self.left.min(other.left),
|
|
118
|
-
top: self.top.min(other.top),
|
|
119
|
-
right: self.right.max(other.right),
|
|
120
|
-
bottom: self.bottom.max(other.bottom),
|
|
121
|
-
}
|
|
122
|
-
}
|
|
123
|
-
|
|
124
|
-
/// Calculate a relaxed IOU with an expansion factor.
|
|
125
|
-
pub fn relaxed_iou(&self, other: &BoundingBox, relaxation: f32) -> f32 {
|
|
126
|
-
let self_width = self.right - self.left;
|
|
127
|
-
let self_height = self.bottom - self.top;
|
|
128
|
-
let self_expansion = relaxation * self_width.min(self_height).max(0.0);
|
|
129
|
-
|
|
130
|
-
let other_width = other.right - other.left;
|
|
131
|
-
let other_height = other.bottom - other.top;
|
|
132
|
-
let other_expansion = relaxation * other_width.min(other_height).max(0.0);
|
|
133
|
-
|
|
134
|
-
let expanded_self = BoundingBox {
|
|
135
|
-
left: (self.left - self_expansion).max(0.0),
|
|
136
|
-
top: (self.top - self_expansion).max(0.0),
|
|
137
|
-
right: self.right + self_expansion,
|
|
138
|
-
bottom: self.bottom + self_expansion,
|
|
139
|
-
};
|
|
140
|
-
|
|
141
|
-
let expanded_other = BoundingBox {
|
|
142
|
-
left: (other.left - other_expansion).max(0.0),
|
|
143
|
-
top: (other.top - other_expansion).max(0.0),
|
|
144
|
-
right: other.right + other_expansion,
|
|
145
|
-
bottom: other.bottom + other_expansion,
|
|
146
|
-
};
|
|
147
|
-
|
|
148
|
-
expanded_self.iou(&expanded_other)
|
|
149
|
-
}
|
|
150
|
-
|
|
151
|
-
/// Calculate the area of this bounding box.
|
|
152
|
-
fn calculate_area(&self) -> f32 {
|
|
153
|
-
let width = (self.right - self.left).max(0.0);
|
|
154
|
-
let height = (self.bottom - self.top).max(0.0);
|
|
155
|
-
width * height
|
|
156
|
-
}
|
|
157
|
-
|
|
158
|
-
/// Calculate the intersection area between this bounding box and another.
|
|
159
|
-
fn calculate_intersection_area(&self, other: &BoundingBox) -> f32 {
|
|
160
|
-
let left = self.left.max(other.left);
|
|
161
|
-
let top = self.top.max(other.top);
|
|
162
|
-
let right = self.right.min(other.right);
|
|
163
|
-
let bottom = self.bottom.min(other.bottom);
|
|
164
|
-
|
|
165
|
-
let width = (right - left).max(0.0);
|
|
166
|
-
let height = (bottom - top).max(0.0);
|
|
167
|
-
width * height
|
|
168
|
-
}
|
|
169
|
-
}
|
|
170
|
-
|
|
171
19
|
/// Character information extracted from PDF with font metrics.
|
|
172
20
|
#[derive(Debug, Clone)]
|
|
173
21
|
pub struct CharData {
|
|
@@ -196,15 +44,6 @@ pub struct TextBlock {
|
|
|
196
44
|
pub font_size: f32,
|
|
197
45
|
}
|
|
198
46
|
|
|
199
|
-
/// A cluster of text blocks with the same font size characteristics.
|
|
200
|
-
#[derive(Debug, Clone)]
|
|
201
|
-
pub struct FontSizeCluster {
|
|
202
|
-
/// The centroid (mean) font size of this cluster
|
|
203
|
-
pub centroid: f32,
|
|
204
|
-
/// The text blocks that belong to this cluster
|
|
205
|
-
pub members: Vec<TextBlock>,
|
|
206
|
-
}
|
|
207
|
-
|
|
208
47
|
/// Result of KMeans clustering on font sizes.
|
|
209
48
|
///
|
|
210
49
|
/// Contains cluster labels for each block, where cluster index indicates
|
|
@@ -401,185 +240,6 @@ pub fn assign_hierarchy_levels_from_clusters(
|
|
|
401
240
|
result
|
|
402
241
|
}
|
|
403
242
|
|
|
404
|
-
/// Cluster text blocks by font size using k-means algorithm.
|
|
405
|
-
///
|
|
406
|
-
/// Uses k-means clustering to group text blocks by their font size, which helps
|
|
407
|
-
/// identify document hierarchy levels (H1, H2, Body, etc.). The algorithm:
|
|
408
|
-
/// 1. Extracts font sizes from text blocks
|
|
409
|
-
/// 2. Applies k-means clustering to group similar font sizes
|
|
410
|
-
/// 3. Sorts clusters by centroid size in descending order (largest = H1)
|
|
411
|
-
/// 4. Returns clusters with their member blocks
|
|
412
|
-
///
|
|
413
|
-
/// # Arguments
|
|
414
|
-
///
|
|
415
|
-
/// * `blocks` - Slice of TextBlock objects to cluster
|
|
416
|
-
/// * `k` - Number of clusters to create
|
|
417
|
-
///
|
|
418
|
-
/// # Returns
|
|
419
|
-
///
|
|
420
|
-
/// Result with vector of FontSizeCluster ordered by size (descending),
|
|
421
|
-
/// or an error if clustering fails
|
|
422
|
-
///
|
|
423
|
-
/// # Example
|
|
424
|
-
///
|
|
425
|
-
/// ```rust,no_run
|
|
426
|
-
/// # #[cfg(feature = "pdf")]
|
|
427
|
-
/// # {
|
|
428
|
-
/// use kreuzberg::pdf::hierarchy::{TextBlock, BoundingBox, cluster_font_sizes};
|
|
429
|
-
///
|
|
430
|
-
/// let blocks = vec![
|
|
431
|
-
/// TextBlock {
|
|
432
|
-
/// text: "Title".to_string(),
|
|
433
|
-
/// bbox: BoundingBox { left: 0.0, top: 0.0, right: 100.0, bottom: 24.0 },
|
|
434
|
-
/// font_size: 24.0,
|
|
435
|
-
/// },
|
|
436
|
-
/// TextBlock {
|
|
437
|
-
/// text: "Body".to_string(),
|
|
438
|
-
/// bbox: BoundingBox { left: 0.0, top: 30.0, right: 100.0, bottom: 42.0 },
|
|
439
|
-
/// font_size: 12.0,
|
|
440
|
-
/// },
|
|
441
|
-
/// ];
|
|
442
|
-
///
|
|
443
|
-
/// let clusters = cluster_font_sizes(&blocks, 2).unwrap();
|
|
444
|
-
/// assert_eq!(clusters.len(), 2);
|
|
445
|
-
/// assert_eq!(clusters[0].centroid, 24.0); // Largest is first
|
|
446
|
-
/// # }
|
|
447
|
-
/// ```
|
|
448
|
-
/// Helper function to assign blocks to their nearest centroid.
|
|
449
|
-
///
|
|
450
|
-
/// Iterates through blocks and finds the closest centroid for each block,
|
|
451
|
-
/// grouping them into clusters. Used in k-means clustering iterations.
|
|
452
|
-
///
|
|
453
|
-
/// # Arguments
|
|
454
|
-
///
|
|
455
|
-
/// * `blocks` - Slice of TextBlock objects to assign
|
|
456
|
-
/// * `centroids` - Slice of centroid values (one per cluster)
|
|
457
|
-
///
|
|
458
|
-
/// # Returns
|
|
459
|
-
///
|
|
460
|
-
/// A vector of clusters, where each cluster contains the TextBlock objects
|
|
461
|
-
/// assigned to that centroid
|
|
462
|
-
fn assign_blocks_to_centroids(blocks: &[TextBlock], centroids: &[f32]) -> Vec<Vec<TextBlock>> {
|
|
463
|
-
let mut clusters: Vec<Vec<TextBlock>> = vec![Vec::new(); centroids.len()];
|
|
464
|
-
|
|
465
|
-
for block in blocks {
|
|
466
|
-
let mut min_distance = f32::INFINITY;
|
|
467
|
-
let mut best_cluster = 0;
|
|
468
|
-
|
|
469
|
-
for (i, &centroid) in centroids.iter().enumerate() {
|
|
470
|
-
let distance = (block.font_size - centroid).abs();
|
|
471
|
-
if distance < min_distance {
|
|
472
|
-
min_distance = distance;
|
|
473
|
-
best_cluster = i;
|
|
474
|
-
}
|
|
475
|
-
}
|
|
476
|
-
|
|
477
|
-
clusters[best_cluster].push(block.clone());
|
|
478
|
-
}
|
|
479
|
-
|
|
480
|
-
clusters
|
|
481
|
-
}
|
|
482
|
-
|
|
483
|
-
pub fn cluster_font_sizes(blocks: &[TextBlock], k: usize) -> Result<Vec<FontSizeCluster>> {
|
|
484
|
-
if blocks.is_empty() {
|
|
485
|
-
return Ok(Vec::new());
|
|
486
|
-
}
|
|
487
|
-
|
|
488
|
-
if k == 0 {
|
|
489
|
-
return Err(PdfError::TextExtractionFailed("K must be greater than 0".to_string()));
|
|
490
|
-
}
|
|
491
|
-
|
|
492
|
-
let actual_k = k.min(blocks.len());
|
|
493
|
-
|
|
494
|
-
// Extract unique font sizes for initialization
|
|
495
|
-
let mut font_sizes: Vec<f32> = blocks.iter().map(|b| b.font_size).collect();
|
|
496
|
-
font_sizes.sort_by(|a, b| b.partial_cmp(a).expect("Failed to compare font sizes during sorting")); // Sort descending
|
|
497
|
-
font_sizes.dedup(); // Remove duplicates to get unique font sizes
|
|
498
|
-
|
|
499
|
-
// Initialize centroids using actual font sizes from the data
|
|
500
|
-
// This is more robust than dividing the range uniformly
|
|
501
|
-
let mut centroids: Vec<f32> = Vec::new();
|
|
502
|
-
|
|
503
|
-
if font_sizes.len() >= actual_k {
|
|
504
|
-
// If we have at least k unique font sizes, pick them evenly spaced
|
|
505
|
-
let step = font_sizes.len() / actual_k;
|
|
506
|
-
for i in 0..actual_k {
|
|
507
|
-
let idx = i * step;
|
|
508
|
-
centroids.push(font_sizes[idx.min(font_sizes.len() - 1)]);
|
|
509
|
-
}
|
|
510
|
-
} else {
|
|
511
|
-
// If we have fewer unique sizes than k, use all of them and fill with interpolated values
|
|
512
|
-
centroids = font_sizes.clone();
|
|
513
|
-
|
|
514
|
-
// Add interpolated centroids between existing ones to reach desired k
|
|
515
|
-
let min_font = font_sizes[font_sizes.len() - 1];
|
|
516
|
-
let max_font = font_sizes[0];
|
|
517
|
-
let range = max_font - min_font;
|
|
518
|
-
|
|
519
|
-
while centroids.len() < actual_k {
|
|
520
|
-
let t = centroids.len() as f32 / (actual_k - 1) as f32;
|
|
521
|
-
let interpolated = max_font - t * range;
|
|
522
|
-
centroids.push(interpolated);
|
|
523
|
-
}
|
|
524
|
-
|
|
525
|
-
centroids.sort_by(|a, b| b.partial_cmp(a).expect("Failed to compare centroids during sorting"));
|
|
526
|
-
// Keep sorted descending
|
|
527
|
-
}
|
|
528
|
-
|
|
529
|
-
// Run k-means clustering for a fixed number of iterations
|
|
530
|
-
for _ in 0..KMEANS_MAX_ITERATIONS {
|
|
531
|
-
// Assign blocks to nearest centroid
|
|
532
|
-
let clusters = assign_blocks_to_centroids(blocks, &centroids);
|
|
533
|
-
|
|
534
|
-
// Update centroids
|
|
535
|
-
let mut new_centroids = Vec::with_capacity(actual_k);
|
|
536
|
-
for (i, cluster) in clusters.iter().enumerate() {
|
|
537
|
-
if !cluster.is_empty() {
|
|
538
|
-
new_centroids.push(cluster.iter().map(|b| b.font_size).sum::<f32>() / cluster.len() as f32);
|
|
539
|
-
} else {
|
|
540
|
-
new_centroids.push(centroids[i]);
|
|
541
|
-
}
|
|
542
|
-
}
|
|
543
|
-
|
|
544
|
-
// Check for convergence
|
|
545
|
-
let converged = centroids
|
|
546
|
-
.iter()
|
|
547
|
-
.zip(new_centroids.iter())
|
|
548
|
-
.all(|(old, new)| (old - new).abs() < KMEANS_CONVERGENCE_THRESHOLD);
|
|
549
|
-
|
|
550
|
-
std::mem::swap(&mut centroids, &mut new_centroids);
|
|
551
|
-
|
|
552
|
-
if converged {
|
|
553
|
-
break;
|
|
554
|
-
}
|
|
555
|
-
}
|
|
556
|
-
|
|
557
|
-
// Final assignment to create result
|
|
558
|
-
let clusters = assign_blocks_to_centroids(blocks, &centroids);
|
|
559
|
-
|
|
560
|
-
// Create FontSizeCluster objects with centroids
|
|
561
|
-
let mut result: Vec<FontSizeCluster> = Vec::new();
|
|
562
|
-
|
|
563
|
-
for i in 0..actual_k {
|
|
564
|
-
if !clusters[i].is_empty() {
|
|
565
|
-
let centroid_value = centroids[i];
|
|
566
|
-
result.push(FontSizeCluster {
|
|
567
|
-
centroid: centroid_value,
|
|
568
|
-
members: clusters[i].clone(),
|
|
569
|
-
});
|
|
570
|
-
}
|
|
571
|
-
}
|
|
572
|
-
|
|
573
|
-
// Sort by centroid size in descending order (largest font = H1)
|
|
574
|
-
result.sort_by(|a, b| {
|
|
575
|
-
b.centroid
|
|
576
|
-
.partial_cmp(&a.centroid)
|
|
577
|
-
.expect("Failed to compare centroids during final sort")
|
|
578
|
-
});
|
|
579
|
-
|
|
580
|
-
Ok(result)
|
|
581
|
-
}
|
|
582
|
-
|
|
583
243
|
/// Extract characters with fonts from a PDF page.
|
|
584
244
|
///
|
|
585
245
|
/// Iterates through all characters on a page, extracting text, position,
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
//! PDF text hierarchy extraction using pdfium character positions.
|
|
2
|
+
//!
|
|
3
|
+
//! This module provides functions for extracting character information from PDFs,
|
|
4
|
+
//! preserving font size and position data for text hierarchy analysis.
|
|
5
|
+
//!
|
|
6
|
+
//! Note: Requires the "pdf" feature to be enabled.
|
|
7
|
+
|
|
8
|
+
mod bounding_box;
|
|
9
|
+
mod clustering;
|
|
10
|
+
mod extraction;
|
|
11
|
+
|
|
12
|
+
// Re-export all public types and functions for backward compatibility
|
|
13
|
+
pub use bounding_box::BoundingBox;
|
|
14
|
+
pub use clustering::{FontSizeCluster, cluster_font_sizes};
|
|
15
|
+
pub use extraction::{
|
|
16
|
+
CharData, HierarchyBlock, HierarchyLevel, KMeansResult, TextBlock, assign_hierarchy_levels,
|
|
17
|
+
assign_hierarchy_levels_from_clusters, extract_chars_with_fonts, merge_chars_into_blocks, should_trigger_ocr,
|
|
18
|
+
};
|