kreuzberg 4.0.8 → 4.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
- data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
- data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
- data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
- data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
- data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
- data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
- data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
- data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
- data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
- data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
- data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
- data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
- data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
- data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
- data/ext/kreuzberg_rb/native/src/result.rs +326 -0
- data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
- data/lib/kreuzberg/config.rb +66 -0
- data/lib/kreuzberg/result.rb +107 -2
- data/lib/kreuzberg/types.rb +104 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +0 -4
- data/sig/kreuzberg.rbs +105 -1
- data/vendor/Cargo.toml +3 -3
- data/vendor/kreuzberg/Cargo.toml +4 -3
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/api/config.rs +69 -0
- data/vendor/kreuzberg/src/api/handlers.rs +99 -2
- data/vendor/kreuzberg/src/api/mod.rs +14 -7
- data/vendor/kreuzberg/src/api/router.rs +214 -0
- data/vendor/kreuzberg/src/api/startup.rs +243 -0
- data/vendor/kreuzberg/src/api/types.rs +78 -0
- data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
- data/vendor/kreuzberg/src/cache/core.rs +428 -0
- data/vendor/kreuzberg/src/cache/mod.rs +21 -843
- data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
- data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
- data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
- data/vendor/kreuzberg/src/chunking/config.rs +52 -0
- data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
- data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
- data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
- data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
- data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
- data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
- data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
- data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
- data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
- data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
- data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
- data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
- data/vendor/kreuzberg/src/core/config/page.rs +57 -0
- data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
- data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
- data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
- data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
- data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
- data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
- data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
- data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
- data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
- data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
- data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
- data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
- data/vendor/kreuzberg/src/core/mod.rs +4 -1
- data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
- data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
- data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
- data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
- data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
- data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
- data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
- data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
- data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
- data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
- data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
- data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
- data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
- data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
- data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
- data/vendor/kreuzberg/src/embeddings.rs +136 -13
- data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
- data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
- data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
- data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
- data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
- data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
- data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
- data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
- data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
- data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
- data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
- data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
- data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
- data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
- data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
- data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
- data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
- data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
- data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
- data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
- data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
- data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
- data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
- data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
- data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
- data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
- data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
- data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
- data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
- data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
- data/vendor/kreuzberg/src/extractors/email.rs +2 -0
- data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
- data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
- data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
- data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
- data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
- data/vendor/kreuzberg/src/extractors/html.rs +80 -8
- data/vendor/kreuzberg/src/extractors/image.rs +8 -1
- data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
- data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
- data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
- data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
- data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
- data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
- data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
- data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
- data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
- data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
- data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
- data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
- data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
- data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
- data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
- data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
- data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
- data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
- data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
- data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
- data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
- data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
- data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
- data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
- data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
- data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
- data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
- data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
- data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
- data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
- data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
- data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
- data/vendor/kreuzberg/src/extractors/text.rs +4 -0
- data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
- data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
- data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
- data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
- data/vendor/kreuzberg/src/lib.rs +2 -2
- data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
- data/vendor/kreuzberg/src/mcp/format.rs +211 -0
- data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
- data/vendor/kreuzberg/src/mcp/params.rs +196 -0
- data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
- data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
- data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
- data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
- data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
- data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
- data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
- data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
- data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
- data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
- data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
- data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
- data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
- data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
- data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
- data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
- data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
- data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
- data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
- data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
- data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
- data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
- data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
- data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
- data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
- data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
- data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
- data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
- data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
- data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
- data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
- data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
- data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
- data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
- data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
- data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
- data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
- data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
- data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
- data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
- data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
- data/vendor/kreuzberg/src/text/quality.rs +1 -1
- data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
- data/vendor/kreuzberg/src/types/djot.rs +209 -0
- data/vendor/kreuzberg/src/types/extraction.rs +301 -0
- data/vendor/kreuzberg/src/types/formats.rs +443 -0
- data/vendor/kreuzberg/src/types/metadata.rs +560 -0
- data/vendor/kreuzberg/src/types/mod.rs +281 -0
- data/vendor/kreuzberg/src/types/page.rs +182 -0
- data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
- data/vendor/kreuzberg/src/types/tables.rs +39 -0
- data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
- data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
- data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
- data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
- data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
- data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
- data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
- data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
- data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
- data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
- data/vendor/kreuzberg/tests/api_embed.rs +6 -9
- data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
- data/vendor/kreuzberg/tests/core_integration.rs +1 -0
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
- data/vendor/kreuzberg/tests/format_integration.rs +2 -0
- data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
- data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
- data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
- data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
- data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
- data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
- data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
- data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
- data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
- data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
- data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
- data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
- data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
- data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
- data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
- data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
- data/vendor/kreuzberg-ffi/src/error.rs +46 -14
- data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
- data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
- data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
- data/vendor/kreuzberg-ffi/src/result.rs +148 -122
- data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
- data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
- metadata +200 -28
- data/vendor/kreuzberg/src/api/server.rs +0 -518
- data/vendor/kreuzberg/src/core/config.rs +0 -1914
- data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
- data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
- data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
- data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
- data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
- data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
- data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
- data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
- data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
- data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
- data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
- data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
- data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
- data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
- data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
- data/vendor/kreuzberg/src/types.rs +0 -1713
- data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
- data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
|
@@ -0,0 +1,281 @@
|
|
|
1
|
+
//! Core types for document extraction.
|
|
2
|
+
|
|
3
|
+
// Module declarations
|
|
4
|
+
pub mod djot;
|
|
5
|
+
pub mod extraction;
|
|
6
|
+
pub mod formats;
|
|
7
|
+
pub mod metadata;
|
|
8
|
+
pub mod page;
|
|
9
|
+
pub mod serde_helpers;
|
|
10
|
+
pub mod tables;
|
|
11
|
+
|
|
12
|
+
// Re-export all types for backward compatibility
|
|
13
|
+
pub use djot::*;
|
|
14
|
+
pub use extraction::*;
|
|
15
|
+
pub use formats::*;
|
|
16
|
+
pub use metadata::*;
|
|
17
|
+
pub use page::*;
|
|
18
|
+
pub use tables::*;
|
|
19
|
+
|
|
20
|
+
#[cfg(test)]
|
|
21
|
+
mod tests {
|
|
22
|
+
use super::*;
|
|
23
|
+
use std::sync::Arc;
|
|
24
|
+
|
|
25
|
+
#[test]
|
|
26
|
+
fn test_metadata_serialization_with_format() {
|
|
27
|
+
let mut metadata = Metadata {
|
|
28
|
+
format: Some(FormatMetadata::Text(TextMetadata {
|
|
29
|
+
line_count: 1,
|
|
30
|
+
word_count: 2,
|
|
31
|
+
character_count: 13,
|
|
32
|
+
headers: None,
|
|
33
|
+
links: None,
|
|
34
|
+
code_blocks: None,
|
|
35
|
+
})),
|
|
36
|
+
..Default::default()
|
|
37
|
+
};
|
|
38
|
+
|
|
39
|
+
metadata
|
|
40
|
+
.additional
|
|
41
|
+
.insert("quality_score".to_string(), serde_json::json!(1.0));
|
|
42
|
+
|
|
43
|
+
let json = serde_json::to_value(&metadata).unwrap();
|
|
44
|
+
println!("Serialized metadata: {}", serde_json::to_string_pretty(&json).unwrap());
|
|
45
|
+
|
|
46
|
+
assert!(
|
|
47
|
+
json.get("format_type").is_some(),
|
|
48
|
+
"format_type should be present in serialized JSON"
|
|
49
|
+
);
|
|
50
|
+
assert_eq!(json.get("format_type").unwrap(), "text");
|
|
51
|
+
|
|
52
|
+
assert_eq!(json.get("line_count").unwrap(), 1);
|
|
53
|
+
assert_eq!(json.get("word_count").unwrap(), 2);
|
|
54
|
+
assert_eq!(json.get("character_count").unwrap(), 13);
|
|
55
|
+
|
|
56
|
+
assert_eq!(json.get("quality_score").unwrap(), 1.0);
|
|
57
|
+
}
|
|
58
|
+
|
|
59
|
+
#[test]
|
|
60
|
+
fn test_arc_table_serialization_format() {
|
|
61
|
+
let table = Table {
|
|
62
|
+
cells: vec![vec!["A".to_string(), "B".to_string()]],
|
|
63
|
+
markdown: "| A | B |\n|---|---|\n".to_string(),
|
|
64
|
+
page_number: 1,
|
|
65
|
+
};
|
|
66
|
+
|
|
67
|
+
let json = serde_json::to_value(&table).unwrap();
|
|
68
|
+
|
|
69
|
+
assert_eq!(json.get("cells").unwrap()[0][0], "A");
|
|
70
|
+
assert_eq!(json.get("markdown").unwrap(), "| A | B |\n|---|---|\n");
|
|
71
|
+
assert_eq!(json.get("page_number").unwrap(), 1);
|
|
72
|
+
}
|
|
73
|
+
|
|
74
|
+
#[test]
|
|
75
|
+
fn test_arc_table_roundtrip() {
|
|
76
|
+
let original = Table {
|
|
77
|
+
cells: vec![
|
|
78
|
+
vec!["X".to_string(), "Y".to_string()],
|
|
79
|
+
vec!["1".to_string(), "2".to_string()],
|
|
80
|
+
],
|
|
81
|
+
markdown: "| X | Y |\n|---|---|\n| 1 | 2 |\n".to_string(),
|
|
82
|
+
page_number: 5,
|
|
83
|
+
};
|
|
84
|
+
|
|
85
|
+
let json = serde_json::to_string(&original).unwrap();
|
|
86
|
+
let deserialized: Table = serde_json::from_str(&json).unwrap();
|
|
87
|
+
|
|
88
|
+
assert_eq!(deserialized.cells, original.cells);
|
|
89
|
+
assert_eq!(deserialized.markdown, original.markdown);
|
|
90
|
+
assert_eq!(deserialized.page_number, original.page_number);
|
|
91
|
+
}
|
|
92
|
+
|
|
93
|
+
#[test]
|
|
94
|
+
fn test_arc_sharing_preserved_before_serialization() {
|
|
95
|
+
let shared_table = Arc::new(Table {
|
|
96
|
+
cells: vec![vec!["shared".to_string()]],
|
|
97
|
+
markdown: "| shared |".to_string(),
|
|
98
|
+
page_number: 1,
|
|
99
|
+
});
|
|
100
|
+
|
|
101
|
+
let tables_before = [Arc::clone(&shared_table), Arc::clone(&shared_table)].to_vec();
|
|
102
|
+
assert_eq!(Arc::strong_count(&tables_before[0]), 3);
|
|
103
|
+
assert_eq!(Arc::strong_count(&tables_before[1]), 3);
|
|
104
|
+
assert!(Arc::ptr_eq(&tables_before[0], &tables_before[1]));
|
|
105
|
+
}
|
|
106
|
+
|
|
107
|
+
#[test]
|
|
108
|
+
fn test_vec_arc_table_serialization_format() {
|
|
109
|
+
let tables = vec![
|
|
110
|
+
Table {
|
|
111
|
+
cells: vec![vec!["A".to_string()]],
|
|
112
|
+
markdown: "| A |".to_string(),
|
|
113
|
+
page_number: 1,
|
|
114
|
+
},
|
|
115
|
+
Table {
|
|
116
|
+
cells: vec![vec!["B".to_string()]],
|
|
117
|
+
markdown: "| B |".to_string(),
|
|
118
|
+
page_number: 2,
|
|
119
|
+
},
|
|
120
|
+
];
|
|
121
|
+
|
|
122
|
+
let json = serde_json::to_string(&tables).unwrap();
|
|
123
|
+
let parsed: serde_json::Value = serde_json::from_str(&json).unwrap();
|
|
124
|
+
|
|
125
|
+
assert!(parsed.is_array());
|
|
126
|
+
assert_eq!(parsed.as_array().unwrap().len(), 2);
|
|
127
|
+
assert_eq!(parsed[0]["cells"][0][0], "A");
|
|
128
|
+
assert_eq!(parsed[1]["cells"][0][0], "B");
|
|
129
|
+
}
|
|
130
|
+
|
|
131
|
+
#[test]
|
|
132
|
+
fn test_page_content_arc_tables_roundtrip() {
|
|
133
|
+
let page = PageContent {
|
|
134
|
+
page_number: 3,
|
|
135
|
+
content: "Page 3 content".to_string(),
|
|
136
|
+
tables: vec![
|
|
137
|
+
Arc::new(Table {
|
|
138
|
+
cells: vec![vec!["Table1".to_string()]],
|
|
139
|
+
markdown: "| Table1 |".to_string(),
|
|
140
|
+
page_number: 3,
|
|
141
|
+
}),
|
|
142
|
+
Arc::new(Table {
|
|
143
|
+
cells: vec![vec!["Table2".to_string()]],
|
|
144
|
+
markdown: "| Table2 |".to_string(),
|
|
145
|
+
page_number: 3,
|
|
146
|
+
}),
|
|
147
|
+
],
|
|
148
|
+
images: Vec::new(),
|
|
149
|
+
hierarchy: None,
|
|
150
|
+
};
|
|
151
|
+
|
|
152
|
+
let json = serde_json::to_string(&page).unwrap();
|
|
153
|
+
let deserialized: PageContent = serde_json::from_str(&json).unwrap();
|
|
154
|
+
|
|
155
|
+
assert_eq!(deserialized.page_number, 3);
|
|
156
|
+
assert_eq!(deserialized.content, "Page 3 content");
|
|
157
|
+
assert_eq!(deserialized.tables.len(), 2);
|
|
158
|
+
assert_eq!(deserialized.tables[0].cells[0][0], "Table1");
|
|
159
|
+
assert_eq!(deserialized.tables[1].cells[0][0], "Table2");
|
|
160
|
+
}
|
|
161
|
+
|
|
162
|
+
#[test]
|
|
163
|
+
fn test_page_content_arc_images_roundtrip() {
|
|
164
|
+
let image1 = Arc::new(ExtractedImage {
|
|
165
|
+
data: vec![0xFF, 0xD8, 0xFF],
|
|
166
|
+
format: "jpeg".to_string(),
|
|
167
|
+
image_index: 0,
|
|
168
|
+
page_number: Some(1),
|
|
169
|
+
width: Some(100),
|
|
170
|
+
height: Some(200),
|
|
171
|
+
colorspace: Some("RGB".to_string()),
|
|
172
|
+
bits_per_component: Some(8),
|
|
173
|
+
is_mask: false,
|
|
174
|
+
description: Some("Image 1".to_string()),
|
|
175
|
+
ocr_result: None,
|
|
176
|
+
});
|
|
177
|
+
|
|
178
|
+
let image2 = Arc::new(ExtractedImage {
|
|
179
|
+
data: vec![0x89, 0x50, 0x4E],
|
|
180
|
+
format: "png".to_string(),
|
|
181
|
+
image_index: 1,
|
|
182
|
+
page_number: Some(1),
|
|
183
|
+
width: Some(300),
|
|
184
|
+
height: Some(400),
|
|
185
|
+
colorspace: Some("RGBA".to_string()),
|
|
186
|
+
bits_per_component: Some(8),
|
|
187
|
+
is_mask: false,
|
|
188
|
+
description: Some("Image 2".to_string()),
|
|
189
|
+
ocr_result: None,
|
|
190
|
+
});
|
|
191
|
+
|
|
192
|
+
let page = PageContent {
|
|
193
|
+
page_number: 1,
|
|
194
|
+
content: "Page with images".to_string(),
|
|
195
|
+
tables: Vec::new(),
|
|
196
|
+
images: vec![image1, image2],
|
|
197
|
+
hierarchy: None,
|
|
198
|
+
};
|
|
199
|
+
|
|
200
|
+
let json = serde_json::to_string(&page).unwrap();
|
|
201
|
+
let deserialized: PageContent = serde_json::from_str(&json).unwrap();
|
|
202
|
+
|
|
203
|
+
assert_eq!(deserialized.images.len(), 2);
|
|
204
|
+
assert_eq!(deserialized.images[0].format, "jpeg");
|
|
205
|
+
assert_eq!(deserialized.images[0].width, Some(100));
|
|
206
|
+
assert_eq!(deserialized.images[1].format, "png");
|
|
207
|
+
assert_eq!(deserialized.images[1].height, Some(400));
|
|
208
|
+
}
|
|
209
|
+
|
|
210
|
+
#[test]
|
|
211
|
+
fn test_arc_sharing_loss_with_page_content() {
|
|
212
|
+
let shared_table = Arc::new(Table {
|
|
213
|
+
cells: vec![vec!["shared across pages".to_string()]],
|
|
214
|
+
markdown: "| shared across pages |".to_string(),
|
|
215
|
+
page_number: 0,
|
|
216
|
+
});
|
|
217
|
+
|
|
218
|
+
let page1 = PageContent {
|
|
219
|
+
page_number: 1,
|
|
220
|
+
content: "Page 1".to_string(),
|
|
221
|
+
tables: vec![Arc::clone(&shared_table)],
|
|
222
|
+
images: Vec::new(),
|
|
223
|
+
hierarchy: None,
|
|
224
|
+
};
|
|
225
|
+
|
|
226
|
+
let page2 = PageContent {
|
|
227
|
+
page_number: 2,
|
|
228
|
+
content: "Page 2".to_string(),
|
|
229
|
+
tables: vec![Arc::clone(&shared_table)],
|
|
230
|
+
images: Vec::new(),
|
|
231
|
+
hierarchy: None,
|
|
232
|
+
};
|
|
233
|
+
|
|
234
|
+
assert!(Arc::ptr_eq(&page1.tables[0], &page2.tables[0]));
|
|
235
|
+
|
|
236
|
+
let pages = vec![page1, page2];
|
|
237
|
+
let json = serde_json::to_string(&pages).unwrap();
|
|
238
|
+
let deserialized: Vec<PageContent> = serde_json::from_str(&json).unwrap();
|
|
239
|
+
|
|
240
|
+
assert_eq!(deserialized.len(), 2);
|
|
241
|
+
assert_eq!(deserialized[0].tables[0].cells, deserialized[1].tables[0].cells);
|
|
242
|
+
assert!(!Arc::ptr_eq(&deserialized[0].tables[0], &deserialized[1].tables[0]));
|
|
243
|
+
}
|
|
244
|
+
|
|
245
|
+
#[test]
|
|
246
|
+
fn test_empty_page_content_arcs() {
|
|
247
|
+
let page = PageContent {
|
|
248
|
+
page_number: 5,
|
|
249
|
+
content: "No tables or images".to_string(),
|
|
250
|
+
tables: Vec::new(),
|
|
251
|
+
images: Vec::new(),
|
|
252
|
+
hierarchy: None,
|
|
253
|
+
};
|
|
254
|
+
|
|
255
|
+
let json = serde_json::to_string(&page).unwrap();
|
|
256
|
+
let deserialized: PageContent = serde_json::from_str(&json).unwrap();
|
|
257
|
+
|
|
258
|
+
assert_eq!(deserialized.page_number, 5);
|
|
259
|
+
assert_eq!(deserialized.tables.len(), 0);
|
|
260
|
+
assert_eq!(deserialized.images.len(), 0);
|
|
261
|
+
}
|
|
262
|
+
|
|
263
|
+
#[test]
|
|
264
|
+
fn test_serde_vec_arc_module_behavior() {
|
|
265
|
+
let table1 = Table {
|
|
266
|
+
cells: vec![vec!["A".to_string()]],
|
|
267
|
+
markdown: "| A |".to_string(),
|
|
268
|
+
page_number: 1,
|
|
269
|
+
};
|
|
270
|
+
|
|
271
|
+
let table2 = Table {
|
|
272
|
+
cells: vec![vec!["B".to_string()]],
|
|
273
|
+
markdown: "| B |".to_string(),
|
|
274
|
+
page_number: 2,
|
|
275
|
+
};
|
|
276
|
+
|
|
277
|
+
let json = serde_json::to_string(&vec![table1, table2]).unwrap();
|
|
278
|
+
assert!(json.contains("\"A\""));
|
|
279
|
+
assert!(json.contains("\"B\""));
|
|
280
|
+
}
|
|
281
|
+
}
|
|
@@ -0,0 +1,182 @@
|
|
|
1
|
+
//! Page structure types for documents.
|
|
2
|
+
//!
|
|
3
|
+
//! This module defines types for representing paginated document structures.
|
|
4
|
+
|
|
5
|
+
use serde::{Deserialize, Serialize};
|
|
6
|
+
use std::sync::Arc;
|
|
7
|
+
|
|
8
|
+
// Import serde helper and types from sibling modules
|
|
9
|
+
use super::extraction::ExtractedImage;
|
|
10
|
+
use super::serde_helpers::serde_vec_arc;
|
|
11
|
+
use super::tables::Table;
|
|
12
|
+
|
|
13
|
+
/// Unified page structure for documents.
|
|
14
|
+
///
|
|
15
|
+
/// Supports different page types (PDF pages, PPTX slides, Excel sheets)
|
|
16
|
+
/// with byte offset boundaries for chunk-to-page mapping.
|
|
17
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
18
|
+
pub struct PageStructure {
|
|
19
|
+
/// Total number of pages/slides/sheets
|
|
20
|
+
pub total_count: usize,
|
|
21
|
+
|
|
22
|
+
/// Type of paginated unit
|
|
23
|
+
pub unit_type: PageUnitType,
|
|
24
|
+
|
|
25
|
+
/// Byte offset boundaries for each page
|
|
26
|
+
///
|
|
27
|
+
/// Maps byte ranges in the extracted content to page numbers.
|
|
28
|
+
/// Used for chunk page range calculation.
|
|
29
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
30
|
+
pub boundaries: Option<Vec<PageBoundary>>,
|
|
31
|
+
|
|
32
|
+
/// Detailed per-page metadata (optional, only when needed)
|
|
33
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
34
|
+
pub pages: Option<Vec<PageInfo>>,
|
|
35
|
+
}
|
|
36
|
+
|
|
37
|
+
/// Type of paginated unit in a document.
|
|
38
|
+
///
|
|
39
|
+
/// Distinguishes between different types of "pages" (PDF pages, presentation slides, spreadsheet sheets).
|
|
40
|
+
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
|
41
|
+
#[serde(rename_all = "snake_case")]
|
|
42
|
+
pub enum PageUnitType {
|
|
43
|
+
/// Standard document pages (PDF, DOCX, images)
|
|
44
|
+
Page,
|
|
45
|
+
/// Presentation slides (PPTX, ODP)
|
|
46
|
+
Slide,
|
|
47
|
+
/// Spreadsheet sheets (XLSX, ODS)
|
|
48
|
+
Sheet,
|
|
49
|
+
}
|
|
50
|
+
|
|
51
|
+
/// Byte offset boundary for a page.
|
|
52
|
+
///
|
|
53
|
+
/// Tracks where a specific page's content starts and ends in the main content string,
|
|
54
|
+
/// enabling mapping from byte positions to page numbers. Offsets are guaranteed to be
|
|
55
|
+
/// at valid UTF-8 character boundaries when using standard String methods (push_str, push, etc.).
|
|
56
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
57
|
+
pub struct PageBoundary {
|
|
58
|
+
/// Byte offset where this page starts in the content string (UTF-8 valid boundary, inclusive)
|
|
59
|
+
pub byte_start: usize,
|
|
60
|
+
/// Byte offset where this page ends in the content string (UTF-8 valid boundary, exclusive)
|
|
61
|
+
pub byte_end: usize,
|
|
62
|
+
/// Page number (1-indexed)
|
|
63
|
+
pub page_number: usize,
|
|
64
|
+
}
|
|
65
|
+
|
|
66
|
+
/// Metadata for individual page/slide/sheet.
|
|
67
|
+
///
|
|
68
|
+
/// Captures per-page information including dimensions, content counts,
|
|
69
|
+
/// and visibility state (for presentations).
///
/// Every field except `number` is optional; a field that is `None` is left out
/// of the serialized output entirely (`skip_serializing_if = "Option::is_none"`).
|
|
70
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
71
|
+
pub struct PageInfo {
|
|
72
|
+
/// Page number (1-indexed)
|
|
73
|
+
pub number: usize,
|
|
74
|
+
|
|
75
|
+
/// Page title (usually for presentations); omitted from output when `None`
|
|
76
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
77
|
+
pub title: Option<String>,
|
|
78
|
+
|
|
79
|
+
/// Dimensions in points (PDF) or pixels (images), ordered as `(width, height)`
|
|
80
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
81
|
+
pub dimensions: Option<(f64, f64)>,
|
|
82
|
+
|
|
83
|
+
/// Number of images on this page; `None` when the count is not available
|
|
84
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
85
|
+
pub image_count: Option<usize>,
|
|
86
|
+
|
|
87
|
+
/// Number of tables on this page; `None` when the count is not available
|
|
88
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
89
|
+
pub table_count: Option<usize>,
|
|
90
|
+
|
|
91
|
+
/// Whether this page is hidden (e.g., in presentations); `None` when unknown
|
|
92
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
93
|
+
pub hidden: Option<bool>,
|
|
94
|
+
}
|
|
95
|
+
|
|
96
|
+
/// Content for a single page/slide.
|
|
97
|
+
///
|
|
98
|
+
/// When page extraction is enabled, documents are split into per-page content
|
|
99
|
+
/// with associated tables and images mapped to each page.
|
|
100
|
+
///
|
|
101
|
+
/// # Performance
|
|
102
|
+
///
|
|
103
|
+
/// Tables and images are held behind `Arc` for memory efficiency:
|
|
104
|
+
/// - `Vec<Arc<Table>>` lets several holders share one table without copying it
|
|
105
|
+
/// - `Vec<Arc<ExtractedImage>>` does the same for image data
|
|
106
|
+
/// - The JSON shape is unchanged: both fields are routed through the
///   `serde_vec_arc` helper, which writes and reads plain sequences
|
|
107
|
+
///
|
|
108
|
+
/// Cloning a `PageContent` bumps the reference counts of its tables and images
|
|
109
|
+
/// instead of deep-copying them. The sharing exists only in memory; it is not
/// preserved by a serialize/deserialize round trip.
|
|
110
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
111
|
+
pub struct PageContent {
|
|
112
|
+
/// Page number (1-indexed)
|
|
113
|
+
pub page_number: usize,
|
|
114
|
+
|
|
115
|
+
/// Text content for this page
|
|
116
|
+
pub content: String,
|
|
117
|
+
|
|
118
|
+
/// Tables found on this page (uses Arc for memory efficiency)
|
|
119
|
+
///
|
|
120
|
+
/// Serialized as a plain `Vec<Table>` for JSON compatibility. The field is
|
|
121
|
+
/// omitted when empty and falls back to an empty vector when absent on input.
|
|
122
|
+
#[serde(skip_serializing_if = "Vec::is_empty", default, with = "serde_vec_arc")]
|
|
123
|
+
pub tables: Vec<Arc<Table>>,
|
|
124
|
+
|
|
125
|
+
/// Images found on this page (uses Arc for memory efficiency)
|
|
126
|
+
///
|
|
127
|
+
/// Serialized as a plain `Vec<ExtractedImage>` for JSON compatibility. The field is
|
|
128
|
+
/// omitted when empty and falls back to an empty vector when absent on input.
|
|
129
|
+
#[serde(skip_serializing_if = "Vec::is_empty", default, with = "serde_vec_arc")]
|
|
130
|
+
pub images: Vec<Arc<ExtractedImage>>,
|
|
131
|
+
|
|
132
|
+
/// Hierarchy information for the page (when hierarchy extraction is enabled)
|
|
133
|
+
///
|
|
134
|
+
/// Contains text hierarchy levels (H1-H6) extracted from the page content.
/// Omitted from the serialized output when `None`.
|
|
135
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
136
|
+
pub hierarchy: Option<PageHierarchy>,
|
|
137
|
+
}
|
|
138
|
+
|
|
139
|
+
/// Page hierarchy structure containing heading levels and block information.
|
|
140
|
+
///
|
|
141
|
+
/// Used when PDF text hierarchy extraction is enabled. Contains hierarchical
|
|
142
|
+
/// blocks with heading levels (H1-H6) for semantic document structure.
|
|
143
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
144
|
+
pub struct PageHierarchy {
|
|
145
|
+
/// Number of hierarchy blocks on this page
///
/// NOTE(review): presumably equal to `blocks.len()`, but nothing in this type
/// enforces that - confirm against the code that builds it.
|
|
146
|
+
pub block_count: usize,
|
|
147
|
+
|
|
148
|
+
/// Hierarchical blocks with heading levels
///
/// Omitted from the serialized output when empty, and defaulted to an empty
/// vector when the key is missing on input.
|
|
149
|
+
#[serde(skip_serializing_if = "Vec::is_empty", default)]
|
|
150
|
+
pub blocks: Vec<HierarchicalBlock>,
|
|
151
|
+
}
|
|
152
|
+
|
|
153
|
+
/// A text block with hierarchy level assignment.
|
|
154
|
+
///
|
|
155
|
+
/// Represents a block of text with semantic heading information extracted from
|
|
156
|
+
/// font size clustering and hierarchical analysis.
|
|
157
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
158
|
+
pub struct HierarchicalBlock {
|
|
159
|
+
/// The text content of this block
|
|
160
|
+
pub text: String,
|
|
161
|
+
|
|
162
|
+
/// The font size of the text in this block
|
|
163
|
+
pub font_size: f32,
|
|
164
|
+
|
|
165
|
+
/// The hierarchy level of this block (H1-H6 or Body)
|
|
166
|
+
///
|
|
167
|
+
/// Stored as a plain string (this type performs no validation). The expected
/// values mirror HTML heading tags:
|
|
168
|
+
/// - "h1": Top-level heading
|
|
169
|
+
/// - "h2": Secondary heading
|
|
170
|
+
/// - "h3": Tertiary heading
|
|
171
|
+
/// - "h4": Quaternary heading
|
|
172
|
+
/// - "h5": Quinary heading
|
|
173
|
+
/// - "h6": Senary heading
|
|
174
|
+
/// - "body": Body text (no heading level)
|
|
175
|
+
pub level: String,
|
|
176
|
+
|
|
177
|
+
/// Bounding box information for the block
|
|
178
|
+
///
|
|
179
|
+
/// Contains coordinates as (left, top, right, bottom) in PDF units.
/// Omitted from the serialized output when `None`.
|
|
180
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
181
|
+
pub bbox: Option<(f32, f32, f32, f32)>,
|
|
182
|
+
}
|
|
@@ -0,0 +1,132 @@
|
|
|
1
|
+
//! Custom serde serialization helpers for Arc<T> and Vec<Arc<T>>.
|
|
2
|
+
|
|
3
|
+
/// Module providing transparent serde support for Arc<T>.
|
|
4
|
+
///
|
|
5
|
+
/// Allows Arc-wrapped types to serialize/deserialize as if unwrapped,
|
|
6
|
+
/// maintaining exact JSON format while preserving memory efficiency benefits.
|
|
7
|
+
///
|
|
8
|
+
/// # Arc Sharing Semantics
|
|
9
|
+
///
|
|
10
|
+
/// **Important**: Arc sharing semantics are **NOT** preserved across serialization.
|
|
11
|
+
/// When deserializing, each Arc is independently created with `Arc::new()`.
|
|
12
|
+
/// This means that if two Arcs referenced the same data before serialization,
|
|
13
|
+
/// they will be separate Arcs after deserialization.
|
|
14
|
+
///
|
|
15
|
+
/// Example:
|
|
16
|
+
/// ```ignore
|
|
17
|
+
/// let shared = Arc::new(Table { /* ... */ });
|
|
18
|
+
/// let tables = vec![Arc::clone(&shared), Arc::clone(&shared)];
|
|
19
|
+
/// // Both in-memory Arcs point to the same Table
|
|
20
|
+
///
|
|
21
|
+
/// let json = serde_json::to_string(&tables)?;
|
|
22
|
+
/// let deserialized: Vec<Arc<Table>> = serde_json::from_str(&json)?;
|
|
23
|
+
/// // deserialized[0] and deserialized[1] are now independent Arcs,
|
|
24
|
+
/// // even though they contain identical data
|
|
25
|
+
/// ```
|
|
26
|
+
///
|
|
27
|
+
/// This design choice maintains:
|
|
28
|
+
/// - Exact JSON format compatibility (no sharing metadata in JSON)
|
|
29
|
+
/// - Predictable deserialization behavior
|
|
30
|
+
/// - Zero additional serialization overhead
|
|
31
|
+
///
|
|
32
|
+
/// If in-memory sharing is required, callers must implement custom sharing logic
|
|
33
|
+
/// or use a different data structure (like a HashMap of deduplicated values).
|
|
34
|
+
#[allow(dead_code)]
|
|
35
|
+
pub mod serde_arc {
|
|
36
|
+
use serde::{Deserialize, Deserializer, Serializer};
|
|
37
|
+
use std::sync::Arc;
|
|
38
|
+
|
|
39
|
+
/// Serialize an Arc<T> by serializing the inner value directly.
|
|
40
|
+
///
|
|
41
|
+
/// This makes Arc<T> serialize identically to T, maintaining API compatibility.
|
|
42
|
+
/// The outer Arc wrapper is transparent during serialization.
|
|
43
|
+
pub fn serialize<S, T>(arc_value: &Arc<T>, serializer: S) -> Result<S::Ok, S::Error>
|
|
44
|
+
where
|
|
45
|
+
S: Serializer,
|
|
46
|
+
T: serde::Serialize,
|
|
47
|
+
{
|
|
48
|
+
(**arc_value).serialize(serializer)
|
|
49
|
+
}
|
|
50
|
+
|
|
51
|
+
/// Deserialize a T and wrap it in Arc.
|
|
52
|
+
///
|
|
53
|
+
/// This makes Arc<T> deserialize from the same format as T.
|
|
54
|
+
/// Each Arc is independently created during deserialization;
|
|
55
|
+
/// Arc sharing from before serialization is NOT preserved.
|
|
56
|
+
pub fn deserialize<'de, D, T>(deserializer: D) -> Result<Arc<T>, D::Error>
|
|
57
|
+
where
|
|
58
|
+
D: Deserializer<'de>,
|
|
59
|
+
T: Deserialize<'de>,
|
|
60
|
+
{
|
|
61
|
+
T::deserialize(deserializer).map(Arc::new)
|
|
62
|
+
}
|
|
63
|
+
}
|
|
64
|
+
|
|
65
|
+
/// Module for serializing Vec<Arc<T>> with transparent Arc handling.
|
|
66
|
+
///
|
|
67
|
+
/// Serializes a Vec<Arc<T>> as Vec<T> for compatibility, while preserving
|
|
68
|
+
/// Arc semantics for memory efficiency.
|
|
69
|
+
///
|
|
70
|
+
/// # Arc Sharing Semantics
|
|
71
|
+
///
|
|
72
|
+
/// **Important**: Arc sharing semantics are **NOT** preserved across serialization.
|
|
73
|
+
/// When deserializing, each element's Arc is independently created with `Arc::new()`.
|
|
74
|
+
/// This is important for `PageContent` where tables/images may be shared across pages.
|
|
75
|
+
///
|
|
76
|
+
/// Example with shared tables:
|
|
77
|
+
/// ```ignore
|
|
78
|
+
/// let shared_table = Arc::new(Table { /* ... */ });
|
|
79
|
+
/// let page_contents = vec![
|
|
80
|
+
/// PageContent { tables: vec![Arc::clone(&shared_table)], ... },
|
|
81
|
+
/// PageContent { tables: vec![Arc::clone(&shared_table)], ... },
|
|
82
|
+
/// ];
|
|
83
|
+
/// // In-memory: both pages' tables point to the same Arc
|
|
84
|
+
///
|
|
85
|
+
/// let json = serde_json::to_string(&page_contents)?;
|
|
86
|
+
/// let deserialized = serde_json::from_str::<Vec<PageContent>>(&json)?;
|
|
87
|
+
/// // After deserialization: each page has independent Arc instances,
|
|
88
|
+
/// // even though the table data is identical
|
|
89
|
+
/// ```
|
|
90
|
+
///
|
|
91
|
+
/// Design rationale:
|
|
92
|
+
/// - JSON has no mechanism to represent shared references
|
|
93
|
+
/// - Preserving sharing would require complex metadata and deduplication
|
|
94
|
+
/// - Current approach is simple, predictable, and maintains compatibility
|
|
95
|
+
/// - In-memory sharing (via Arc) is an implementation detail for the Rust side
|
|
96
|
+
///
|
|
97
|
+
/// If in-memory sharing is required after deserialization, implement custom
|
|
98
|
+
/// deduplication logic using hashing or content comparison.
|
|
99
|
+
pub mod serde_vec_arc {
|
|
100
|
+
use serde::{Deserialize, Deserializer, Serializer};
|
|
101
|
+
use std::sync::Arc;
|
|
102
|
+
|
|
103
|
+
/// Serialize Vec<Arc<T>> by serializing each T directly.
|
|
104
|
+
///
|
|
105
|
+
/// Each element is unwrapped from its Arc and serialized independently.
|
|
106
|
+
/// No sharing metadata is included in the serialized output.
|
|
107
|
+
pub fn serialize<S, T>(vec: &[Arc<T>], serializer: S) -> Result<S::Ok, S::Error>
|
|
108
|
+
where
|
|
109
|
+
S: Serializer,
|
|
110
|
+
T: serde::Serialize,
|
|
111
|
+
{
|
|
112
|
+
use serde::ser::SerializeSeq;
|
|
113
|
+
let mut seq = serializer.serialize_seq(Some(vec.len()))?;
|
|
114
|
+
for arc_item in vec {
|
|
115
|
+
seq.serialize_element(&**arc_item)?;
|
|
116
|
+
}
|
|
117
|
+
seq.end()
|
|
118
|
+
}
|
|
119
|
+
|
|
120
|
+
/// Deserialize Vec<T> and wrap each element in Arc.
|
|
121
|
+
///
|
|
122
|
+
/// Each element is independently wrapped in a new Arc.
|
|
123
|
+
/// Sharing relationships from before serialization are lost.
|
|
124
|
+
pub fn deserialize<'de, D, T>(deserializer: D) -> Result<Vec<Arc<T>>, D::Error>
|
|
125
|
+
where
|
|
126
|
+
D: Deserializer<'de>,
|
|
127
|
+
T: Deserialize<'de>,
|
|
128
|
+
{
|
|
129
|
+
let vec: Vec<T> = Deserialize::deserialize(deserializer)?;
|
|
130
|
+
Ok(vec.into_iter().map(Arc::new).collect())
|
|
131
|
+
}
|
|
132
|
+
}
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
//! Table-related types for document extraction.
|
|
2
|
+
|
|
3
|
+
use serde::{Deserialize, Serialize};
|
|
4
|
+
|
|
5
|
+
/// Extracted table structure.
|
|
6
|
+
///
|
|
7
|
+
/// Represents a table detected and extracted from a document (PDF, image, etc.).
|
|
8
|
+
/// Each table carries both its structured cell data and a Markdown rendering.
|
|
9
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
10
|
+
pub struct Table {
|
|
11
|
+
/// Table cells as a 2D vector (rows × columns): the outer `Vec` holds rows,
/// each inner `Vec` holds the cell texts of one row
|
|
12
|
+
pub cells: Vec<Vec<String>>,
|
|
13
|
+
/// Markdown representation of the table
|
|
14
|
+
pub markdown: String,
|
|
15
|
+
/// Page number where the table was found (1-indexed)
|
|
16
|
+
pub page_number: usize,
|
|
17
|
+
}
|
|
18
|
+
|
|
19
|
+
/// Individual table cell with content and optional styling.
|
|
20
|
+
///
|
|
21
|
+
/// Future extension point for rich table support with cell-level metadata.
///
/// When deserializing, only `content` is required: the spans fall back to
/// `default_span()` (1) and `is_header` falls back to `false`.
|
|
22
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
23
|
+
pub struct TableCell {
|
|
24
|
+
/// Cell content as text
|
|
25
|
+
pub content: String,
|
|
26
|
+
/// Row span (number of rows this cell spans); defaults to 1 when missing on input
|
|
27
|
+
#[serde(default = "default_span")]
|
|
28
|
+
pub row_span: usize,
|
|
29
|
+
/// Column span (number of columns this cell spans); defaults to 1 when missing on input
|
|
30
|
+
#[serde(default = "default_span")]
|
|
31
|
+
pub col_span: usize,
|
|
32
|
+
/// Whether this is a header cell; defaults to `false` when missing on input
|
|
33
|
+
#[serde(default)]
|
|
34
|
+
pub is_header: bool,
|
|
35
|
+
}
|
|
36
|
+
|
|
37
|
+
/// Serde fallback for a cell span: a cell covers exactly one row/column unless
/// the input says otherwise.
fn default_span() -> usize {
    const SINGLE_CELL: usize = 1;
    SINGLE_CELL
}
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
//! Quality heuristics and text analysis
|
|
2
|
+
//!
|
|
3
|
+
//! This module provides heuristic checks for text quality, including
|
|
4
|
+
//! structure analysis and line-level checks.
|
|
5
|
+
|
|
6
|
+
use super::patterns::*;
|
|
7
|
+
|
|
8
|
+
// ============================================================================
|
|
9
|
+
// Structure Thresholds
|
|
10
|
+
// ============================================================================
|
|
11
|
+
|
|
12
|
+
/// Inclusive lower bound of the average words-per-sentence range that
/// `calculate_structure_bonus` rewards.
const MIN_SENTENCE_WORDS: f64 = 10.0;
|
|
13
|
+
/// Inclusive upper bound of the average words-per-sentence range that
/// `calculate_structure_bonus` rewards.
const MAX_SENTENCE_WORDS: f64 = 30.0;
|
|
14
|
+
/// Inclusive lower bound of the average words-per-paragraph range that
/// `calculate_structure_bonus` rewards.
const MIN_PARAGRAPH_WORDS: f64 = 50.0;
|
|
15
|
+
/// Inclusive upper bound of the average words-per-paragraph range that
/// `calculate_structure_bonus` rewards.
const MAX_PARAGRAPH_WORDS: f64 = 300.0;
|
|
16
|
+
|
|
17
|
+
// ============================================================================
|
|
18
|
+
// Structure Analysis
|
|
19
|
+
// ============================================================================
|
|
20
|
+
|
|
21
|
+
/// Calculate bonus based on text structure quality
|
|
22
|
+
#[inline]
|
|
23
|
+
pub(crate) fn calculate_structure_bonus(text: &str) -> f64 {
|
|
24
|
+
if text.is_empty() {
|
|
25
|
+
return 0.0;
|
|
26
|
+
}
|
|
27
|
+
|
|
28
|
+
let sentence_count = SENTENCE_DETECT.find_iter(text).count() as f64;
|
|
29
|
+
let paragraph_count = text.matches("\n\n").count() as f64 + 1.0;
|
|
30
|
+
let words = text.split_whitespace().count() as f64;
|
|
31
|
+
|
|
32
|
+
if words == 0.0 {
|
|
33
|
+
return 0.0;
|
|
34
|
+
}
|
|
35
|
+
|
|
36
|
+
let avg_words_per_sentence = words / sentence_count.max(1.0);
|
|
37
|
+
let avg_words_per_paragraph = words / paragraph_count.max(1.0);
|
|
38
|
+
|
|
39
|
+
let mut structure_score: f64 = 0.0;
|
|
40
|
+
|
|
41
|
+
if (MIN_SENTENCE_WORDS..=MAX_SENTENCE_WORDS).contains(&avg_words_per_sentence) {
|
|
42
|
+
structure_score += 0.3;
|
|
43
|
+
}
|
|
44
|
+
|
|
45
|
+
if (MIN_PARAGRAPH_WORDS..=MAX_PARAGRAPH_WORDS).contains(&avg_words_per_paragraph) {
|
|
46
|
+
structure_score += 0.3;
|
|
47
|
+
}
|
|
48
|
+
|
|
49
|
+
if paragraph_count > 1.0 {
|
|
50
|
+
structure_score += 0.2;
|
|
51
|
+
}
|
|
52
|
+
|
|
53
|
+
if PUNCTUATION_DETECT.is_match(text) {
|
|
54
|
+
structure_score += 0.2;
|
|
55
|
+
}
|
|
56
|
+
|
|
57
|
+
structure_score.min(1.0)
|
|
58
|
+
}
|