kreuzberg 4.0.8 → 4.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
- data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
- data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
- data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
- data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
- data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
- data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
- data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
- data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
- data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
- data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
- data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
- data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
- data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
- data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
- data/ext/kreuzberg_rb/native/src/result.rs +326 -0
- data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
- data/lib/kreuzberg/config.rb +66 -0
- data/lib/kreuzberg/result.rb +107 -2
- data/lib/kreuzberg/types.rb +104 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +0 -4
- data/sig/kreuzberg.rbs +105 -1
- data/vendor/Cargo.toml +3 -3
- data/vendor/kreuzberg/Cargo.toml +4 -3
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/api/config.rs +69 -0
- data/vendor/kreuzberg/src/api/handlers.rs +99 -2
- data/vendor/kreuzberg/src/api/mod.rs +14 -7
- data/vendor/kreuzberg/src/api/router.rs +214 -0
- data/vendor/kreuzberg/src/api/startup.rs +243 -0
- data/vendor/kreuzberg/src/api/types.rs +78 -0
- data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
- data/vendor/kreuzberg/src/cache/core.rs +428 -0
- data/vendor/kreuzberg/src/cache/mod.rs +21 -843
- data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
- data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
- data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
- data/vendor/kreuzberg/src/chunking/config.rs +52 -0
- data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
- data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
- data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
- data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
- data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
- data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
- data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
- data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
- data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
- data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
- data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
- data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
- data/vendor/kreuzberg/src/core/config/page.rs +57 -0
- data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
- data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
- data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
- data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
- data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
- data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
- data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
- data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
- data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
- data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
- data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
- data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
- data/vendor/kreuzberg/src/core/mod.rs +4 -1
- data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
- data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
- data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
- data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
- data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
- data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
- data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
- data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
- data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
- data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
- data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
- data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
- data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
- data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
- data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
- data/vendor/kreuzberg/src/embeddings.rs +136 -13
- data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
- data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
- data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
- data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
- data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
- data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
- data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
- data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
- data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
- data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
- data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
- data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
- data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
- data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
- data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
- data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
- data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
- data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
- data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
- data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
- data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
- data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
- data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
- data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
- data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
- data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
- data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
- data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
- data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
- data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
- data/vendor/kreuzberg/src/extractors/email.rs +2 -0
- data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
- data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
- data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
- data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
- data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
- data/vendor/kreuzberg/src/extractors/html.rs +80 -8
- data/vendor/kreuzberg/src/extractors/image.rs +8 -1
- data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
- data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
- data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
- data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
- data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
- data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
- data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
- data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
- data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
- data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
- data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
- data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
- data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
- data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
- data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
- data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
- data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
- data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
- data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
- data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
- data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
- data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
- data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
- data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
- data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
- data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
- data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
- data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
- data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
- data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
- data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
- data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
- data/vendor/kreuzberg/src/extractors/text.rs +4 -0
- data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
- data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
- data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
- data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
- data/vendor/kreuzberg/src/lib.rs +2 -2
- data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
- data/vendor/kreuzberg/src/mcp/format.rs +211 -0
- data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
- data/vendor/kreuzberg/src/mcp/params.rs +196 -0
- data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
- data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
- data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
- data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
- data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
- data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
- data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
- data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
- data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
- data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
- data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
- data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
- data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
- data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
- data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
- data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
- data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
- data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
- data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
- data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
- data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
- data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
- data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
- data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
- data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
- data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
- data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
- data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
- data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
- data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
- data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
- data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
- data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
- data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
- data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
- data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
- data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
- data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
- data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
- data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
- data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
- data/vendor/kreuzberg/src/text/quality.rs +1 -1
- data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
- data/vendor/kreuzberg/src/types/djot.rs +209 -0
- data/vendor/kreuzberg/src/types/extraction.rs +301 -0
- data/vendor/kreuzberg/src/types/formats.rs +443 -0
- data/vendor/kreuzberg/src/types/metadata.rs +560 -0
- data/vendor/kreuzberg/src/types/mod.rs +281 -0
- data/vendor/kreuzberg/src/types/page.rs +182 -0
- data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
- data/vendor/kreuzberg/src/types/tables.rs +39 -0
- data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
- data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
- data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
- data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
- data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
- data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
- data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
- data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
- data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
- data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
- data/vendor/kreuzberg/tests/api_embed.rs +6 -9
- data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
- data/vendor/kreuzberg/tests/core_integration.rs +1 -0
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
- data/vendor/kreuzberg/tests/format_integration.rs +2 -0
- data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
- data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
- data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
- data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
- data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
- data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
- data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
- data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
- data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
- data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
- data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
- data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
- data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
- data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
- data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
- data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
- data/vendor/kreuzberg-ffi/src/error.rs +46 -14
- data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
- data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
- data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
- data/vendor/kreuzberg-ffi/src/result.rs +148 -122
- data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
- data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
- metadata +200 -28
- data/vendor/kreuzberg/src/api/server.rs +0 -518
- data/vendor/kreuzberg/src/core/config.rs +0 -1914
- data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
- data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
- data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
- data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
- data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
- data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
- data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
- data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
- data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
- data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
- data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
- data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
- data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
- data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
- data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
- data/vendor/kreuzberg/src/types.rs +0 -1713
- data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
- data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
|
@@ -0,0 +1,326 @@
|
|
|
1
|
+
//! ExtractionResult conversion to Ruby values
|
|
2
|
+
//!
|
|
3
|
+
//! Handles conversion of Kreuzberg ExtractionResult to Ruby Hash,
|
|
4
|
+
//! including complex nested structures like chunks, images, tables, and elements.
|
|
5
|
+
|
|
6
|
+
use crate::error_handling::runtime_error;
|
|
7
|
+
use crate::helpers::{json_value_to_ruby, set_hash_entry};
|
|
8
|
+
|
|
9
|
+
use kreuzberg::ExtractionResult as RustExtractionResult;
|
|
10
|
+
use magnus::{Error, RHash, Ruby, IntoValue};
|
|
11
|
+
use magnus::value::ReprValue;
|
|
12
|
+
|
|
13
|
+
/// Convert Kreuzberg ExtractionResult to Ruby Hash
|
|
14
|
+
///
|
|
15
|
+
/// Converts the Rust extraction result into a Ruby hash with all fields including:
|
|
16
|
+
/// - content, mime_type, metadata
|
|
17
|
+
/// - tables (with cells and markdown)
|
|
18
|
+
/// - detected_languages
|
|
19
|
+
/// - chunks (with embeddings)
|
|
20
|
+
/// - images (including OCR results)
|
|
21
|
+
/// - pages (with per-page content)
|
|
22
|
+
/// - elements (for element-based format)
|
|
23
|
+
pub fn extraction_result_to_ruby(ruby: &Ruby, result: RustExtractionResult) -> Result<RHash, Error> {
|
|
24
|
+
let hash = ruby.hash_new();
|
|
25
|
+
|
|
26
|
+
// Set content and MIME type
|
|
27
|
+
let content_value = ruby.str_new(result.content.as_str()).into_value_with(ruby);
|
|
28
|
+
set_hash_entry(ruby, &hash, "content", content_value)?;
|
|
29
|
+
|
|
30
|
+
let mime_value = ruby.str_new(result.mime_type.as_str()).into_value_with(ruby);
|
|
31
|
+
set_hash_entry(ruby, &hash, "mime_type", mime_value)?;
|
|
32
|
+
|
|
33
|
+
// Set metadata both as JSON string and parsed hash
|
|
34
|
+
let metadata_json = serde_json::to_string(&result.metadata)
|
|
35
|
+
.map_err(|e| runtime_error(format!("Failed to serialize metadata: {}", e)))?;
|
|
36
|
+
let metadata_json_value = ruby.str_new(&metadata_json).into_value_with(ruby);
|
|
37
|
+
set_hash_entry(ruby, &hash, "metadata_json", metadata_json_value)?;
|
|
38
|
+
let metadata_value = serde_json::to_value(&result.metadata)
|
|
39
|
+
.map_err(|e| runtime_error(format!("Failed to serialize metadata: {}", e)))?;
|
|
40
|
+
let metadata_hash = json_value_to_ruby(ruby, &metadata_value)?;
|
|
41
|
+
set_hash_entry(ruby, &hash, "metadata", metadata_hash)?;
|
|
42
|
+
|
|
43
|
+
// Convert tables
|
|
44
|
+
let tables_array = ruby.ary_new();
|
|
45
|
+
for table in result.tables {
|
|
46
|
+
let table_hash = ruby.hash_new();
|
|
47
|
+
|
|
48
|
+
let cells_array = ruby.ary_new();
|
|
49
|
+
for row in table.cells {
|
|
50
|
+
let row_array = ruby.ary_from_vec(row);
|
|
51
|
+
cells_array.push(row_array)?;
|
|
52
|
+
}
|
|
53
|
+
table_hash.aset("cells", cells_array)?;
|
|
54
|
+
table_hash.aset("markdown", table.markdown)?;
|
|
55
|
+
table_hash.aset("page_number", table.page_number)?;
|
|
56
|
+
|
|
57
|
+
tables_array.push(table_hash)?;
|
|
58
|
+
}
|
|
59
|
+
let tables_value = tables_array.into_value_with(ruby);
|
|
60
|
+
set_hash_entry(ruby, &hash, "tables", tables_value)?;
|
|
61
|
+
|
|
62
|
+
// Convert detected languages
|
|
63
|
+
if let Some(langs) = result.detected_languages {
|
|
64
|
+
let langs_array = ruby.ary_from_vec(langs);
|
|
65
|
+
let langs_value = langs_array.into_value_with(ruby);
|
|
66
|
+
set_hash_entry(ruby, &hash, "detected_languages", langs_value)?;
|
|
67
|
+
} else {
|
|
68
|
+
set_hash_entry(ruby, &hash, "detected_languages", ruby.qnil().as_value())?;
|
|
69
|
+
}
|
|
70
|
+
|
|
71
|
+
// Convert chunks
|
|
72
|
+
if let Some(chunks) = result.chunks {
|
|
73
|
+
let chunks_array = ruby.ary_new();
|
|
74
|
+
for chunk in chunks {
|
|
75
|
+
let chunk_hash = ruby.hash_new();
|
|
76
|
+
chunk_hash.aset("content", chunk.content)?;
|
|
77
|
+
chunk_hash.aset("byte_start", chunk.metadata.byte_start)?;
|
|
78
|
+
chunk_hash.aset("byte_end", chunk.metadata.byte_end)?;
|
|
79
|
+
if let Some(token_count) = chunk.metadata.token_count {
|
|
80
|
+
chunk_hash.aset("token_count", token_count)?;
|
|
81
|
+
} else {
|
|
82
|
+
chunk_hash.aset("token_count", ruby.qnil().as_value())?;
|
|
83
|
+
}
|
|
84
|
+
chunk_hash.aset("chunk_index", chunk.metadata.chunk_index)?;
|
|
85
|
+
chunk_hash.aset("total_chunks", chunk.metadata.total_chunks)?;
|
|
86
|
+
if let Some(first_page) = chunk.metadata.first_page {
|
|
87
|
+
chunk_hash.aset("first_page", first_page as i64)?;
|
|
88
|
+
} else {
|
|
89
|
+
chunk_hash.aset("first_page", ruby.qnil().as_value())?;
|
|
90
|
+
}
|
|
91
|
+
if let Some(last_page) = chunk.metadata.last_page {
|
|
92
|
+
chunk_hash.aset("last_page", last_page as i64)?;
|
|
93
|
+
} else {
|
|
94
|
+
chunk_hash.aset("last_page", ruby.qnil().as_value())?;
|
|
95
|
+
}
|
|
96
|
+
if let Some(embedding) = chunk.embedding {
|
|
97
|
+
let embedding_array = ruby.ary_new();
|
|
98
|
+
for value in embedding {
|
|
99
|
+
embedding_array.push(ruby.float_from_f64(value as f64).into_value_with(ruby))?;
|
|
100
|
+
}
|
|
101
|
+
chunk_hash.aset("embedding", embedding_array)?;
|
|
102
|
+
} else {
|
|
103
|
+
chunk_hash.aset("embedding", ruby.qnil().as_value())?;
|
|
104
|
+
}
|
|
105
|
+
chunks_array.push(chunk_hash)?;
|
|
106
|
+
}
|
|
107
|
+
let chunks_value = chunks_array.into_value_with(ruby);
|
|
108
|
+
set_hash_entry(ruby, &hash, "chunks", chunks_value)?;
|
|
109
|
+
} else {
|
|
110
|
+
set_hash_entry(ruby, &hash, "chunks", ruby.qnil().as_value())?;
|
|
111
|
+
}
|
|
112
|
+
|
|
113
|
+
// Convert images
|
|
114
|
+
if let Some(images) = result.images {
|
|
115
|
+
let images_array = ruby.ary_new();
|
|
116
|
+
for image in images {
|
|
117
|
+
let image_hash = ruby.hash_new();
|
|
118
|
+
let data_value = ruby.str_from_slice(&image.data).into_value_with(ruby);
|
|
119
|
+
image_hash.aset("data", data_value)?;
|
|
120
|
+
image_hash.aset("format", image.format)?;
|
|
121
|
+
image_hash.aset("image_index", image.image_index as i64)?;
|
|
122
|
+
if let Some(page) = image.page_number {
|
|
123
|
+
image_hash.aset("page_number", page as i64)?;
|
|
124
|
+
} else {
|
|
125
|
+
image_hash.aset("page_number", ruby.qnil().as_value())?;
|
|
126
|
+
}
|
|
127
|
+
if let Some(width) = image.width {
|
|
128
|
+
image_hash.aset("width", width as i64)?;
|
|
129
|
+
} else {
|
|
130
|
+
image_hash.aset("width", ruby.qnil().as_value())?;
|
|
131
|
+
}
|
|
132
|
+
if let Some(height) = image.height {
|
|
133
|
+
image_hash.aset("height", height as i64)?;
|
|
134
|
+
} else {
|
|
135
|
+
image_hash.aset("height", ruby.qnil().as_value())?;
|
|
136
|
+
}
|
|
137
|
+
if let Some(colorspace) = image.colorspace {
|
|
138
|
+
image_hash.aset("colorspace", colorspace)?;
|
|
139
|
+
} else {
|
|
140
|
+
image_hash.aset("colorspace", ruby.qnil().as_value())?;
|
|
141
|
+
}
|
|
142
|
+
if let Some(bits) = image.bits_per_component {
|
|
143
|
+
image_hash.aset("bits_per_component", bits as i64)?;
|
|
144
|
+
} else {
|
|
145
|
+
image_hash.aset("bits_per_component", ruby.qnil().as_value())?;
|
|
146
|
+
}
|
|
147
|
+
image_hash.aset(
|
|
148
|
+
"is_mask",
|
|
149
|
+
if image.is_mask {
|
|
150
|
+
ruby.qtrue().as_value()
|
|
151
|
+
} else {
|
|
152
|
+
ruby.qfalse().as_value()
|
|
153
|
+
},
|
|
154
|
+
)?;
|
|
155
|
+
if let Some(description) = image.description {
|
|
156
|
+
image_hash.aset("description", description)?;
|
|
157
|
+
} else {
|
|
158
|
+
image_hash.aset("description", ruby.qnil().as_value())?;
|
|
159
|
+
}
|
|
160
|
+
if let Some(ocr_result) = image.ocr_result {
|
|
161
|
+
let nested = extraction_result_to_ruby(ruby, *ocr_result)?;
|
|
162
|
+
image_hash.aset("ocr_result", nested.into_value_with(ruby))?;
|
|
163
|
+
} else {
|
|
164
|
+
image_hash.aset("ocr_result", ruby.qnil().as_value())?;
|
|
165
|
+
}
|
|
166
|
+
images_array.push(image_hash)?;
|
|
167
|
+
}
|
|
168
|
+
set_hash_entry(ruby, &hash, "images", images_array.into_value_with(ruby))?;
|
|
169
|
+
} else {
|
|
170
|
+
set_hash_entry(ruby, &hash, "images", ruby.qnil().as_value())?;
|
|
171
|
+
}
|
|
172
|
+
|
|
173
|
+
// Convert pages
|
|
174
|
+
if let Some(page_content_list) = result.pages {
|
|
175
|
+
let pages_array = ruby.ary_new();
|
|
176
|
+
for page_content in page_content_list {
|
|
177
|
+
let page_hash = ruby.hash_new();
|
|
178
|
+
page_hash.aset("page_number", page_content.page_number as i64)?;
|
|
179
|
+
page_hash.aset("content", page_content.content)?;
|
|
180
|
+
|
|
181
|
+
let tables_array = ruby.ary_new();
|
|
182
|
+
for table in page_content.tables {
|
|
183
|
+
let table_hash = ruby.hash_new();
|
|
184
|
+
|
|
185
|
+
let cells_array = ruby.ary_new();
|
|
186
|
+
for row in table.cells.clone() {
|
|
187
|
+
let row_array = ruby.ary_from_vec(row);
|
|
188
|
+
cells_array.push(row_array)?;
|
|
189
|
+
}
|
|
190
|
+
table_hash.aset("cells", cells_array)?;
|
|
191
|
+
table_hash.aset("markdown", table.markdown.clone())?;
|
|
192
|
+
table_hash.aset("page_number", table.page_number as i64)?;
|
|
193
|
+
|
|
194
|
+
tables_array.push(table_hash)?;
|
|
195
|
+
}
|
|
196
|
+
page_hash.aset("tables", tables_array)?;
|
|
197
|
+
|
|
198
|
+
let images_array = ruby.ary_new();
|
|
199
|
+
for image in page_content.images {
|
|
200
|
+
let image_hash = ruby.hash_new();
|
|
201
|
+
let data_value = ruby.str_from_slice(&image.data).into_value_with(ruby);
|
|
202
|
+
image_hash.aset("data", data_value)?;
|
|
203
|
+
image_hash.aset("format", image.format.clone())?;
|
|
204
|
+
image_hash.aset("image_index", image.image_index as i64)?;
|
|
205
|
+
if let Some(page) = image.page_number {
|
|
206
|
+
image_hash.aset("page_number", page as i64)?;
|
|
207
|
+
} else {
|
|
208
|
+
image_hash.aset("page_number", ruby.qnil().as_value())?;
|
|
209
|
+
}
|
|
210
|
+
if let Some(width) = image.width {
|
|
211
|
+
image_hash.aset("width", width as i64)?;
|
|
212
|
+
} else {
|
|
213
|
+
image_hash.aset("width", ruby.qnil().as_value())?;
|
|
214
|
+
}
|
|
215
|
+
if let Some(height) = image.height {
|
|
216
|
+
image_hash.aset("height", height as i64)?;
|
|
217
|
+
} else {
|
|
218
|
+
image_hash.aset("height", ruby.qnil().as_value())?;
|
|
219
|
+
}
|
|
220
|
+
if let Some(colorspace) = &image.colorspace {
|
|
221
|
+
image_hash.aset("colorspace", colorspace.clone())?;
|
|
222
|
+
} else {
|
|
223
|
+
image_hash.aset("colorspace", ruby.qnil().as_value())?;
|
|
224
|
+
}
|
|
225
|
+
if let Some(bits) = image.bits_per_component {
|
|
226
|
+
image_hash.aset("bits_per_component", bits as i64)?;
|
|
227
|
+
} else {
|
|
228
|
+
image_hash.aset("bits_per_component", ruby.qnil().as_value())?;
|
|
229
|
+
}
|
|
230
|
+
image_hash.aset(
|
|
231
|
+
"is_mask",
|
|
232
|
+
if image.is_mask {
|
|
233
|
+
ruby.qtrue().as_value()
|
|
234
|
+
} else {
|
|
235
|
+
ruby.qfalse().as_value()
|
|
236
|
+
},
|
|
237
|
+
)?;
|
|
238
|
+
if let Some(description) = &image.description {
|
|
239
|
+
image_hash.aset("description", description.clone())?;
|
|
240
|
+
} else {
|
|
241
|
+
image_hash.aset("description", ruby.qnil().as_value())?;
|
|
242
|
+
}
|
|
243
|
+
if let Some(ocr_result) = &image.ocr_result {
|
|
244
|
+
let nested = extraction_result_to_ruby(ruby, (**ocr_result).clone())?;
|
|
245
|
+
image_hash.aset("ocr_result", nested.into_value_with(ruby))?;
|
|
246
|
+
} else {
|
|
247
|
+
image_hash.aset("ocr_result", ruby.qnil().as_value())?;
|
|
248
|
+
}
|
|
249
|
+
images_array.push(image_hash)?;
|
|
250
|
+
}
|
|
251
|
+
page_hash.aset("images", images_array)?;
|
|
252
|
+
|
|
253
|
+
pages_array.push(page_hash)?;
|
|
254
|
+
}
|
|
255
|
+
set_hash_entry(ruby, &hash, "pages", pages_array.into_value_with(ruby))?;
|
|
256
|
+
} else {
|
|
257
|
+
set_hash_entry(ruby, &hash, "pages", ruby.qnil().as_value())?;
|
|
258
|
+
}
|
|
259
|
+
|
|
260
|
+
// Convert elements (element-based format)
|
|
261
|
+
if let Some(elements_list) = result.elements {
|
|
262
|
+
let elements_array = ruby.ary_new();
|
|
263
|
+
for element in elements_list {
|
|
264
|
+
let element_hash = ruby.hash_new();
|
|
265
|
+
element_hash.aset("element_id", element.element_id.as_ref())?;
|
|
266
|
+
|
|
267
|
+
// Convert ElementType to snake_case string
|
|
268
|
+
use kreuzberg::types::ElementType as ET;
|
|
269
|
+
let element_type_str = match element.element_type {
|
|
270
|
+
ET::Title => "title",
|
|
271
|
+
ET::NarrativeText => "narrative_text",
|
|
272
|
+
ET::Heading => "heading",
|
|
273
|
+
ET::ListItem => "list_item",
|
|
274
|
+
ET::Table => "table",
|
|
275
|
+
ET::Image => "image",
|
|
276
|
+
ET::PageBreak => "page_break",
|
|
277
|
+
ET::CodeBlock => "code_block",
|
|
278
|
+
ET::BlockQuote => "block_quote",
|
|
279
|
+
ET::Footer => "footer",
|
|
280
|
+
ET::Header => "header",
|
|
281
|
+
};
|
|
282
|
+
element_hash.aset("element_type", element_type_str)?;
|
|
283
|
+
element_hash.aset("text", element.text)?;
|
|
284
|
+
|
|
285
|
+
let metadata_hash = ruby.hash_new();
|
|
286
|
+
if let Some(page_num) = element.metadata.page_number {
|
|
287
|
+
metadata_hash.aset("page_number", page_num as i64)?;
|
|
288
|
+
} else {
|
|
289
|
+
metadata_hash.aset("page_number", ruby.qnil().as_value())?;
|
|
290
|
+
}
|
|
291
|
+
if let Some(filename) = &element.metadata.filename {
|
|
292
|
+
metadata_hash.aset("filename", filename.as_str())?;
|
|
293
|
+
} else {
|
|
294
|
+
metadata_hash.aset("filename", ruby.qnil().as_value())?;
|
|
295
|
+
}
|
|
296
|
+
if let Some(coords) = element.metadata.coordinates {
|
|
297
|
+
let coords_hash = ruby.hash_new();
|
|
298
|
+
coords_hash.aset("x0", coords.x0)?;
|
|
299
|
+
coords_hash.aset("y0", coords.y0)?;
|
|
300
|
+
coords_hash.aset("x1", coords.x1)?;
|
|
301
|
+
coords_hash.aset("y1", coords.y1)?;
|
|
302
|
+
metadata_hash.aset("coordinates", coords_hash)?;
|
|
303
|
+
} else {
|
|
304
|
+
metadata_hash.aset("coordinates", ruby.qnil().as_value())?;
|
|
305
|
+
}
|
|
306
|
+
if let Some(elem_idx) = element.metadata.element_index {
|
|
307
|
+
metadata_hash.aset("element_index", elem_idx as i64)?;
|
|
308
|
+
} else {
|
|
309
|
+
metadata_hash.aset("element_index", ruby.qnil().as_value())?;
|
|
310
|
+
}
|
|
311
|
+
let additional_hash = ruby.hash_new();
|
|
312
|
+
for (key, value) in &element.metadata.additional {
|
|
313
|
+
additional_hash.aset(key.as_str(), value.as_str())?;
|
|
314
|
+
}
|
|
315
|
+
metadata_hash.aset("additional", additional_hash)?;
|
|
316
|
+
|
|
317
|
+
element_hash.aset("metadata", metadata_hash)?;
|
|
318
|
+
elements_array.push(element_hash)?;
|
|
319
|
+
}
|
|
320
|
+
set_hash_entry(ruby, &hash, "elements", elements_array.into_value_with(ruby))?;
|
|
321
|
+
} else {
|
|
322
|
+
set_hash_entry(ruby, &hash, "elements", ruby.qnil().as_value())?;
|
|
323
|
+
}
|
|
324
|
+
|
|
325
|
+
Ok(hash)
|
|
326
|
+
}
|
data/lib/kreuzberg/config.rb
CHANGED
|
@@ -905,6 +905,72 @@ module Kreuzberg
|
|
|
905
905
|
self
|
|
906
906
|
end
|
|
907
907
|
|
|
908
|
+
# Assign a configuration field through hash-style syntax.
#
# Boolean flags are coerced to +true+/+false+, nested configuration fields
# are passed through +normalize_config+ together with their configuration
# class, and +max_concurrent_extractions+ is converted with +to_i+ unless
# it is nil.
#
# @param key [Symbol, String] Field name to set
# @param value [Object] Value to set
# @return [Object] The normalized value that was stored (assignment syntax
#   itself always evaluates to the right-hand side)
# @raise [ArgumentError] If the key is not a known configuration field
#
# @example
#   config = Extraction.new(use_cache: true)
#   config[:use_cache] = false
#   config[:force_ocr] = true
#
# rubocop:disable Metrics/CyclomaticComplexity, Metrics/MethodLength
def []=(key, value)
  field = key.to_sym

  normalized =
    case field
    when :use_cache, :enable_quality_processing, :force_ocr
      value ? true : false
    when :max_concurrent_extractions then value&.to_i
    when :ocr                 then normalize_config(value, OCR)
    when :chunking            then normalize_config(value, Chunking)
    when :language_detection  then normalize_config(value, LanguageDetection)
    when :pdf_options         then normalize_config(value, PDF)
    when :image_extraction    then normalize_config(value, ImageExtraction)
    when :image_preprocessing then normalize_config(value, ImagePreprocessing)
    when :postprocessor       then normalize_config(value, PostProcessor)
    when :token_reduction     then normalize_config(value, TokenReduction)
    when :keywords            then normalize_config(value, Keywords)
    when :html_options        then normalize_config(value, HtmlOptions)
    when :pages               then normalize_config(value, PageConfig)
    else
      raise ArgumentError, "Unknown configuration key: #{key}"
    end

  # Every accepted key is stored in the instance variable of the same name.
  instance_variable_set(:"@#{field}", normalized)
end
# rubocop:enable Metrics/CyclomaticComplexity, Metrics/MethodLength
|
|
958
|
+
|
|
959
|
+
# Get a configuration field using hash-like syntax
#
# Only public readers are reachable: the lookup goes through +respond_to?+
# and +public_send+, so a key can never trigger a private helper or a
# Kernel method such as +exit+ or +system+. Errors raised inside a reader
# propagate instead of being turned into nil.
#
# @param key [Symbol, String] Field name to get
# @return [Object, nil] The field value, or nil when the key does not name
#   a public method of this object
#
# @example
#   config = Extraction.new(use_cache: true)
#   config[:use_cache]  # => true
#
def [](key)
  # Keys that cannot be symbolized (e.g. nil) are treated as unknown.
  return nil unless key.respond_to?(:to_sym)

  reader = key.to_sym
  respond_to?(reader) ? public_send(reader) : nil
end
|
|
973
|
+
|
|
908
974
|
private
|
|
909
975
|
|
|
910
976
|
def normalize_config(value, klass)
|
data/lib/kreuzberg/result.rb
CHANGED
|
@@ -11,7 +11,7 @@ module Kreuzberg
|
|
|
11
11
|
# rubocop:disable Metrics/ClassLength
|
|
12
12
|
class Result
|
|
13
13
|
attr_reader :content, :mime_type, :metadata, :metadata_json, :tables,
|
|
14
|
-
:detected_languages, :chunks, :images, :pages
|
|
14
|
+
:detected_languages, :chunks, :images, :pages, :elements
|
|
15
15
|
|
|
16
16
|
# @!attribute [r] cells
|
|
17
17
|
# @return [Array<Array<String>>] Table cells (2D array)
|
|
@@ -114,6 +114,68 @@ module Kreuzberg
|
|
|
114
114
|
end
|
|
115
115
|
end
|
|
116
116
|
|
|
117
|
+
# Rectangular bounds of an element.
#
# @!attribute [r] x0
#   @return [Float] Left x-coordinate
# @!attribute [r] y0
#   @return [Float] Bottom y-coordinate
# @!attribute [r] x1
#   @return [Float] Right x-coordinate
# @!attribute [r] y1
#   @return [Float] Top y-coordinate
ElementBoundingBox = Struct.new(:x0, :y0, :x1, :y1, keyword_init: true) do
  # @return [Hash{Symbol => Object}] The four coordinates keyed by member name
  def to_h
    members.to_h { |name| [name, public_send(name)] }
  end
end
|
|
130
|
+
|
|
131
|
+
# Positional and descriptive metadata attached to an element.
#
# @!attribute [r] page_number
#   @return [Integer, nil] Page number (1-indexed)
# @!attribute [r] filename
#   @return [String, nil] Source filename or document name
# @!attribute [r] coordinates
#   @return [ElementBoundingBox, nil] Bounding box coordinates if available
# @!attribute [r] element_index
#   @return [Integer, nil] Position index in the element sequence
# @!attribute [r] additional
#   @return [Hash<String, String>] Additional custom metadata
ElementMetadataStruct = Struct.new(
  :page_number, :filename, :coordinates, :element_index, :additional,
  keyword_init: true
) do
  # @return [Hash{Symbol => Object}] All members, with +coordinates+
  #   converted through its own +to_h+ when present
  def to_h
    super().merge(coordinates: coordinates&.to_h)
  end
end
|
|
159
|
+
|
|
160
|
+
# A single extracted element together with its metadata.
#
# @!attribute [r] element_id
#   @return [String] Unique element identifier
# @!attribute [r] element_type
#   @return [String] Semantic type of the element
# @!attribute [r] text
#   @return [String] Text content of the element
# @!attribute [r] metadata
#   @return [ElementMetadataStruct] Metadata about the element
ElementStruct = Struct.new(:element_id, :element_type, :text, :metadata, keyword_init: true) do
  # @return [Hash{Symbol => Object}] All members, with +metadata+ converted
  #   through its own +to_h+ when present
  def to_h
    super().merge(metadata: metadata&.to_h)
  end
end
|
|
178
|
+
|
|
117
179
|
# Initialize from native hash result
|
|
118
180
|
#
|
|
119
181
|
# @param hash [Hash] Hash returned from native extension
|
|
@@ -128,6 +190,7 @@ module Kreuzberg
|
|
|
128
190
|
@chunks = parse_chunks(get_value(hash, 'chunks'))
|
|
129
191
|
@images = parse_images(get_value(hash, 'images'))
|
|
130
192
|
@pages = parse_pages(get_value(hash, 'pages'))
|
|
193
|
+
@elements = parse_elements(get_value(hash, 'elements'))
|
|
131
194
|
end
|
|
132
195
|
|
|
133
196
|
# Convert to hash
|
|
@@ -143,7 +206,8 @@ module Kreuzberg
|
|
|
143
206
|
detected_languages: @detected_languages,
|
|
144
207
|
chunks: serialize_chunks,
|
|
145
208
|
images: serialize_images,
|
|
146
|
-
pages: serialize_pages
|
|
209
|
+
pages: serialize_pages,
|
|
210
|
+
elements: serialize_elements
|
|
147
211
|
}
|
|
148
212
|
end
|
|
149
213
|
|
|
@@ -249,6 +313,10 @@ module Kreuzberg
|
|
|
249
313
|
@pages&.map(&:to_h)
|
|
250
314
|
end
|
|
251
315
|
|
|
316
|
+
# Convert the parsed elements into plain hashes.
#
# @return [Array<Hash>, nil] One hash per element, or nil when no elements are set
def serialize_elements
  return nil if @elements.nil?

  @elements.map(&:to_h)
end
|
|
319
|
+
|
|
252
320
|
# Look up +key+ in +hash+, accepting either the given key or its symbol form.
#
# Only +nil+ counts as "missing": a stored +false+ is returned as-is instead
# of being replaced by the symbol-key value or the default.
#
# @param hash [Hash] Hash to read from
# @param key [String] Key to look up; +key.to_sym+ is tried when the first lookup yields nil
# @param default [Object] Value returned when neither lookup yields a non-nil value
# @return [Object] The stored value, or +default+
def get_value(hash, key, default = nil)
  value = hash[key]
  value = hash[key.to_sym] if value.nil?
  value.nil? ? default : value
end
|
|
@@ -329,6 +397,43 @@ module Kreuzberg
|
|
|
329
397
|
)
|
|
330
398
|
end
|
|
331
399
|
end
|
|
400
|
+
|
|
401
|
+
# Build element objects from the raw element list.
#
# @param elements_data [Array<Hash>, nil] Raw element hashes
# @return [Array, nil] One +parse_element+ result per entry, or nil when the input is nil
def parse_elements(elements_data)
  elements_data&.map { |raw_element| parse_element(raw_element) }
end
|
|
406
|
+
|
|
407
|
+
# Build an ElementStruct (including its ElementMetadataStruct) from a raw
# element hash with string keys.
#
# @param element_hash [Hash] Raw element; a missing 'metadata' entry is treated as an empty hash
# @return [ElementStruct] The parsed element
def parse_element(element_hash)
  raw_metadata = element_hash['metadata'] || {}

  element_metadata = ElementMetadataStruct.new(
    page_number: raw_metadata['page_number'],
    filename: raw_metadata['filename'],
    coordinates: parse_element_coordinates(raw_metadata['coordinates']),
    element_index: raw_metadata['element_index'],
    # A missing 'additional' entry becomes an empty hash rather than nil.
    additional: raw_metadata['additional'] || {}
  )

  ElementStruct.new(
    element_id: element_hash['element_id'],
    element_type: element_hash['element_type'],
    text: element_hash['text'],
    metadata: element_metadata
  )
end
|
|
426
|
+
|
|
427
|
+
# Build an ElementBoundingBox from raw coordinate data with string keys.
#
# @param coordinates_data [Hash, nil] Raw coordinates keyed by 'x0', 'y0', 'x1', 'y1'
# @return [ElementBoundingBox, nil] The bounding box with every value converted
#   via +to_f+, or nil when the input is nil
def parse_element_coordinates(coordinates_data)
  return nil if coordinates_data.nil?

  corners = %w[x0 y0 x1 y1].to_h do |name|
    [name.to_sym, coordinates_data[name].to_f]
  end
  ElementBoundingBox.new(**corners)
end
|
|
332
437
|
end
|
|
333
438
|
# rubocop:enable Metrics/ClassLength
|
|
334
439
|
end
|
data/lib/kreuzberg/types.rb
CHANGED
|
@@ -3,6 +3,110 @@
|
|
|
3
3
|
require 'sorbet-runtime'
|
|
4
4
|
|
|
5
5
|
module Kreuzberg
|
|
6
|
+
# Semantic element type classification.
#
# Categorizes text content into semantic units for downstream processing.
# Supports the element types commonly found in Unstructured documents.
#
# Element types are plain strings. The recognised values are:
# 'title', 'narrative_text', 'heading', 'list_item', 'table', 'image',
# 'page_break', 'code_block', 'block_quote', 'footer' and 'header'.
#
# The alias resolves to String because +T.any+ only accepts types (classes,
# modules or +T::Types+ values), not string literals; passing literals makes
# the alias raise as soon as it is resolved.
#
# @example
#   type = 'title' # any of the values listed above
#
ElementType = T.type_alias { String }
|
|
29
|
+
|
|
30
|
+
# Rectangular coordinates locating an element within a page.
#
# All four fields are required Floats.
#
# @example
#   bbox = Kreuzberg::BoundingBox.new(
#     x0: 10.0,
#     y0: 20.0,
#     x1: 100.0,
#     y1: 50.0
#   )
#   puts "Width: #{bbox.x1 - bbox.x0}"
#
class BoundingBox < T::Struct
  extend T::Sig

  # First corner
  const :x0, Float
  const :y0, Float

  # Opposite corner
  const :x1, Float
  const :y1, Float
end
|
|
54
|
+
|
|
55
|
+
# Contextual metadata for a semantic element.
#
# Describes where an extracted element sits within the document and carries
# custom metadata fields. Every field except +additional+ may be nil.
#
# @example
#   metadata = Kreuzberg::ElementMetadata.new(
#     page_number: 1,
#     filename: "document.pdf",
#     coordinates: bbox,
#     element_index: 5,
#     additional: { "style" => "bold" }
#   )
#
class ElementMetadata < T::Struct
  extend T::Sig

  # Optional location information
  const :page_number, T.nilable(Integer)
  const :filename, T.nilable(String)
  const :coordinates, T.nilable(BoundingBox)
  const :element_index, T.nilable(Integer)

  # Custom string-to-string metadata (required)
  const :additional, T::Hash[String, String]
end
|
|
82
|
+
|
|
83
|
+
# A semantic element extracted from a document.
#
# A logical unit of content carrying a semantic classification, a unique
# identifier, and metadata for tracking origin and position.
# Compatible with Unstructured.io element format when output_format='element_based'.
#
# @example
#   element = Kreuzberg::Element.new(
#     element_id: "elem-abc123",
#     element_type: "narrative_text",
#     text: "This is the main content.",
#     metadata: metadata
#   )
#   puts "#{element.element_type}: #{element.text}"
#
class Element < T::Struct
  extend T::Sig

  # Identity and classification
  const :element_id, String
  const :element_type, String

  # Content and its metadata
  const :text, String
  const :metadata, ElementMetadata
end
|
|
109
|
+
|
|
6
110
|
# Header/Heading metadata
|
|
7
111
|
#
|
|
8
112
|
# Represents a heading element found in the HTML document
|
data/lib/kreuzberg/version.rb
CHANGED
data/lib/kreuzberg.rb
CHANGED