kreuzberg 4.0.8 → 4.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
- data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
- data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
- data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
- data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
- data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
- data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
- data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
- data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
- data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
- data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
- data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
- data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
- data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
- data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
- data/ext/kreuzberg_rb/native/src/result.rs +326 -0
- data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
- data/lib/kreuzberg/config.rb +66 -0
- data/lib/kreuzberg/result.rb +107 -2
- data/lib/kreuzberg/types.rb +104 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +0 -4
- data/sig/kreuzberg.rbs +105 -1
- data/vendor/Cargo.toml +3 -3
- data/vendor/kreuzberg/Cargo.toml +4 -3
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/api/config.rs +69 -0
- data/vendor/kreuzberg/src/api/handlers.rs +99 -2
- data/vendor/kreuzberg/src/api/mod.rs +14 -7
- data/vendor/kreuzberg/src/api/router.rs +214 -0
- data/vendor/kreuzberg/src/api/startup.rs +243 -0
- data/vendor/kreuzberg/src/api/types.rs +78 -0
- data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
- data/vendor/kreuzberg/src/cache/core.rs +428 -0
- data/vendor/kreuzberg/src/cache/mod.rs +21 -843
- data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
- data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
- data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
- data/vendor/kreuzberg/src/chunking/config.rs +52 -0
- data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
- data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
- data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
- data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
- data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
- data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
- data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
- data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
- data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
- data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
- data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
- data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
- data/vendor/kreuzberg/src/core/config/page.rs +57 -0
- data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
- data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
- data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
- data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
- data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
- data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
- data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
- data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
- data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
- data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
- data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
- data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
- data/vendor/kreuzberg/src/core/mod.rs +4 -1
- data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
- data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
- data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
- data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
- data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
- data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
- data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
- data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
- data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
- data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
- data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
- data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
- data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
- data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
- data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
- data/vendor/kreuzberg/src/embeddings.rs +136 -13
- data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
- data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
- data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
- data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
- data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
- data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
- data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
- data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
- data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
- data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
- data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
- data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
- data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
- data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
- data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
- data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
- data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
- data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
- data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
- data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
- data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
- data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
- data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
- data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
- data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
- data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
- data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
- data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
- data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
- data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
- data/vendor/kreuzberg/src/extractors/email.rs +2 -0
- data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
- data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
- data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
- data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
- data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
- data/vendor/kreuzberg/src/extractors/html.rs +80 -8
- data/vendor/kreuzberg/src/extractors/image.rs +8 -1
- data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
- data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
- data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
- data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
- data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
- data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
- data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
- data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
- data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
- data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
- data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
- data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
- data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
- data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
- data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
- data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
- data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
- data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
- data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
- data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
- data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
- data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
- data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
- data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
- data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
- data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
- data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
- data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
- data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
- data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
- data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
- data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
- data/vendor/kreuzberg/src/extractors/text.rs +4 -0
- data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
- data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
- data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
- data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
- data/vendor/kreuzberg/src/lib.rs +2 -2
- data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
- data/vendor/kreuzberg/src/mcp/format.rs +211 -0
- data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
- data/vendor/kreuzberg/src/mcp/params.rs +196 -0
- data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
- data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
- data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
- data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
- data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
- data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
- data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
- data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
- data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
- data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
- data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
- data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
- data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
- data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
- data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
- data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
- data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
- data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
- data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
- data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
- data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
- data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
- data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
- data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
- data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
- data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
- data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
- data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
- data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
- data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
- data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
- data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
- data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
- data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
- data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
- data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
- data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
- data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
- data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
- data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
- data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
- data/vendor/kreuzberg/src/text/quality.rs +1 -1
- data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
- data/vendor/kreuzberg/src/types/djot.rs +209 -0
- data/vendor/kreuzberg/src/types/extraction.rs +301 -0
- data/vendor/kreuzberg/src/types/formats.rs +443 -0
- data/vendor/kreuzberg/src/types/metadata.rs +560 -0
- data/vendor/kreuzberg/src/types/mod.rs +281 -0
- data/vendor/kreuzberg/src/types/page.rs +182 -0
- data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
- data/vendor/kreuzberg/src/types/tables.rs +39 -0
- data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
- data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
- data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
- data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
- data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
- data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
- data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
- data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
- data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
- data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
- data/vendor/kreuzberg/tests/api_embed.rs +6 -9
- data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
- data/vendor/kreuzberg/tests/core_integration.rs +1 -0
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
- data/vendor/kreuzberg/tests/format_integration.rs +2 -0
- data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
- data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
- data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
- data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
- data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
- data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
- data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
- data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
- data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
- data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
- data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
- data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
- data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
- data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
- data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
- data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
- data/vendor/kreuzberg-ffi/src/error.rs +46 -14
- data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
- data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
- data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
- data/vendor/kreuzberg-ffi/src/result.rs +148 -122
- data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
- data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
- metadata +200 -28
- data/vendor/kreuzberg/src/api/server.rs +0 -518
- data/vendor/kreuzberg/src/core/config.rs +0 -1914
- data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
- data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
- data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
- data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
- data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
- data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
- data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
- data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
- data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
- data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
- data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
- data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
- data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
- data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
- data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
- data/vendor/kreuzberg/src/types.rs +0 -1713
- data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
- data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
|
@@ -0,0 +1,1058 @@
|
|
|
1
|
+
//! Configuration parsing and conversion for Ruby bindings
|
|
2
|
+
//!
|
|
3
|
+
//! Handles conversion between Ruby Hash configurations and Rust config types.
|
|
4
|
+
//! Includes parsing for all nested configuration structures.
|
|
5
|
+
|
|
6
|
+
use crate::error_handling::{runtime_error, validation_error};
|
|
7
|
+
use crate::helpers::{get_kw, json_value_to_ruby, ruby_value_to_json, symbol_to_string};
|
|
8
|
+
|
|
9
|
+
use html_to_markdown_rs::options::{
|
|
10
|
+
CodeBlockStyle, ConversionOptions, HeadingStyle, HighlightStyle, ListIndentType, NewlineStyle,
|
|
11
|
+
PreprocessingPreset,
|
|
12
|
+
};
|
|
13
|
+
use html_to_markdown_rs::WhitespaceMode;
|
|
14
|
+
use kreuzberg::core::config::PageConfig;
|
|
15
|
+
use kreuzberg::keywords::{
|
|
16
|
+
KeywordAlgorithm as RustKeywordAlgorithm, KeywordConfig as RustKeywordConfig, RakeParams as RustRakeParams,
|
|
17
|
+
YakeParams as RustYakeParams,
|
|
18
|
+
};
|
|
19
|
+
use kreuzberg::types::TesseractConfig as RustTesseractConfig;
|
|
20
|
+
use kreuzberg::pdf::HierarchyConfig;
|
|
21
|
+
use kreuzberg::{
|
|
22
|
+
ChunkingConfig, EmbeddingConfig, ExtractionConfig, ImageExtractionConfig, ImagePreprocessingConfig,
|
|
23
|
+
LanguageDetectionConfig, OcrConfig, OutputFormat, PdfConfig, PostProcessorConfig, TokenReductionConfig,
|
|
24
|
+
};
|
|
25
|
+
use magnus::{Error, RArray, RHash, Ruby, TryConvert, Value};
|
|
26
|
+
use magnus::value::ReprValue;
|
|
27
|
+
use std::fs;
|
|
28
|
+
|
|
29
|
+
/// Parse OcrConfig from Ruby Hash
|
|
30
|
+
pub fn parse_ocr_config(ruby: &Ruby, hash: RHash) -> Result<OcrConfig, Error> {
|
|
31
|
+
let backend = if let Some(val) = get_kw(ruby, hash, "backend") {
|
|
32
|
+
symbol_to_string(val)?
|
|
33
|
+
} else {
|
|
34
|
+
"tesseract".to_string()
|
|
35
|
+
};
|
|
36
|
+
|
|
37
|
+
let language = if let Some(val) = get_kw(ruby, hash, "language") {
|
|
38
|
+
symbol_to_string(val)?
|
|
39
|
+
} else {
|
|
40
|
+
"eng".to_string()
|
|
41
|
+
};
|
|
42
|
+
|
|
43
|
+
let mut config = OcrConfig {
|
|
44
|
+
backend,
|
|
45
|
+
language,
|
|
46
|
+
tesseract_config: None,
|
|
47
|
+
output_format: None,
|
|
48
|
+
};
|
|
49
|
+
|
|
50
|
+
if let Some(val) = get_kw(ruby, hash, "tesseract_config")
|
|
51
|
+
&& !val.is_nil()
|
|
52
|
+
{
|
|
53
|
+
let tc_json = ruby_value_to_json(val)?;
|
|
54
|
+
let parsed: RustTesseractConfig =
|
|
55
|
+
serde_json::from_value(tc_json).map_err(|e| runtime_error(format!("Invalid tesseract_config: {}", e)))?;
|
|
56
|
+
config.tesseract_config = Some(parsed);
|
|
57
|
+
}
|
|
58
|
+
|
|
59
|
+
Ok(config)
|
|
60
|
+
}
|
|
61
|
+
|
|
62
|
+
/// Parse ChunkingConfig from Ruby Hash
|
|
63
|
+
pub fn parse_chunking_config(ruby: &Ruby, hash: RHash) -> Result<ChunkingConfig, Error> {
|
|
64
|
+
let max_chars = if let Some(val) = get_kw(ruby, hash, "max_chars") {
|
|
65
|
+
usize::try_convert(val)?
|
|
66
|
+
} else {
|
|
67
|
+
1000
|
|
68
|
+
};
|
|
69
|
+
|
|
70
|
+
let max_overlap = if let Some(val) = get_kw(ruby, hash, "max_overlap") {
|
|
71
|
+
usize::try_convert(val)?
|
|
72
|
+
} else {
|
|
73
|
+
200
|
|
74
|
+
};
|
|
75
|
+
|
|
76
|
+
let preset = if let Some(val) = get_kw(ruby, hash, "preset")
|
|
77
|
+
&& !val.is_nil()
|
|
78
|
+
{
|
|
79
|
+
Some(symbol_to_string(val)?)
|
|
80
|
+
} else {
|
|
81
|
+
None
|
|
82
|
+
};
|
|
83
|
+
|
|
84
|
+
let embedding = if let Some(val) = get_kw(ruby, hash, "embedding")
|
|
85
|
+
&& !val.is_nil()
|
|
86
|
+
{
|
|
87
|
+
let json_value = ruby_value_to_json(val)?;
|
|
88
|
+
let parsed: EmbeddingConfig = serde_json::from_value(json_value)
|
|
89
|
+
.map_err(|e| runtime_error(format!("Invalid chunking.embedding: {}", e)))?;
|
|
90
|
+
Some(parsed)
|
|
91
|
+
} else {
|
|
92
|
+
None
|
|
93
|
+
};
|
|
94
|
+
|
|
95
|
+
let config = ChunkingConfig {
|
|
96
|
+
max_chars,
|
|
97
|
+
max_overlap,
|
|
98
|
+
embedding,
|
|
99
|
+
preset,
|
|
100
|
+
};
|
|
101
|
+
|
|
102
|
+
Ok(config)
|
|
103
|
+
}
|
|
104
|
+
|
|
105
|
+
/// Parse LanguageDetectionConfig from Ruby Hash
|
|
106
|
+
pub fn parse_language_detection_config(ruby: &Ruby, hash: RHash) -> Result<LanguageDetectionConfig, Error> {
|
|
107
|
+
let enabled = if let Some(val) = get_kw(ruby, hash, "enabled") {
|
|
108
|
+
bool::try_convert(val)?
|
|
109
|
+
} else {
|
|
110
|
+
true
|
|
111
|
+
};
|
|
112
|
+
|
|
113
|
+
let min_confidence = if let Some(val) = get_kw(ruby, hash, "min_confidence") {
|
|
114
|
+
f64::try_convert(val)?
|
|
115
|
+
} else {
|
|
116
|
+
0.8
|
|
117
|
+
};
|
|
118
|
+
|
|
119
|
+
let detect_multiple = if let Some(val) = get_kw(ruby, hash, "detect_multiple") {
|
|
120
|
+
bool::try_convert(val)?
|
|
121
|
+
} else {
|
|
122
|
+
false
|
|
123
|
+
};
|
|
124
|
+
|
|
125
|
+
let config = LanguageDetectionConfig {
|
|
126
|
+
enabled,
|
|
127
|
+
min_confidence,
|
|
128
|
+
detect_multiple,
|
|
129
|
+
};
|
|
130
|
+
|
|
131
|
+
Ok(config)
|
|
132
|
+
}
|
|
133
|
+
|
|
134
|
+
/// Parse HierarchyConfig from Ruby Hash
|
|
135
|
+
pub fn parse_hierarchy_config(ruby: &Ruby, hash: RHash) -> Result<HierarchyConfig, Error> {
|
|
136
|
+
let enabled = if let Some(val) = get_kw(ruby, hash, "enabled") {
|
|
137
|
+
bool::try_convert(val)?
|
|
138
|
+
} else {
|
|
139
|
+
true
|
|
140
|
+
};
|
|
141
|
+
|
|
142
|
+
let k_clusters = if let Some(val) = get_kw(ruby, hash, "k_clusters") {
|
|
143
|
+
usize::try_convert(val)?
|
|
144
|
+
} else {
|
|
145
|
+
6
|
|
146
|
+
};
|
|
147
|
+
|
|
148
|
+
let include_bbox = if let Some(val) = get_kw(ruby, hash, "include_bbox") {
|
|
149
|
+
bool::try_convert(val)?
|
|
150
|
+
} else {
|
|
151
|
+
true
|
|
152
|
+
};
|
|
153
|
+
|
|
154
|
+
let ocr_coverage_threshold = if let Some(val) = get_kw(ruby, hash, "ocr_coverage_threshold") {
|
|
155
|
+
if !val.is_nil() {
|
|
156
|
+
Some(f64::try_convert(val)? as f32)
|
|
157
|
+
} else {
|
|
158
|
+
None
|
|
159
|
+
}
|
|
160
|
+
} else {
|
|
161
|
+
None
|
|
162
|
+
};
|
|
163
|
+
|
|
164
|
+
let config = HierarchyConfig {
|
|
165
|
+
enabled,
|
|
166
|
+
k_clusters,
|
|
167
|
+
include_bbox,
|
|
168
|
+
ocr_coverage_threshold,
|
|
169
|
+
};
|
|
170
|
+
|
|
171
|
+
Ok(config)
|
|
172
|
+
}
|
|
173
|
+
|
|
174
|
+
/// Parse PdfConfig from Ruby Hash
|
|
175
|
+
pub fn parse_pdf_config(ruby: &Ruby, hash: RHash) -> Result<PdfConfig, Error> {
|
|
176
|
+
let extract_images = if let Some(val) = get_kw(ruby, hash, "extract_images") {
|
|
177
|
+
bool::try_convert(val)?
|
|
178
|
+
} else {
|
|
179
|
+
false
|
|
180
|
+
};
|
|
181
|
+
|
|
182
|
+
let passwords = if let Some(val) = get_kw(ruby, hash, "passwords") {
|
|
183
|
+
if !val.is_nil() {
|
|
184
|
+
let arr = RArray::try_convert(val)?;
|
|
185
|
+
Some(arr.to_vec::<String>()?)
|
|
186
|
+
} else {
|
|
187
|
+
None
|
|
188
|
+
}
|
|
189
|
+
} else {
|
|
190
|
+
None
|
|
191
|
+
};
|
|
192
|
+
|
|
193
|
+
let extract_metadata = if let Some(val) = get_kw(ruby, hash, "extract_metadata") {
|
|
194
|
+
bool::try_convert(val)?
|
|
195
|
+
} else {
|
|
196
|
+
true
|
|
197
|
+
};
|
|
198
|
+
|
|
199
|
+
let hierarchy = if let Some(val) = get_kw(ruby, hash, "hierarchy") {
|
|
200
|
+
if !val.is_nil() {
|
|
201
|
+
let h_hash = RHash::try_convert(val)?;
|
|
202
|
+
Some(parse_hierarchy_config(ruby, h_hash)?)
|
|
203
|
+
} else {
|
|
204
|
+
None
|
|
205
|
+
}
|
|
206
|
+
} else {
|
|
207
|
+
None
|
|
208
|
+
};
|
|
209
|
+
|
|
210
|
+
let config = PdfConfig {
|
|
211
|
+
extract_images,
|
|
212
|
+
passwords,
|
|
213
|
+
extract_metadata,
|
|
214
|
+
hierarchy,
|
|
215
|
+
};
|
|
216
|
+
|
|
217
|
+
Ok(config)
|
|
218
|
+
}
|
|
219
|
+
|
|
220
|
+
/// Parse ImageExtractionConfig from Ruby Hash
|
|
221
|
+
pub fn parse_image_extraction_config(ruby: &Ruby, hash: RHash) -> Result<ImageExtractionConfig, Error> {
|
|
222
|
+
let extract_images = if let Some(val) = get_kw(ruby, hash, "extract_images") {
|
|
223
|
+
bool::try_convert(val)?
|
|
224
|
+
} else {
|
|
225
|
+
true
|
|
226
|
+
};
|
|
227
|
+
|
|
228
|
+
let target_dpi = if let Some(val) = get_kw(ruby, hash, "target_dpi") {
|
|
229
|
+
i32::try_convert(val)?
|
|
230
|
+
} else {
|
|
231
|
+
300
|
|
232
|
+
};
|
|
233
|
+
|
|
234
|
+
let max_image_dimension = if let Some(val) = get_kw(ruby, hash, "max_image_dimension") {
|
|
235
|
+
i32::try_convert(val)?
|
|
236
|
+
} else {
|
|
237
|
+
4096
|
|
238
|
+
};
|
|
239
|
+
|
|
240
|
+
let auto_adjust_dpi = if let Some(val) = get_kw(ruby, hash, "auto_adjust_dpi") {
|
|
241
|
+
bool::try_convert(val)?
|
|
242
|
+
} else {
|
|
243
|
+
true
|
|
244
|
+
};
|
|
245
|
+
|
|
246
|
+
let min_dpi = if let Some(val) = get_kw(ruby, hash, "min_dpi") {
|
|
247
|
+
i32::try_convert(val)?
|
|
248
|
+
} else {
|
|
249
|
+
72
|
|
250
|
+
};
|
|
251
|
+
|
|
252
|
+
let max_dpi = if let Some(val) = get_kw(ruby, hash, "max_dpi") {
|
|
253
|
+
i32::try_convert(val)?
|
|
254
|
+
} else {
|
|
255
|
+
600
|
|
256
|
+
};
|
|
257
|
+
|
|
258
|
+
let config = ImageExtractionConfig {
|
|
259
|
+
extract_images,
|
|
260
|
+
target_dpi,
|
|
261
|
+
max_image_dimension,
|
|
262
|
+
auto_adjust_dpi,
|
|
263
|
+
min_dpi,
|
|
264
|
+
max_dpi,
|
|
265
|
+
};
|
|
266
|
+
|
|
267
|
+
Ok(config)
|
|
268
|
+
}
|
|
269
|
+
|
|
270
|
+
/// Parse ImagePreprocessingConfig from Ruby Hash
|
|
271
|
+
///
|
|
272
|
+
/// Note: Currently not used in ExtractionConfig but provided for completeness.
|
|
273
|
+
/// ImagePreprocessingConfig is typically used in OCR operations.
|
|
274
|
+
#[allow(dead_code)]
|
|
275
|
+
pub fn parse_image_preprocessing_config(ruby: &Ruby, hash: RHash) -> Result<ImagePreprocessingConfig, Error> {
|
|
276
|
+
let target_dpi = if let Some(val) = get_kw(ruby, hash, "target_dpi") {
|
|
277
|
+
i32::try_convert(val)?
|
|
278
|
+
} else {
|
|
279
|
+
300
|
|
280
|
+
};
|
|
281
|
+
|
|
282
|
+
let auto_rotate = if let Some(val) = get_kw(ruby, hash, "auto_rotate") {
|
|
283
|
+
bool::try_convert(val)?
|
|
284
|
+
} else {
|
|
285
|
+
true
|
|
286
|
+
};
|
|
287
|
+
|
|
288
|
+
let deskew = if let Some(val) = get_kw(ruby, hash, "deskew") {
|
|
289
|
+
bool::try_convert(val)?
|
|
290
|
+
} else {
|
|
291
|
+
true
|
|
292
|
+
};
|
|
293
|
+
|
|
294
|
+
let denoise = if let Some(val) = get_kw(ruby, hash, "denoise") {
|
|
295
|
+
bool::try_convert(val)?
|
|
296
|
+
} else {
|
|
297
|
+
false
|
|
298
|
+
};
|
|
299
|
+
|
|
300
|
+
let contrast_enhance = if let Some(val) = get_kw(ruby, hash, "contrast_enhance") {
|
|
301
|
+
bool::try_convert(val)?
|
|
302
|
+
} else {
|
|
303
|
+
false
|
|
304
|
+
};
|
|
305
|
+
|
|
306
|
+
let binarization_method = if let Some(val) = get_kw(ruby, hash, "binarization_method") {
|
|
307
|
+
symbol_to_string(val)?
|
|
308
|
+
} else {
|
|
309
|
+
"otsu".to_string()
|
|
310
|
+
};
|
|
311
|
+
|
|
312
|
+
let invert_colors = if let Some(val) = get_kw(ruby, hash, "invert_colors") {
|
|
313
|
+
bool::try_convert(val)?
|
|
314
|
+
} else {
|
|
315
|
+
false
|
|
316
|
+
};
|
|
317
|
+
|
|
318
|
+
let config = ImagePreprocessingConfig {
|
|
319
|
+
target_dpi,
|
|
320
|
+
auto_rotate,
|
|
321
|
+
deskew,
|
|
322
|
+
denoise,
|
|
323
|
+
contrast_enhance,
|
|
324
|
+
binarization_method,
|
|
325
|
+
invert_colors,
|
|
326
|
+
};
|
|
327
|
+
|
|
328
|
+
Ok(config)
|
|
329
|
+
}
|
|
330
|
+
|
|
331
|
+
/// Parse PostProcessorConfig from Ruby Hash
|
|
332
|
+
pub fn parse_postprocessor_config(ruby: &Ruby, hash: RHash) -> Result<PostProcessorConfig, Error> {
|
|
333
|
+
let enabled = if let Some(val) = get_kw(ruby, hash, "enabled") {
|
|
334
|
+
bool::try_convert(val)?
|
|
335
|
+
} else {
|
|
336
|
+
true
|
|
337
|
+
};
|
|
338
|
+
|
|
339
|
+
let enabled_processors = if let Some(val) = get_kw(ruby, hash, "enabled_processors")
|
|
340
|
+
&& !val.is_nil()
|
|
341
|
+
{
|
|
342
|
+
let arr = RArray::try_convert(val)?;
|
|
343
|
+
Some(arr.to_vec::<String>()?)
|
|
344
|
+
} else {
|
|
345
|
+
None
|
|
346
|
+
};
|
|
347
|
+
|
|
348
|
+
let disabled_processors = if let Some(val) = get_kw(ruby, hash, "disabled_processors")
|
|
349
|
+
&& !val.is_nil()
|
|
350
|
+
{
|
|
351
|
+
let arr = RArray::try_convert(val)?;
|
|
352
|
+
Some(arr.to_vec::<String>()?)
|
|
353
|
+
} else {
|
|
354
|
+
None
|
|
355
|
+
};
|
|
356
|
+
|
|
357
|
+
let config = PostProcessorConfig {
|
|
358
|
+
enabled,
|
|
359
|
+
enabled_processors,
|
|
360
|
+
disabled_processors,
|
|
361
|
+
enabled_set: None,
|
|
362
|
+
disabled_set: None,
|
|
363
|
+
};
|
|
364
|
+
|
|
365
|
+
Ok(config)
|
|
366
|
+
}
|
|
367
|
+
|
|
368
|
+
/// Parse TokenReductionConfig from Ruby Hash
|
|
369
|
+
pub fn parse_token_reduction_config(ruby: &Ruby, hash: RHash) -> Result<TokenReductionConfig, Error> {
|
|
370
|
+
let mode = if let Some(val) = get_kw(ruby, hash, "mode") {
|
|
371
|
+
symbol_to_string(val)?
|
|
372
|
+
} else {
|
|
373
|
+
"off".to_string()
|
|
374
|
+
};
|
|
375
|
+
|
|
376
|
+
let preserve_important_words = if let Some(val) = get_kw(ruby, hash, "preserve_important_words") {
|
|
377
|
+
bool::try_convert(val)?
|
|
378
|
+
} else {
|
|
379
|
+
true
|
|
380
|
+
};
|
|
381
|
+
|
|
382
|
+
let config = TokenReductionConfig {
|
|
383
|
+
mode,
|
|
384
|
+
preserve_important_words,
|
|
385
|
+
};
|
|
386
|
+
|
|
387
|
+
Ok(config)
|
|
388
|
+
}
|
|
389
|
+
|
|
390
|
+
/// Parse KeywordConfig from Ruby Hash
|
|
391
|
+
pub fn parse_keyword_config(ruby: &Ruby, hash: RHash) -> Result<RustKeywordConfig, Error> {
|
|
392
|
+
let mut config = RustKeywordConfig::default();
|
|
393
|
+
|
|
394
|
+
if let Some(val) = get_kw(ruby, hash, "algorithm") {
|
|
395
|
+
let algo = symbol_to_string(val)?;
|
|
396
|
+
config.algorithm = match algo.to_lowercase().as_str() {
|
|
397
|
+
"yake" => RustKeywordAlgorithm::Yake,
|
|
398
|
+
"rake" => RustKeywordAlgorithm::Rake,
|
|
399
|
+
other => {
|
|
400
|
+
return Err(runtime_error(format!(
|
|
401
|
+
"Invalid keywords.algorithm '{}', expected 'yake' or 'rake'",
|
|
402
|
+
other
|
|
403
|
+
)));
|
|
404
|
+
}
|
|
405
|
+
};
|
|
406
|
+
}
|
|
407
|
+
|
|
408
|
+
if let Some(val) = get_kw(ruby, hash, "max_keywords") {
|
|
409
|
+
config.max_keywords = usize::try_convert(val)?;
|
|
410
|
+
}
|
|
411
|
+
|
|
412
|
+
if let Some(val) = get_kw(ruby, hash, "min_score") {
|
|
413
|
+
config.min_score = f64::try_convert(val)? as f32;
|
|
414
|
+
}
|
|
415
|
+
|
|
416
|
+
if let Some(val) = get_kw(ruby, hash, "ngram_range") {
|
|
417
|
+
let ary = RArray::try_convert(val)?;
|
|
418
|
+
if ary.len() == 2 {
|
|
419
|
+
let values = ary.to_vec::<i64>()?;
|
|
420
|
+
config.ngram_range = (values[0] as usize, values[1] as usize);
|
|
421
|
+
} else {
|
|
422
|
+
return Err(runtime_error("keywords.ngram_range must have exactly two values"));
|
|
423
|
+
}
|
|
424
|
+
}
|
|
425
|
+
|
|
426
|
+
if let Some(val) = get_kw(ruby, hash, "language")
|
|
427
|
+
&& !val.is_nil()
|
|
428
|
+
{
|
|
429
|
+
config.language = Some(symbol_to_string(val)?);
|
|
430
|
+
}
|
|
431
|
+
|
|
432
|
+
if let Some(val) = get_kw(ruby, hash, "yake_params")
|
|
433
|
+
&& !val.is_nil()
|
|
434
|
+
{
|
|
435
|
+
let yake_hash = RHash::try_convert(val)?;
|
|
436
|
+
let window = if let Some(window_val) = get_kw(ruby, yake_hash, "window_size") {
|
|
437
|
+
usize::try_convert(window_val)?
|
|
438
|
+
} else {
|
|
439
|
+
2
|
|
440
|
+
};
|
|
441
|
+
config.yake_params = Some(RustYakeParams { window_size: window });
|
|
442
|
+
}
|
|
443
|
+
|
|
444
|
+
if let Some(val) = get_kw(ruby, hash, "rake_params")
|
|
445
|
+
&& !val.is_nil()
|
|
446
|
+
{
|
|
447
|
+
let rake_hash = RHash::try_convert(val)?;
|
|
448
|
+
let mut params = RustRakeParams::default();
|
|
449
|
+
if let Some(val) = get_kw(ruby, rake_hash, "min_word_length") {
|
|
450
|
+
params.min_word_length = usize::try_convert(val)?;
|
|
451
|
+
}
|
|
452
|
+
if let Some(val) = get_kw(ruby, rake_hash, "max_words_per_phrase") {
|
|
453
|
+
params.max_words_per_phrase = usize::try_convert(val)?;
|
|
454
|
+
}
|
|
455
|
+
config.rake_params = Some(params);
|
|
456
|
+
}
|
|
457
|
+
|
|
458
|
+
Ok(config)
|
|
459
|
+
}
|
|
460
|
+
|
|
461
|
+
/// Parse HTML conversion options from Ruby Hash
|
|
462
|
+
pub fn parse_html_options(ruby: &Ruby, hash: RHash) -> Result<ConversionOptions, Error> {
|
|
463
|
+
let mut options = ConversionOptions::default();
|
|
464
|
+
|
|
465
|
+
if let Some(val) = get_kw(ruby, hash, "heading_style") {
|
|
466
|
+
let style = symbol_to_string(val)?;
|
|
467
|
+
options.heading_style = match style.to_lowercase().as_str() {
|
|
468
|
+
"atx" => HeadingStyle::Atx,
|
|
469
|
+
"underlined" => HeadingStyle::Underlined,
|
|
470
|
+
"atx_closed" | "atx-closed" => HeadingStyle::AtxClosed,
|
|
471
|
+
other => return Err(runtime_error(format!("Invalid html_options.heading_style '{}'", other))),
|
|
472
|
+
};
|
|
473
|
+
}
|
|
474
|
+
|
|
475
|
+
if let Some(val) = get_kw(ruby, hash, "list_indent_type") {
|
|
476
|
+
let val_str = symbol_to_string(val)?;
|
|
477
|
+
options.list_indent_type = match val_str.to_lowercase().as_str() {
|
|
478
|
+
"spaces" => ListIndentType::Spaces,
|
|
479
|
+
"tabs" => ListIndentType::Tabs,
|
|
480
|
+
other => {
|
|
481
|
+
return Err(runtime_error(format!(
|
|
482
|
+
"Invalid html_options.list_indent_type '{}'",
|
|
483
|
+
other
|
|
484
|
+
)));
|
|
485
|
+
}
|
|
486
|
+
};
|
|
487
|
+
}
|
|
488
|
+
|
|
489
|
+
if let Some(val) = get_kw(ruby, hash, "list_indent_width") {
|
|
490
|
+
options.list_indent_width = usize::try_convert(val)?;
|
|
491
|
+
}
|
|
492
|
+
|
|
493
|
+
if let Some(val) = get_kw(ruby, hash, "bullets") {
|
|
494
|
+
options.bullets = String::try_convert(val)?;
|
|
495
|
+
}
|
|
496
|
+
|
|
497
|
+
if let Some(val) = get_kw(ruby, hash, "strong_em_symbol") {
|
|
498
|
+
let symbol = String::try_convert(val)?;
|
|
499
|
+
let mut chars = symbol.chars();
|
|
500
|
+
options.strong_em_symbol = chars
|
|
501
|
+
.next()
|
|
502
|
+
.ok_or_else(|| runtime_error("html_options.strong_em_symbol must not be empty"))?;
|
|
503
|
+
}
|
|
504
|
+
|
|
505
|
+
if let Some(val) = get_kw(ruby, hash, "escape_asterisks") {
|
|
506
|
+
options.escape_asterisks = bool::try_convert(val)?;
|
|
507
|
+
}
|
|
508
|
+
if let Some(val) = get_kw(ruby, hash, "escape_underscores") {
|
|
509
|
+
options.escape_underscores = bool::try_convert(val)?;
|
|
510
|
+
}
|
|
511
|
+
if let Some(val) = get_kw(ruby, hash, "escape_misc") {
|
|
512
|
+
options.escape_misc = bool::try_convert(val)?;
|
|
513
|
+
}
|
|
514
|
+
if let Some(val) = get_kw(ruby, hash, "escape_ascii") {
|
|
515
|
+
options.escape_ascii = bool::try_convert(val)?;
|
|
516
|
+
}
|
|
517
|
+
|
|
518
|
+
if let Some(val) = get_kw(ruby, hash, "code_language") {
|
|
519
|
+
options.code_language = String::try_convert(val)?;
|
|
520
|
+
}
|
|
521
|
+
|
|
522
|
+
if let Some(val) = get_kw(ruby, hash, "autolinks") {
|
|
523
|
+
options.autolinks = bool::try_convert(val)?;
|
|
524
|
+
}
|
|
525
|
+
|
|
526
|
+
if let Some(val) = get_kw(ruby, hash, "default_title") {
|
|
527
|
+
options.default_title = bool::try_convert(val)?;
|
|
528
|
+
}
|
|
529
|
+
|
|
530
|
+
if let Some(val) = get_kw(ruby, hash, "br_in_tables") {
|
|
531
|
+
options.br_in_tables = bool::try_convert(val)?;
|
|
532
|
+
}
|
|
533
|
+
|
|
534
|
+
if let Some(val) = get_kw(ruby, hash, "hocr_spatial_tables") {
|
|
535
|
+
options.hocr_spatial_tables = bool::try_convert(val)?;
|
|
536
|
+
}
|
|
537
|
+
|
|
538
|
+
if let Some(val) = get_kw(ruby, hash, "highlight_style") {
|
|
539
|
+
let style = symbol_to_string(val)?;
|
|
540
|
+
options.highlight_style = match style.to_lowercase().as_str() {
|
|
541
|
+
"double_equal" | "double-equal" => HighlightStyle::DoubleEqual,
|
|
542
|
+
"html" => HighlightStyle::Html,
|
|
543
|
+
"bold" => HighlightStyle::Bold,
|
|
544
|
+
"none" => HighlightStyle::None,
|
|
545
|
+
other => {
|
|
546
|
+
return Err(runtime_error(format!(
|
|
547
|
+
"Invalid html_options.highlight_style '{}'",
|
|
548
|
+
other
|
|
549
|
+
)));
|
|
550
|
+
}
|
|
551
|
+
};
|
|
552
|
+
}
|
|
553
|
+
|
|
554
|
+
if let Some(val) = get_kw(ruby, hash, "extract_metadata") {
|
|
555
|
+
options.extract_metadata = bool::try_convert(val)?;
|
|
556
|
+
}
|
|
557
|
+
|
|
558
|
+
if let Some(val) = get_kw(ruby, hash, "whitespace_mode") {
|
|
559
|
+
let mode = symbol_to_string(val)?;
|
|
560
|
+
options.whitespace_mode = match mode.to_lowercase().as_str() {
|
|
561
|
+
"normalized" => WhitespaceMode::Normalized,
|
|
562
|
+
"strict" => WhitespaceMode::Strict,
|
|
563
|
+
other => {
|
|
564
|
+
return Err(runtime_error(format!(
|
|
565
|
+
"Invalid html_options.whitespace_mode '{}'",
|
|
566
|
+
other
|
|
567
|
+
)));
|
|
568
|
+
}
|
|
569
|
+
};
|
|
570
|
+
}
|
|
571
|
+
|
|
572
|
+
if let Some(val) = get_kw(ruby, hash, "strip_newlines") {
|
|
573
|
+
options.strip_newlines = bool::try_convert(val)?;
|
|
574
|
+
}
|
|
575
|
+
|
|
576
|
+
if let Some(val) = get_kw(ruby, hash, "wrap") {
|
|
577
|
+
options.wrap = bool::try_convert(val)?;
|
|
578
|
+
}
|
|
579
|
+
|
|
580
|
+
if let Some(val) = get_kw(ruby, hash, "wrap_width") {
|
|
581
|
+
options.wrap_width = usize::try_convert(val)?;
|
|
582
|
+
}
|
|
583
|
+
|
|
584
|
+
if let Some(val) = get_kw(ruby, hash, "convert_as_inline") {
|
|
585
|
+
options.convert_as_inline = bool::try_convert(val)?;
|
|
586
|
+
}
|
|
587
|
+
|
|
588
|
+
if let Some(val) = get_kw(ruby, hash, "sub_symbol") {
|
|
589
|
+
options.sub_symbol = String::try_convert(val)?;
|
|
590
|
+
}
|
|
591
|
+
|
|
592
|
+
if let Some(val) = get_kw(ruby, hash, "sup_symbol") {
|
|
593
|
+
options.sup_symbol = String::try_convert(val)?;
|
|
594
|
+
}
|
|
595
|
+
|
|
596
|
+
if let Some(val) = get_kw(ruby, hash, "newline_style") {
|
|
597
|
+
let style = symbol_to_string(val)?;
|
|
598
|
+
options.newline_style = match style.to_lowercase().as_str() {
|
|
599
|
+
"spaces" => NewlineStyle::Spaces,
|
|
600
|
+
"backslash" => NewlineStyle::Backslash,
|
|
601
|
+
other => return Err(runtime_error(format!("Invalid html_options.newline_style '{}'", other))),
|
|
602
|
+
};
|
|
603
|
+
}
|
|
604
|
+
|
|
605
|
+
if let Some(val) = get_kw(ruby, hash, "code_block_style") {
|
|
606
|
+
let style = symbol_to_string(val)?;
|
|
607
|
+
options.code_block_style = match style.to_lowercase().as_str() {
|
|
608
|
+
"indented" => CodeBlockStyle::Indented,
|
|
609
|
+
"backticks" => CodeBlockStyle::Backticks,
|
|
610
|
+
"tildes" => CodeBlockStyle::Tildes,
|
|
611
|
+
other => {
|
|
612
|
+
return Err(runtime_error(format!(
|
|
613
|
+
"Invalid html_options.code_block_style '{}'",
|
|
614
|
+
other
|
|
615
|
+
)));
|
|
616
|
+
}
|
|
617
|
+
};
|
|
618
|
+
}
|
|
619
|
+
|
|
620
|
+
if let Some(val) = get_kw(ruby, hash, "keep_inline_images_in") {
|
|
621
|
+
let arr = RArray::try_convert(val)?;
|
|
622
|
+
options.keep_inline_images_in = arr.to_vec::<String>()?;
|
|
623
|
+
}
|
|
624
|
+
|
|
625
|
+
if let Some(val) = get_kw(ruby, hash, "encoding") {
|
|
626
|
+
options.encoding = String::try_convert(val)?;
|
|
627
|
+
}
|
|
628
|
+
|
|
629
|
+
if let Some(val) = get_kw(ruby, hash, "debug") {
|
|
630
|
+
options.debug = bool::try_convert(val)?;
|
|
631
|
+
}
|
|
632
|
+
|
|
633
|
+
if let Some(val) = get_kw(ruby, hash, "strip_tags") {
|
|
634
|
+
let arr = RArray::try_convert(val)?;
|
|
635
|
+
options.strip_tags = arr.to_vec::<String>()?;
|
|
636
|
+
}
|
|
637
|
+
|
|
638
|
+
if let Some(val) = get_kw(ruby, hash, "preserve_tags") {
|
|
639
|
+
let arr = RArray::try_convert(val)?;
|
|
640
|
+
options.preserve_tags = arr.to_vec::<String>()?;
|
|
641
|
+
}
|
|
642
|
+
|
|
643
|
+
if let Some(val) = get_kw(ruby, hash, "preprocessing")
|
|
644
|
+
&& !val.is_nil()
|
|
645
|
+
{
|
|
646
|
+
let pre_hash = RHash::try_convert(val)?;
|
|
647
|
+
let mut preprocessing = options.preprocessing.clone();
|
|
648
|
+
if let Some(v) = get_kw(ruby, pre_hash, "enabled") {
|
|
649
|
+
preprocessing.enabled = bool::try_convert(v)?;
|
|
650
|
+
}
|
|
651
|
+
if let Some(v) = get_kw(ruby, pre_hash, "preset") {
|
|
652
|
+
let preset = symbol_to_string(v)?;
|
|
653
|
+
preprocessing.preset = match preset.to_lowercase().as_str() {
|
|
654
|
+
"minimal" => PreprocessingPreset::Minimal,
|
|
655
|
+
"standard" => PreprocessingPreset::Standard,
|
|
656
|
+
"aggressive" => PreprocessingPreset::Aggressive,
|
|
657
|
+
other => {
|
|
658
|
+
return Err(runtime_error(format!(
|
|
659
|
+
"Invalid html_options.preprocessing.preset '{}'",
|
|
660
|
+
other
|
|
661
|
+
)));
|
|
662
|
+
}
|
|
663
|
+
};
|
|
664
|
+
}
|
|
665
|
+
if let Some(v) = get_kw(ruby, pre_hash, "remove_navigation") {
|
|
666
|
+
preprocessing.remove_navigation = bool::try_convert(v)?;
|
|
667
|
+
}
|
|
668
|
+
if let Some(v) = get_kw(ruby, pre_hash, "remove_forms") {
|
|
669
|
+
preprocessing.remove_forms = bool::try_convert(v)?;
|
|
670
|
+
}
|
|
671
|
+
options.preprocessing = preprocessing;
|
|
672
|
+
}
|
|
673
|
+
|
|
674
|
+
Ok(options)
|
|
675
|
+
}
|
|
676
|
+
|
|
677
|
+
/// Convert KeywordAlgorithm to string
|
|
678
|
+
#[allow(dead_code)]
|
|
679
|
+
pub fn keyword_algorithm_to_str(algo: RustKeywordAlgorithm) -> &'static str {
|
|
680
|
+
match algo {
|
|
681
|
+
RustKeywordAlgorithm::Yake => "yake",
|
|
682
|
+
RustKeywordAlgorithm::Rake => "rake",
|
|
683
|
+
}
|
|
684
|
+
}
|
|
685
|
+
|
|
686
|
+
/// Convert KeywordConfig to Ruby Hash
|
|
687
|
+
#[allow(dead_code)]
|
|
688
|
+
pub fn keyword_config_to_ruby_hash(ruby: &Ruby, config: &RustKeywordConfig) -> Result<RHash, Error> {
|
|
689
|
+
let hash = ruby.hash_new();
|
|
690
|
+
hash.aset("algorithm", keyword_algorithm_to_str(config.algorithm))?;
|
|
691
|
+
hash.aset("max_keywords", config.max_keywords as i64)?;
|
|
692
|
+
hash.aset("min_score", config.min_score)?;
|
|
693
|
+
hash.aset("language", config.language.clone().unwrap_or_default())?;
|
|
694
|
+
|
|
695
|
+
let range_array = ruby.ary_new();
|
|
696
|
+
range_array.push(config.ngram_range.0 as i64)?;
|
|
697
|
+
range_array.push(config.ngram_range.1 as i64)?;
|
|
698
|
+
hash.aset("ngram_range", range_array)?;
|
|
699
|
+
|
|
700
|
+
if let Some(yake) = &config.yake_params {
|
|
701
|
+
let yake_hash = ruby.hash_new();
|
|
702
|
+
yake_hash.aset("window_size", yake.window_size as i64)?;
|
|
703
|
+
hash.aset("yake_params", yake_hash)?;
|
|
704
|
+
}
|
|
705
|
+
|
|
706
|
+
if let Some(rake) = &config.rake_params {
|
|
707
|
+
let rake_hash = ruby.hash_new();
|
|
708
|
+
rake_hash.aset("min_word_length", rake.min_word_length as i64)?;
|
|
709
|
+
rake_hash.aset("max_words_per_phrase", rake.max_words_per_phrase as i64)?;
|
|
710
|
+
hash.aset("rake_params", rake_hash)?;
|
|
711
|
+
}
|
|
712
|
+
|
|
713
|
+
Ok(hash)
|
|
714
|
+
}
|
|
715
|
+
|
|
716
|
+
/// Convert HTML conversion options to Ruby Hash
|
|
717
|
+
#[allow(dead_code)]
|
|
718
|
+
pub fn html_options_to_ruby_hash(ruby: &Ruby, options: &ConversionOptions) -> Result<RHash, Error> {
|
|
719
|
+
let hash = ruby.hash_new();
|
|
720
|
+
hash.aset(
|
|
721
|
+
"heading_style",
|
|
722
|
+
match options.heading_style {
|
|
723
|
+
HeadingStyle::Atx => "atx",
|
|
724
|
+
HeadingStyle::Underlined => "underlined",
|
|
725
|
+
HeadingStyle::AtxClosed => "atx_closed",
|
|
726
|
+
},
|
|
727
|
+
)?;
|
|
728
|
+
hash.aset(
|
|
729
|
+
"list_indent_type",
|
|
730
|
+
match options.list_indent_type {
|
|
731
|
+
ListIndentType::Spaces => "spaces",
|
|
732
|
+
ListIndentType::Tabs => "tabs",
|
|
733
|
+
},
|
|
734
|
+
)?;
|
|
735
|
+
hash.aset("list_indent_width", options.list_indent_width as i64)?;
|
|
736
|
+
hash.aset("bullets", options.bullets.clone())?;
|
|
737
|
+
hash.aset("strong_em_symbol", options.strong_em_symbol.to_string())?;
|
|
738
|
+
hash.aset("escape_asterisks", options.escape_asterisks)?;
|
|
739
|
+
hash.aset("escape_underscores", options.escape_underscores)?;
|
|
740
|
+
hash.aset("escape_misc", options.escape_misc)?;
|
|
741
|
+
hash.aset("escape_ascii", options.escape_ascii)?;
|
|
742
|
+
hash.aset("code_language", options.code_language.clone())?;
|
|
743
|
+
hash.aset("autolinks", options.autolinks)?;
|
|
744
|
+
hash.aset("default_title", options.default_title)?;
|
|
745
|
+
hash.aset("br_in_tables", options.br_in_tables)?;
|
|
746
|
+
hash.aset("hocr_spatial_tables", options.hocr_spatial_tables)?;
|
|
747
|
+
hash.aset(
|
|
748
|
+
"highlight_style",
|
|
749
|
+
match options.highlight_style {
|
|
750
|
+
HighlightStyle::DoubleEqual => "double_equal",
|
|
751
|
+
HighlightStyle::Html => "html",
|
|
752
|
+
HighlightStyle::Bold => "bold",
|
|
753
|
+
HighlightStyle::None => "none",
|
|
754
|
+
},
|
|
755
|
+
)?;
|
|
756
|
+
hash.aset("extract_metadata", options.extract_metadata)?;
|
|
757
|
+
hash.aset(
|
|
758
|
+
"whitespace_mode",
|
|
759
|
+
match options.whitespace_mode {
|
|
760
|
+
WhitespaceMode::Normalized => "normalized",
|
|
761
|
+
WhitespaceMode::Strict => "strict",
|
|
762
|
+
},
|
|
763
|
+
)?;
|
|
764
|
+
hash.aset("strip_newlines", options.strip_newlines)?;
|
|
765
|
+
hash.aset("wrap", options.wrap)?;
|
|
766
|
+
hash.aset("wrap_width", options.wrap_width as i64)?;
|
|
767
|
+
hash.aset("convert_as_inline", options.convert_as_inline)?;
|
|
768
|
+
hash.aset("sub_symbol", options.sub_symbol.clone())?;
|
|
769
|
+
hash.aset("sup_symbol", options.sup_symbol.clone())?;
|
|
770
|
+
hash.aset(
|
|
771
|
+
"newline_style",
|
|
772
|
+
match options.newline_style {
|
|
773
|
+
NewlineStyle::Spaces => "spaces",
|
|
774
|
+
NewlineStyle::Backslash => "backslash",
|
|
775
|
+
},
|
|
776
|
+
)?;
|
|
777
|
+
hash.aset(
|
|
778
|
+
"code_block_style",
|
|
779
|
+
match options.code_block_style {
|
|
780
|
+
CodeBlockStyle::Indented => "indented",
|
|
781
|
+
CodeBlockStyle::Backticks => "backticks",
|
|
782
|
+
CodeBlockStyle::Tildes => "tildes",
|
|
783
|
+
},
|
|
784
|
+
)?;
|
|
785
|
+
|
|
786
|
+
let keep_inline = ruby.ary_new();
|
|
787
|
+
for tag in &options.keep_inline_images_in {
|
|
788
|
+
keep_inline.push(tag.as_str())?;
|
|
789
|
+
}
|
|
790
|
+
hash.aset("keep_inline_images_in", keep_inline)?;
|
|
791
|
+
|
|
792
|
+
hash.aset("encoding", options.encoding.clone())?;
|
|
793
|
+
hash.aset("debug", options.debug)?;
|
|
794
|
+
|
|
795
|
+
let strip_tags = ruby.ary_new();
|
|
796
|
+
for tag in &options.strip_tags {
|
|
797
|
+
strip_tags.push(tag.as_str())?;
|
|
798
|
+
}
|
|
799
|
+
hash.aset("strip_tags", strip_tags)?;
|
|
800
|
+
|
|
801
|
+
let preserve_tags = ruby.ary_new();
|
|
802
|
+
for tag in &options.preserve_tags {
|
|
803
|
+
preserve_tags.push(tag.as_str())?;
|
|
804
|
+
}
|
|
805
|
+
hash.aset("preserve_tags", preserve_tags)?;
|
|
806
|
+
|
|
807
|
+
let pre_hash = ruby.hash_new();
|
|
808
|
+
pre_hash.aset("enabled", options.preprocessing.enabled)?;
|
|
809
|
+
pre_hash.aset(
|
|
810
|
+
"preset",
|
|
811
|
+
match options.preprocessing.preset {
|
|
812
|
+
PreprocessingPreset::Minimal => "minimal",
|
|
813
|
+
PreprocessingPreset::Standard => "standard",
|
|
814
|
+
PreprocessingPreset::Aggressive => "aggressive",
|
|
815
|
+
},
|
|
816
|
+
)?;
|
|
817
|
+
pre_hash.aset("remove_navigation", options.preprocessing.remove_navigation)?;
|
|
818
|
+
pre_hash.aset("remove_forms", options.preprocessing.remove_forms)?;
|
|
819
|
+
hash.aset("preprocessing", pre_hash)?;
|
|
820
|
+
|
|
821
|
+
Ok(hash)
|
|
822
|
+
}
|
|
823
|
+
|
|
824
|
+
/// Parse PageConfig from Ruby Hash
|
|
825
|
+
pub fn parse_page_config(ruby: &Ruby, hash: RHash) -> Result<PageConfig, Error> {
|
|
826
|
+
let extract_pages = if let Some(val) = get_kw(ruby, hash, "extract_pages") {
|
|
827
|
+
bool::try_convert(val)?
|
|
828
|
+
} else {
|
|
829
|
+
false
|
|
830
|
+
};
|
|
831
|
+
|
|
832
|
+
let insert_page_markers = if let Some(val) = get_kw(ruby, hash, "insert_page_markers") {
|
|
833
|
+
bool::try_convert(val)?
|
|
834
|
+
} else {
|
|
835
|
+
false
|
|
836
|
+
};
|
|
837
|
+
|
|
838
|
+
let marker_format = if let Some(val) = get_kw(ruby, hash, "marker_format") {
|
|
839
|
+
String::try_convert(val)?
|
|
840
|
+
} else {
|
|
841
|
+
"\n\n<!-- PAGE {page_num} -->\n\n".to_string()
|
|
842
|
+
};
|
|
843
|
+
|
|
844
|
+
let config = PageConfig {
|
|
845
|
+
extract_pages,
|
|
846
|
+
insert_page_markers,
|
|
847
|
+
marker_format,
|
|
848
|
+
};
|
|
849
|
+
|
|
850
|
+
Ok(config)
|
|
851
|
+
}
|
|
852
|
+
|
|
853
|
+
/// Parse ExtractionConfig from Ruby Hash
|
|
854
|
+
pub fn parse_extraction_config(ruby: &Ruby, opts: Option<RHash>) -> Result<ExtractionConfig, Error> {
|
|
855
|
+
let mut config = ExtractionConfig::default();
|
|
856
|
+
|
|
857
|
+
if let Some(hash) = opts {
|
|
858
|
+
if let Some(val) = get_kw(ruby, hash, "use_cache") {
|
|
859
|
+
config.use_cache = bool::try_convert(val)?;
|
|
860
|
+
}
|
|
861
|
+
|
|
862
|
+
if let Some(val) = get_kw(ruby, hash, "enable_quality_processing") {
|
|
863
|
+
config.enable_quality_processing = bool::try_convert(val)?;
|
|
864
|
+
}
|
|
865
|
+
|
|
866
|
+
if let Some(val) = get_kw(ruby, hash, "force_ocr") {
|
|
867
|
+
config.force_ocr = bool::try_convert(val)?;
|
|
868
|
+
}
|
|
869
|
+
|
|
870
|
+
if let Some(val) = get_kw(ruby, hash, "ocr")
|
|
871
|
+
&& !val.is_nil()
|
|
872
|
+
{
|
|
873
|
+
let ocr_hash = RHash::try_convert(val)?;
|
|
874
|
+
config.ocr = Some(parse_ocr_config(ruby, ocr_hash)?);
|
|
875
|
+
}
|
|
876
|
+
|
|
877
|
+
if let Some(val) = get_kw(ruby, hash, "chunking")
|
|
878
|
+
&& !val.is_nil()
|
|
879
|
+
{
|
|
880
|
+
let chunking_hash = RHash::try_convert(val)?;
|
|
881
|
+
config.chunking = Some(parse_chunking_config(ruby, chunking_hash)?);
|
|
882
|
+
}
|
|
883
|
+
|
|
884
|
+
if let Some(val) = get_kw(ruby, hash, "language_detection")
|
|
885
|
+
&& !val.is_nil()
|
|
886
|
+
{
|
|
887
|
+
let lang_hash = RHash::try_convert(val)?;
|
|
888
|
+
config.language_detection = Some(parse_language_detection_config(ruby, lang_hash)?);
|
|
889
|
+
}
|
|
890
|
+
|
|
891
|
+
if let Some(val) = get_kw(ruby, hash, "pdf_options")
|
|
892
|
+
&& !val.is_nil()
|
|
893
|
+
{
|
|
894
|
+
let pdf_hash = RHash::try_convert(val)?;
|
|
895
|
+
config.pdf_options = Some(parse_pdf_config(ruby, pdf_hash)?);
|
|
896
|
+
}
|
|
897
|
+
|
|
898
|
+
if let Some(val) = get_kw(ruby, hash, "images")
|
|
899
|
+
&& !val.is_nil()
|
|
900
|
+
{
|
|
901
|
+
let images_hash = RHash::try_convert(val)?;
|
|
902
|
+
config.images = Some(parse_image_extraction_config(ruby, images_hash)?);
|
|
903
|
+
}
|
|
904
|
+
|
|
905
|
+
if let Some(val) = get_kw(ruby, hash, "postprocessor")
|
|
906
|
+
&& !val.is_nil()
|
|
907
|
+
{
|
|
908
|
+
let postprocessor_hash = RHash::try_convert(val)?;
|
|
909
|
+
config.postprocessor = Some(parse_postprocessor_config(ruby, postprocessor_hash)?);
|
|
910
|
+
}
|
|
911
|
+
|
|
912
|
+
if let Some(val) = get_kw(ruby, hash, "token_reduction")
|
|
913
|
+
&& !val.is_nil()
|
|
914
|
+
{
|
|
915
|
+
let token_reduction_hash = RHash::try_convert(val)?;
|
|
916
|
+
config.token_reduction = Some(parse_token_reduction_config(ruby, token_reduction_hash)?);
|
|
917
|
+
}
|
|
918
|
+
|
|
919
|
+
if let Some(val) = get_kw(ruby, hash, "keywords")
|
|
920
|
+
&& !val.is_nil()
|
|
921
|
+
{
|
|
922
|
+
let keywords_hash = RHash::try_convert(val)?;
|
|
923
|
+
config.keywords = Some(parse_keyword_config(ruby, keywords_hash)?);
|
|
924
|
+
}
|
|
925
|
+
|
|
926
|
+
if let Some(val) = get_kw(ruby, hash, "html_options")
|
|
927
|
+
&& !val.is_nil()
|
|
928
|
+
{
|
|
929
|
+
let html_hash = RHash::try_convert(val)?;
|
|
930
|
+
config.html_options = Some(parse_html_options(ruby, html_hash)?);
|
|
931
|
+
}
|
|
932
|
+
|
|
933
|
+
if let Some(val) = get_kw(ruby, hash, "pages")
|
|
934
|
+
&& !val.is_nil()
|
|
935
|
+
{
|
|
936
|
+
let pages_hash = RHash::try_convert(val)?;
|
|
937
|
+
config.pages = Some(parse_page_config(ruby, pages_hash)?);
|
|
938
|
+
}
|
|
939
|
+
|
|
940
|
+
if let Some(val) = get_kw(ruby, hash, "max_concurrent_extractions") {
|
|
941
|
+
let value = usize::try_convert(val)?;
|
|
942
|
+
config.max_concurrent_extractions = Some(value);
|
|
943
|
+
}
|
|
944
|
+
|
|
945
|
+
if let Some(val) = get_kw(ruby, hash, "result_format") {
|
|
946
|
+
let format_str = String::try_convert(val)?;
|
|
947
|
+
config.result_format = match format_str.as_str() {
|
|
948
|
+
"unified" | "Unified" => kreuzberg::types::OutputFormat::Unified,
|
|
949
|
+
"element_based" | "ElementBased" | "elements" => kreuzberg::types::OutputFormat::ElementBased,
|
|
950
|
+
_ => {
|
|
951
|
+
return Err(runtime_error(format!(
|
|
952
|
+
"Invalid result_format: '{}'. Expected 'unified' or 'element_based'",
|
|
953
|
+
format_str
|
|
954
|
+
)))
|
|
955
|
+
}
|
|
956
|
+
};
|
|
957
|
+
}
|
|
958
|
+
|
|
959
|
+
if let Some(val) = get_kw(ruby, hash, "output_format") {
|
|
960
|
+
let format_str = String::try_convert(val)?;
|
|
961
|
+
config.output_format = match format_str.as_str() {
|
|
962
|
+
"plain" | "Plain" => OutputFormat::Plain,
|
|
963
|
+
"markdown" | "Markdown" => OutputFormat::Markdown,
|
|
964
|
+
"djot" | "Djot" => OutputFormat::Djot,
|
|
965
|
+
"html" | "Html" => OutputFormat::Html,
|
|
966
|
+
_ => {
|
|
967
|
+
return Err(runtime_error(format!(
|
|
968
|
+
"Invalid output_format: '{}'. Expected 'plain', 'markdown', 'djot', or 'html'",
|
|
969
|
+
format_str
|
|
970
|
+
)))
|
|
971
|
+
}
|
|
972
|
+
};
|
|
973
|
+
}
|
|
974
|
+
}
|
|
975
|
+
|
|
976
|
+
Ok(config)
|
|
977
|
+
}
|
|
978
|
+
|
|
979
|
+
/// Load extraction config from file
|
|
980
|
+
///
|
|
981
|
+
/// Supports TOML, YAML, and JSON file formats. The format is detected from the file extension.
|
|
982
|
+
pub fn config_from_file(path: String) -> Result<RHash, Error> {
|
|
983
|
+
use std::path::Path;
|
|
984
|
+
|
|
985
|
+
let ruby = Ruby::get().expect("Ruby not initialized");
|
|
986
|
+
let file_path = Path::new(&path);
|
|
987
|
+
|
|
988
|
+
let content = fs::read_to_string(&path)
|
|
989
|
+
.map_err(|e| validation_error(format!("Failed to read config file '{}': {}", path, e)))?;
|
|
990
|
+
|
|
991
|
+
// Detect file format from extension
|
|
992
|
+
let extension = file_path
|
|
993
|
+
.extension()
|
|
994
|
+
.and_then(|ext| ext.to_str())
|
|
995
|
+
.map(|s| s.to_lowercase());
|
|
996
|
+
|
|
997
|
+
let json_value: serde_json::Value = match extension.as_deref() {
|
|
998
|
+
Some("toml") => {
|
|
999
|
+
toml::from_str(&content)
|
|
1000
|
+
.map_err(|e| validation_error(format!("Invalid TOML in config file '{}': {}", path, e)))?
|
|
1001
|
+
}
|
|
1002
|
+
Some("yaml") | Some("yml") => {
|
|
1003
|
+
serde_yaml_ng::from_str(&content)
|
|
1004
|
+
.map_err(|e| validation_error(format!("Invalid YAML in config file '{}': {}", path, e)))?
|
|
1005
|
+
}
|
|
1006
|
+
Some("json") => {
|
|
1007
|
+
serde_json::from_str(&content)
|
|
1008
|
+
.map_err(|e| validation_error(format!("Invalid JSON in config file '{}': {}", path, e)))?
|
|
1009
|
+
}
|
|
1010
|
+
Some(ext) => {
|
|
1011
|
+
return Err(validation_error(format!(
|
|
1012
|
+
"Unsupported config file format: .{}. Supported formats: .toml, .yaml, .yml, .json",
|
|
1013
|
+
ext
|
|
1014
|
+
)));
|
|
1015
|
+
}
|
|
1016
|
+
None => {
|
|
1017
|
+
return Err(validation_error(format!(
|
|
1018
|
+
"Cannot determine file format: no extension found in '{}'",
|
|
1019
|
+
path
|
|
1020
|
+
)));
|
|
1021
|
+
}
|
|
1022
|
+
};
|
|
1023
|
+
|
|
1024
|
+
json_value_to_ruby(&ruby, &json_value)
|
|
1025
|
+
.and_then(|v| magnus::RHash::try_convert(v).map_err(|_| validation_error("Config must be a Hash")))
|
|
1026
|
+
}
|
|
1027
|
+
|
|
1028
|
+
/// Discover extraction config from current directory
|
|
1029
|
+
pub fn config_discover() -> Result<Value, Error> {
|
|
1030
|
+
let ruby = Ruby::get().expect("Ruby not initialized");
|
|
1031
|
+
|
|
1032
|
+
// Search for config files in order of precedence
|
|
1033
|
+
let config_files = vec![
|
|
1034
|
+
("kreuzberg.toml", "toml"),
|
|
1035
|
+
("kreuzberg.yaml", "yaml"),
|
|
1036
|
+
("kreuzberg.yml", "yaml"),
|
|
1037
|
+
("kreuzberg.json", "json"),
|
|
1038
|
+
(".kreuzbergrc", "json"),
|
|
1039
|
+
];
|
|
1040
|
+
|
|
1041
|
+
for (name, format) in config_files {
|
|
1042
|
+
if let Ok(content) = fs::read_to_string(name) {
|
|
1043
|
+
let json_value: serde_json::Value = match format {
|
|
1044
|
+
"toml" => toml::from_str(&content)
|
|
1045
|
+
.map_err(|e| validation_error(format!("Invalid TOML in {}: {}", name, e)))?,
|
|
1046
|
+
"yaml" => serde_yaml_ng::from_str(&content)
|
|
1047
|
+
.map_err(|e| validation_error(format!("Invalid YAML in {}: {}", name, e)))?,
|
|
1048
|
+
"json" => serde_json::from_str(&content)
|
|
1049
|
+
.map_err(|e| validation_error(format!("Invalid JSON in {}: {}", name, e)))?,
|
|
1050
|
+
_ => unreachable!(),
|
|
1051
|
+
};
|
|
1052
|
+
return json_value_to_ruby(&ruby, &json_value);
|
|
1053
|
+
}
|
|
1054
|
+
}
|
|
1055
|
+
|
|
1056
|
+
// Return nil if no config found
|
|
1057
|
+
Ok(ruby.qnil().as_value())
|
|
1058
|
+
}
|