kreuzberg 4.0.0.pre.rc.29 → 4.0.0.rc1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +0 -6
- data/.rubocop.yaml +534 -1
- data/Gemfile +2 -1
- data/Gemfile.lock +28 -116
- data/README.md +269 -629
- data/Rakefile +0 -9
- data/Steepfile +4 -8
- data/examples/async_patterns.rb +58 -1
- data/ext/kreuzberg_rb/extconf.rb +5 -35
- data/ext/kreuzberg_rb/native/Cargo.toml +16 -55
- data/ext/kreuzberg_rb/native/build.rs +14 -12
- data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
- data/ext/kreuzberg_rb/native/include/strings.h +2 -2
- data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
- data/ext/kreuzberg_rb/native/src/lib.rs +34 -897
- data/extconf.rb +6 -38
- data/kreuzberg.gemspec +20 -114
- data/lib/kreuzberg/api_proxy.rb +18 -2
- data/lib/kreuzberg/cache_api.rb +0 -22
- data/lib/kreuzberg/cli.rb +10 -2
- data/lib/kreuzberg/cli_proxy.rb +10 -0
- data/lib/kreuzberg/config.rb +22 -274
- data/lib/kreuzberg/errors.rb +7 -73
- data/lib/kreuzberg/extraction_api.rb +8 -237
- data/lib/kreuzberg/mcp_proxy.rb +11 -2
- data/lib/kreuzberg/ocr_backend_protocol.rb +73 -0
- data/lib/kreuzberg/post_processor_protocol.rb +71 -0
- data/lib/kreuzberg/result.rb +33 -151
- data/lib/kreuzberg/setup_lib_path.rb +2 -22
- data/lib/kreuzberg/validator_protocol.rb +73 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +13 -27
- data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
- data/sig/kreuzberg.rbs +12 -105
- data/spec/binding/cache_spec.rb +22 -22
- data/spec/binding/cli_proxy_spec.rb +4 -2
- data/spec/binding/cli_spec.rb +11 -12
- data/spec/binding/config_spec.rb +0 -74
- data/spec/binding/config_validation_spec.rb +6 -100
- data/spec/binding/error_handling_spec.rb +97 -283
- data/spec/binding/plugins/ocr_backend_spec.rb +8 -8
- data/spec/binding/plugins/postprocessor_spec.rb +11 -11
- data/spec/binding/plugins/validator_spec.rb +13 -12
- data/spec/examples.txt +104 -0
- data/spec/fixtures/config.toml +1 -0
- data/spec/fixtures/config.yaml +1 -0
- data/spec/fixtures/invalid_config.toml +1 -0
- data/spec/smoke/package_spec.rb +3 -2
- data/spec/spec_helper.rb +3 -1
- data/vendor/kreuzberg/Cargo.toml +67 -192
- data/vendor/kreuzberg/README.md +9 -97
- data/vendor/kreuzberg/build.rs +194 -516
- data/vendor/kreuzberg/src/api/handlers.rs +9 -130
- data/vendor/kreuzberg/src/api/mod.rs +3 -18
- data/vendor/kreuzberg/src/api/server.rs +71 -236
- data/vendor/kreuzberg/src/api/types.rs +7 -43
- data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
- data/vendor/kreuzberg/src/cache/mod.rs +3 -27
- data/vendor/kreuzberg/src/chunking/mod.rs +79 -1705
- data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
- data/vendor/kreuzberg/src/core/config.rs +23 -905
- data/vendor/kreuzberg/src/core/extractor.rs +106 -403
- data/vendor/kreuzberg/src/core/io.rs +2 -4
- data/vendor/kreuzberg/src/core/mime.rs +12 -2
- data/vendor/kreuzberg/src/core/mod.rs +3 -22
- data/vendor/kreuzberg/src/core/pipeline.rs +78 -395
- data/vendor/kreuzberg/src/embeddings.rs +21 -169
- data/vendor/kreuzberg/src/error.rs +2 -2
- data/vendor/kreuzberg/src/extraction/archive.rs +31 -36
- data/vendor/kreuzberg/src/extraction/docx.rs +1 -365
- data/vendor/kreuzberg/src/extraction/email.rs +11 -12
- data/vendor/kreuzberg/src/extraction/excel.rs +129 -138
- data/vendor/kreuzberg/src/extraction/html.rs +170 -1447
- data/vendor/kreuzberg/src/extraction/image.rs +14 -138
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +3 -13
- data/vendor/kreuzberg/src/extraction/mod.rs +5 -21
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
- data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
- data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
- data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
- data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
- data/vendor/kreuzberg/src/extraction/pptx.rs +94 -196
- data/vendor/kreuzberg/src/extraction/structured.rs +4 -5
- data/vendor/kreuzberg/src/extraction/table.rs +1 -2
- data/vendor/kreuzberg/src/extraction/text.rs +10 -18
- data/vendor/kreuzberg/src/extractors/archive.rs +0 -22
- data/vendor/kreuzberg/src/extractors/docx.rs +148 -69
- data/vendor/kreuzberg/src/extractors/email.rs +9 -37
- data/vendor/kreuzberg/src/extractors/excel.rs +40 -81
- data/vendor/kreuzberg/src/extractors/html.rs +173 -182
- data/vendor/kreuzberg/src/extractors/image.rs +8 -32
- data/vendor/kreuzberg/src/extractors/mod.rs +10 -171
- data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
- data/vendor/kreuzberg/src/extractors/pdf.rs +64 -329
- data/vendor/kreuzberg/src/extractors/pptx.rs +34 -79
- data/vendor/kreuzberg/src/extractors/structured.rs +0 -16
- data/vendor/kreuzberg/src/extractors/text.rs +7 -30
- data/vendor/kreuzberg/src/extractors/xml.rs +8 -27
- data/vendor/kreuzberg/src/keywords/processor.rs +1 -9
- data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
- data/vendor/kreuzberg/src/language_detection/mod.rs +51 -94
- data/vendor/kreuzberg/src/lib.rs +5 -17
- data/vendor/kreuzberg/src/mcp/mod.rs +1 -4
- data/vendor/kreuzberg/src/mcp/server.rs +21 -145
- data/vendor/kreuzberg/src/ocr/mod.rs +0 -2
- data/vendor/kreuzberg/src/ocr/processor.rs +8 -19
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +0 -2
- data/vendor/kreuzberg/src/pdf/error.rs +1 -93
- data/vendor/kreuzberg/src/pdf/metadata.rs +100 -263
- data/vendor/kreuzberg/src/pdf/mod.rs +2 -33
- data/vendor/kreuzberg/src/pdf/rendering.rs +12 -12
- data/vendor/kreuzberg/src/pdf/table.rs +64 -61
- data/vendor/kreuzberg/src/pdf/text.rs +24 -416
- data/vendor/kreuzberg/src/plugins/extractor.rs +8 -40
- data/vendor/kreuzberg/src/plugins/mod.rs +0 -3
- data/vendor/kreuzberg/src/plugins/ocr.rs +14 -22
- data/vendor/kreuzberg/src/plugins/processor.rs +1 -10
- data/vendor/kreuzberg/src/plugins/registry.rs +0 -15
- data/vendor/kreuzberg/src/plugins/validator.rs +8 -20
- data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
- data/vendor/kreuzberg/src/text/mod.rs +0 -8
- data/vendor/kreuzberg/src/text/quality.rs +15 -28
- data/vendor/kreuzberg/src/text/string_utils.rs +10 -22
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +50 -86
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +16 -37
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +1 -2
- data/vendor/kreuzberg/src/types.rs +67 -907
- data/vendor/kreuzberg/src/utils/mod.rs +0 -14
- data/vendor/kreuzberg/src/utils/quality.rs +3 -12
- data/vendor/kreuzberg/tests/api_tests.rs +0 -506
- data/vendor/kreuzberg/tests/archive_integration.rs +0 -2
- data/vendor/kreuzberg/tests/batch_orchestration.rs +12 -57
- data/vendor/kreuzberg/tests/batch_processing.rs +8 -32
- data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +8 -40
- data/vendor/kreuzberg/tests/config_features.rs +1 -33
- data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -16
- data/vendor/kreuzberg/tests/core_integration.rs +9 -35
- data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
- data/vendor/kreuzberg/tests/email_integration.rs +1 -3
- data/vendor/kreuzberg/tests/error_handling.rs +34 -43
- data/vendor/kreuzberg/tests/format_integration.rs +1 -7
- data/vendor/kreuzberg/tests/helpers/mod.rs +0 -60
- data/vendor/kreuzberg/tests/image_integration.rs +0 -2
- data/vendor/kreuzberg/tests/mime_detection.rs +16 -17
- data/vendor/kreuzberg/tests/ocr_configuration.rs +0 -4
- data/vendor/kreuzberg/tests/ocr_errors.rs +0 -22
- data/vendor/kreuzberg/tests/ocr_quality.rs +0 -2
- data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
- data/vendor/kreuzberg/tests/pdf_integration.rs +0 -2
- data/vendor/kreuzberg/tests/pipeline_integration.rs +2 -36
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +0 -5
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -17
- data/vendor/kreuzberg/tests/plugin_system.rs +0 -6
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -2
- data/vendor/kreuzberg/tests/security_validation.rs +1 -13
- data/vendor/kreuzberg/tests/test_fastembed.rs +23 -45
- metadata +25 -171
- data/.rubocop.yml +0 -543
- data/ext/kreuzberg_rb/native/.cargo/config.toml +0 -23
- data/ext/kreuzberg_rb/native/Cargo.lock +0 -7619
- data/lib/kreuzberg/error_context.rb +0 -136
- data/lib/kreuzberg/types.rb +0 -170
- data/lib/libpdfium.so +0 -0
- data/spec/binding/async_operations_spec.rb +0 -473
- data/spec/binding/batch_operations_spec.rb +0 -595
- data/spec/binding/batch_spec.rb +0 -359
- data/spec/binding/config_result_spec.rb +0 -377
- data/spec/binding/embeddings_spec.rb +0 -816
- data/spec/binding/error_recovery_spec.rb +0 -488
- data/spec/binding/font_config_spec.rb +0 -220
- data/spec/binding/images_spec.rb +0 -738
- data/spec/binding/keywords_extraction_spec.rb +0 -600
- data/spec/binding/metadata_types_spec.rb +0 -1228
- data/spec/binding/pages_extraction_spec.rb +0 -471
- data/spec/binding/tables_spec.rb +0 -641
- data/spec/unit/config/chunking_config_spec.rb +0 -213
- data/spec/unit/config/embedding_config_spec.rb +0 -343
- data/spec/unit/config/extraction_config_spec.rb +0 -438
- data/spec/unit/config/font_config_spec.rb +0 -285
- data/spec/unit/config/hierarchy_config_spec.rb +0 -314
- data/spec/unit/config/image_extraction_config_spec.rb +0 -209
- data/spec/unit/config/image_preprocessing_config_spec.rb +0 -249
- data/spec/unit/config/keyword_config_spec.rb +0 -229
- data/spec/unit/config/language_detection_config_spec.rb +0 -258
- data/spec/unit/config/ocr_config_spec.rb +0 -171
- data/spec/unit/config/page_config_spec.rb +0 -221
- data/spec/unit/config/pdf_config_spec.rb +0 -267
- data/spec/unit/config/postprocessor_config_spec.rb +0 -290
- data/spec/unit/config/tesseract_config_spec.rb +0 -181
- data/spec/unit/config/token_reduction_config_spec.rb +0 -251
- data/test/metadata_types_test.rb +0 -959
- data/vendor/Cargo.toml +0 -61
- data/vendor/kreuzberg/examples/bench_fixes.rs +0 -71
- data/vendor/kreuzberg/examples/test_pdfium_fork.rs +0 -62
- data/vendor/kreuzberg/src/chunking/processor.rs +0 -219
- data/vendor/kreuzberg/src/core/batch_optimizations.rs +0 -385
- data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
- data/vendor/kreuzberg/src/core/formats.rs +0 -235
- data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
- data/vendor/kreuzberg/src/extraction/capacity.rs +0 -263
- data/vendor/kreuzberg/src/extraction/markdown.rs +0 -216
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -284
- data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -470
- data/vendor/kreuzberg/src/extractors/docbook.rs +0 -504
- data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -492
- data/vendor/kreuzberg/src/extractors/jats.rs +0 -1054
- data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -368
- data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
- data/vendor/kreuzberg/src/extractors/markdown.rs +0 -701
- data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
- data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
- data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -529
- data/vendor/kreuzberg/src/extractors/rst.rs +0 -577
- data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
- data/vendor/kreuzberg/src/extractors/security.rs +0 -484
- data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
- data/vendor/kreuzberg/src/extractors/typst.rs +0 -651
- data/vendor/kreuzberg/src/language_detection/processor.rs +0 -218
- data/vendor/kreuzberg/src/ocr/language_registry.rs +0 -520
- data/vendor/kreuzberg/src/panic_context.rs +0 -154
- data/vendor/kreuzberg/src/pdf/bindings.rs +0 -306
- data/vendor/kreuzberg/src/pdf/bundled.rs +0 -408
- data/vendor/kreuzberg/src/pdf/fonts.rs +0 -358
- data/vendor/kreuzberg/src/pdf/hierarchy.rs +0 -903
- data/vendor/kreuzberg/src/text/quality_processor.rs +0 -231
- data/vendor/kreuzberg/src/text/utf8_validation.rs +0 -193
- data/vendor/kreuzberg/src/utils/pool.rs +0 -503
- data/vendor/kreuzberg/src/utils/pool_sizing.rs +0 -364
- data/vendor/kreuzberg/src/utils/string_pool.rs +0 -761
- data/vendor/kreuzberg/tests/api_embed.rs +0 -360
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
- data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +0 -471
- data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +0 -289
- data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +0 -154
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
- data/vendor/kreuzberg/tests/config_integration_test.rs +0 -753
- data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +0 -294
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -500
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
- data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
- data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
- data/vendor/kreuzberg/tests/ocr_language_registry.rs +0 -191
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -674
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
- data/vendor/kreuzberg/tests/page_markers.rs +0 -297
- data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +0 -301
- data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +0 -589
- data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +0 -301
- data/vendor/kreuzberg/tests/pdf_text_merging.rs +0 -475
- data/vendor/kreuzberg/tests/pdfium_linking.rs +0 -340
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -694
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -775
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1260
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -648
- data/vendor/kreuzberg-ffi/Cargo.toml +0 -67
- data/vendor/kreuzberg-ffi/README.md +0 -851
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +0 -227
- data/vendor/kreuzberg-ffi/build.rs +0 -168
- data/vendor/kreuzberg-ffi/cbindgen.toml +0 -37
- data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +0 -12
- data/vendor/kreuzberg-ffi/kreuzberg.h +0 -3012
- data/vendor/kreuzberg-ffi/src/batch_streaming.rs +0 -588
- data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
- data/vendor/kreuzberg-ffi/src/error.rs +0 -901
- data/vendor/kreuzberg-ffi/src/extraction.rs +0 -555
- data/vendor/kreuzberg-ffi/src/helpers.rs +0 -879
- data/vendor/kreuzberg-ffi/src/lib.rs +0 -977
- data/vendor/kreuzberg-ffi/src/memory.rs +0 -493
- data/vendor/kreuzberg-ffi/src/mime.rs +0 -329
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +0 -265
- data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +0 -442
- data/vendor/kreuzberg-ffi/src/plugins/mod.rs +0 -14
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +0 -628
- data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +0 -438
- data/vendor/kreuzberg-ffi/src/plugins/validator.rs +0 -329
- data/vendor/kreuzberg-ffi/src/result.rs +0 -510
- data/vendor/kreuzberg-ffi/src/result_pool.rs +0 -639
- data/vendor/kreuzberg-ffi/src/result_view.rs +0 -773
- data/vendor/kreuzberg-ffi/src/string_intern.rs +0 -568
- data/vendor/kreuzberg-ffi/src/types.rs +0 -363
- data/vendor/kreuzberg-ffi/src/util.rs +0 -210
- data/vendor/kreuzberg-ffi/src/validation.rs +0 -848
- data/vendor/kreuzberg-ffi/tests.disabled/README.md +0 -48
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +0 -299
- data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +0 -346
- data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +0 -232
- data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +0 -470
- data/vendor/kreuzberg-tesseract/.commitlintrc.json +0 -13
- data/vendor/kreuzberg-tesseract/.crate-ignore +0 -2
- data/vendor/kreuzberg-tesseract/Cargo.lock +0 -2933
- data/vendor/kreuzberg-tesseract/Cargo.toml +0 -57
- data/vendor/kreuzberg-tesseract/LICENSE +0 -22
- data/vendor/kreuzberg-tesseract/README.md +0 -399
- data/vendor/kreuzberg-tesseract/build.rs +0 -1127
- data/vendor/kreuzberg-tesseract/patches/README.md +0 -71
- data/vendor/kreuzberg-tesseract/patches/tesseract.diff +0 -199
- data/vendor/kreuzberg-tesseract/src/api.rs +0 -1371
- data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +0 -77
- data/vendor/kreuzberg-tesseract/src/enums.rs +0 -297
- data/vendor/kreuzberg-tesseract/src/error.rs +0 -81
- data/vendor/kreuzberg-tesseract/src/lib.rs +0 -145
- data/vendor/kreuzberg-tesseract/src/monitor.rs +0 -57
- data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +0 -197
- data/vendor/kreuzberg-tesseract/src/page_iterator.rs +0 -253
- data/vendor/kreuzberg-tesseract/src/result_iterator.rs +0 -286
- data/vendor/kreuzberg-tesseract/src/result_renderer.rs +0 -183
- data/vendor/kreuzberg-tesseract/tests/integration_test.rs +0 -211
|
@@ -1,5 +1,3 @@
|
|
|
1
|
-
#![allow(unpredictable_function_pointer_comparisons)]
|
|
2
|
-
|
|
3
1
|
//! Kreuzberg Ruby Bindings (Magnus)
|
|
4
2
|
//!
|
|
5
3
|
//! High-performance document intelligence framework bindings for Ruby.
|
|
@@ -9,59 +7,23 @@ use html_to_markdown_rs::options::{
|
|
|
9
7
|
CodeBlockStyle, ConversionOptions, HeadingStyle, HighlightStyle, ListIndentType, NewlineStyle, PreprocessingPreset,
|
|
10
8
|
WhitespaceMode,
|
|
11
9
|
};
|
|
12
|
-
use kreuzberg::core::config::PageConfig;
|
|
13
10
|
use kreuzberg::keywords::{
|
|
14
11
|
KeywordAlgorithm as RustKeywordAlgorithm, KeywordConfig as RustKeywordConfig, RakeParams as RustRakeParams,
|
|
15
12
|
YakeParams as RustYakeParams,
|
|
16
13
|
};
|
|
17
14
|
use kreuzberg::types::TesseractConfig as RustTesseractConfig;
|
|
18
|
-
use kreuzberg::pdf::HierarchyConfig;
|
|
19
15
|
use kreuzberg::{
|
|
20
|
-
ChunkingConfig, EmbeddingConfig, ExtractionConfig, ExtractionResult as RustExtractionResult,
|
|
21
|
-
|
|
22
|
-
|
|
16
|
+
ChunkingConfig, EmbeddingConfig, ExtractionConfig, ExtractionResult as RustExtractionResult, ImageExtractionConfig,
|
|
17
|
+
ImagePreprocessingConfig, KreuzbergError, LanguageDetectionConfig, OcrConfig, PdfConfig, PostProcessorConfig,
|
|
18
|
+
TokenReductionConfig,
|
|
23
19
|
};
|
|
24
20
|
use magnus::exception::ExceptionClass;
|
|
25
21
|
use magnus::r_hash::ForEach;
|
|
26
22
|
use magnus::value::ReprValue;
|
|
27
|
-
use magnus::{
|
|
28
|
-
Error, IntoValue, RArray, RHash, RString, Ruby, Symbol, TryConvert, Value, function, scan_args::scan_args,
|
|
29
|
-
};
|
|
23
|
+
use magnus::{Error, IntoValue, RArray, RHash, Ruby, Symbol, TryConvert, Value, function, scan_args::scan_args};
|
|
30
24
|
use std::fs;
|
|
31
25
|
use std::path::{Path, PathBuf};
|
|
32
26
|
|
|
33
|
-
// Re-export FFI types and functions from kreuzberg_ffi crate.
|
|
34
|
-
// This ensures proper linking by importing Rust symbols directly
|
|
35
|
-
// instead of declaring them as external C symbols.
|
|
36
|
-
pub use kreuzberg_ffi::{
|
|
37
|
-
// Types
|
|
38
|
-
CErrorDetails, CMetadataField,
|
|
39
|
-
// Panic/error handling (from panic_shield module)
|
|
40
|
-
get_last_error_code, get_last_error_message, get_last_panic_context,
|
|
41
|
-
// Error functions (from error module)
|
|
42
|
-
kreuzberg_get_error_details, kreuzberg_classify_error,
|
|
43
|
-
kreuzberg_error_code_name, kreuzberg_error_code_description,
|
|
44
|
-
// Result functions (from result module)
|
|
45
|
-
kreuzberg_result_get_page_count, kreuzberg_result_get_chunk_count,
|
|
46
|
-
kreuzberg_result_get_detected_language, kreuzberg_result_get_metadata_field,
|
|
47
|
-
// Memory and util functions (from lib.rs)
|
|
48
|
-
kreuzberg_free_string, kreuzberg_last_error, kreuzberg_last_error_code,
|
|
49
|
-
kreuzberg_last_panic_context,
|
|
50
|
-
// Validation functions (from lib.rs)
|
|
51
|
-
kreuzberg_validate_binarization_method, kreuzberg_validate_ocr_backend,
|
|
52
|
-
kreuzberg_validate_language_code, kreuzberg_validate_token_reduction_level,
|
|
53
|
-
kreuzberg_validate_tesseract_psm, kreuzberg_validate_tesseract_oem,
|
|
54
|
-
kreuzberg_validate_output_format, kreuzberg_validate_confidence,
|
|
55
|
-
kreuzberg_validate_dpi, kreuzberg_validate_chunking_params,
|
|
56
|
-
kreuzberg_get_valid_binarization_methods, kreuzberg_get_valid_language_codes,
|
|
57
|
-
kreuzberg_get_valid_ocr_backends, kreuzberg_get_valid_token_reduction_levels,
|
|
58
|
-
// Config functions (from config module, now re-exported through lib.rs)
|
|
59
|
-
kreuzberg_config_from_json, kreuzberg_config_free, kreuzberg_config_is_valid,
|
|
60
|
-
kreuzberg_config_to_json, kreuzberg_config_get_field, kreuzberg_config_merge,
|
|
61
|
-
};
|
|
62
|
-
|
|
63
|
-
use std::ffi::c_char;
|
|
64
|
-
|
|
65
27
|
/// Keeps Ruby values alive across plugin registrations by informing the GC.
|
|
66
28
|
struct GcGuardedValue {
|
|
67
29
|
value: Value,
|
|
@@ -87,27 +49,6 @@ impl Drop for GcGuardedValue {
|
|
|
87
49
|
}
|
|
88
50
|
}
|
|
89
51
|
|
|
90
|
-
/// Retrieve panic context from FFI if available
|
|
91
|
-
fn get_panic_context() -> Option<String> {
|
|
92
|
-
unsafe {
|
|
93
|
-
let ctx_ptr = kreuzberg_last_panic_context();
|
|
94
|
-
if ctx_ptr.is_null() {
|
|
95
|
-
return None;
|
|
96
|
-
}
|
|
97
|
-
|
|
98
|
-
let c_str = std::ffi::CStr::from_ptr(ctx_ptr);
|
|
99
|
-
let context = c_str.to_string_lossy().to_string();
|
|
100
|
-
kreuzberg_free_string(ctx_ptr as *mut std::ffi::c_char);
|
|
101
|
-
|
|
102
|
-
if context.is_empty() { None } else { Some(context) }
|
|
103
|
-
}
|
|
104
|
-
}
|
|
105
|
-
|
|
106
|
-
/// Retrieve error code from FFI
|
|
107
|
-
fn get_error_code() -> i32 {
|
|
108
|
-
unsafe { kreuzberg_last_error_code() }
|
|
109
|
-
}
|
|
110
|
-
|
|
111
52
|
/// Convert Kreuzberg errors to Ruby exceptions
|
|
112
53
|
fn kreuzberg_error(err: KreuzbergError) -> Error {
|
|
113
54
|
let ruby = Ruby::get().expect("Ruby not initialized");
|
|
@@ -314,10 +255,10 @@ fn ruby_value_to_json(value: Value) -> Result<serde_json::Value, Error> {
|
|
|
314
255
|
return Ok(serde_json::Value::Number(serde_json::Number::from(unsigned)));
|
|
315
256
|
}
|
|
316
257
|
|
|
317
|
-
if let Ok(float) = f64::try_convert(value)
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
258
|
+
if let Ok(float) = f64::try_convert(value) {
|
|
259
|
+
if let Some(num) = serde_json::Number::from_f64(float) {
|
|
260
|
+
return Ok(serde_json::Value::Number(num));
|
|
261
|
+
}
|
|
321
262
|
}
|
|
322
263
|
|
|
323
264
|
if let Ok(sym) = Symbol::try_convert(value) {
|
|
@@ -455,46 +396,6 @@ fn parse_language_detection_config(ruby: &Ruby, hash: RHash) -> Result<LanguageD
|
|
|
455
396
|
Ok(config)
|
|
456
397
|
}
|
|
457
398
|
|
|
458
|
-
/// Parse HierarchyConfig from Ruby Hash
|
|
459
|
-
fn parse_hierarchy_config(ruby: &Ruby, hash: RHash) -> Result<HierarchyConfig, Error> {
|
|
460
|
-
let enabled = if let Some(val) = get_kw(ruby, hash, "enabled") {
|
|
461
|
-
bool::try_convert(val)?
|
|
462
|
-
} else {
|
|
463
|
-
true
|
|
464
|
-
};
|
|
465
|
-
|
|
466
|
-
let k_clusters = if let Some(val) = get_kw(ruby, hash, "k_clusters") {
|
|
467
|
-
usize::try_convert(val)?
|
|
468
|
-
} else {
|
|
469
|
-
6
|
|
470
|
-
};
|
|
471
|
-
|
|
472
|
-
let include_bbox = if let Some(val) = get_kw(ruby, hash, "include_bbox") {
|
|
473
|
-
bool::try_convert(val)?
|
|
474
|
-
} else {
|
|
475
|
-
true
|
|
476
|
-
};
|
|
477
|
-
|
|
478
|
-
let ocr_coverage_threshold = if let Some(val) = get_kw(ruby, hash, "ocr_coverage_threshold") {
|
|
479
|
-
if !val.is_nil() {
|
|
480
|
-
Some(f64::try_convert(val)? as f32)
|
|
481
|
-
} else {
|
|
482
|
-
None
|
|
483
|
-
}
|
|
484
|
-
} else {
|
|
485
|
-
None
|
|
486
|
-
};
|
|
487
|
-
|
|
488
|
-
let config = HierarchyConfig {
|
|
489
|
-
enabled,
|
|
490
|
-
k_clusters,
|
|
491
|
-
include_bbox,
|
|
492
|
-
ocr_coverage_threshold,
|
|
493
|
-
};
|
|
494
|
-
|
|
495
|
-
Ok(config)
|
|
496
|
-
}
|
|
497
|
-
|
|
498
399
|
/// Parse PdfConfig from Ruby Hash
|
|
499
400
|
fn parse_pdf_config(ruby: &Ruby, hash: RHash) -> Result<PdfConfig, Error> {
|
|
500
401
|
let extract_images = if let Some(val) = get_kw(ruby, hash, "extract_images") {
|
|
@@ -520,22 +421,10 @@ fn parse_pdf_config(ruby: &Ruby, hash: RHash) -> Result<PdfConfig, Error> {
|
|
|
520
421
|
true
|
|
521
422
|
};
|
|
522
423
|
|
|
523
|
-
let hierarchy = if let Some(val) = get_kw(ruby, hash, "hierarchy") {
|
|
524
|
-
if !val.is_nil() {
|
|
525
|
-
let h_hash = RHash::try_convert(val)?;
|
|
526
|
-
Some(parse_hierarchy_config(ruby, h_hash)?)
|
|
527
|
-
} else {
|
|
528
|
-
None
|
|
529
|
-
}
|
|
530
|
-
} else {
|
|
531
|
-
None
|
|
532
|
-
};
|
|
533
|
-
|
|
534
424
|
let config = PdfConfig {
|
|
535
425
|
extract_images,
|
|
536
426
|
passwords,
|
|
537
427
|
extract_metadata,
|
|
538
|
-
hierarchy,
|
|
539
428
|
};
|
|
540
429
|
|
|
541
430
|
Ok(config)
|
|
@@ -682,8 +571,6 @@ fn parse_postprocessor_config(ruby: &Ruby, hash: RHash) -> Result<PostProcessorC
|
|
|
682
571
|
enabled,
|
|
683
572
|
enabled_processors,
|
|
684
573
|
disabled_processors,
|
|
685
|
-
enabled_set: None,
|
|
686
|
-
disabled_set: None,
|
|
687
574
|
};
|
|
688
575
|
|
|
689
576
|
Ok(config)
|
|
@@ -746,10 +633,10 @@ fn parse_keyword_config(ruby: &Ruby, hash: RHash) -> Result<RustKeywordConfig, E
|
|
|
746
633
|
}
|
|
747
634
|
}
|
|
748
635
|
|
|
749
|
-
if let Some(val) = get_kw(ruby, hash, "language")
|
|
750
|
-
|
|
751
|
-
|
|
752
|
-
|
|
636
|
+
if let Some(val) = get_kw(ruby, hash, "language") {
|
|
637
|
+
if !val.is_nil() {
|
|
638
|
+
config.language = Some(symbol_to_string(val)?);
|
|
639
|
+
}
|
|
753
640
|
}
|
|
754
641
|
|
|
755
642
|
if let Some(val) = get_kw(ruby, hash, "yake_params")
|
|
@@ -1136,36 +1023,6 @@ fn html_options_to_ruby_hash(ruby: &Ruby, options: &ConversionOptions) -> Result
|
|
|
1136
1023
|
|
|
1137
1024
|
Ok(hash)
|
|
1138
1025
|
}
|
|
1139
|
-
|
|
1140
|
-
/// Parse PageConfig from Ruby Hash
|
|
1141
|
-
fn parse_page_config(ruby: &Ruby, hash: RHash) -> Result<PageConfig, Error> {
|
|
1142
|
-
let extract_pages = if let Some(val) = get_kw(ruby, hash, "extract_pages") {
|
|
1143
|
-
bool::try_convert(val)?
|
|
1144
|
-
} else {
|
|
1145
|
-
false
|
|
1146
|
-
};
|
|
1147
|
-
|
|
1148
|
-
let insert_page_markers = if let Some(val) = get_kw(ruby, hash, "insert_page_markers") {
|
|
1149
|
-
bool::try_convert(val)?
|
|
1150
|
-
} else {
|
|
1151
|
-
false
|
|
1152
|
-
};
|
|
1153
|
-
|
|
1154
|
-
let marker_format = if let Some(val) = get_kw(ruby, hash, "marker_format") {
|
|
1155
|
-
String::try_convert(val)?
|
|
1156
|
-
} else {
|
|
1157
|
-
"\n\n<!-- PAGE {page_num} -->\n\n".to_string()
|
|
1158
|
-
};
|
|
1159
|
-
|
|
1160
|
-
let config = PageConfig {
|
|
1161
|
-
extract_pages,
|
|
1162
|
-
insert_page_markers,
|
|
1163
|
-
marker_format,
|
|
1164
|
-
};
|
|
1165
|
-
|
|
1166
|
-
Ok(config)
|
|
1167
|
-
}
|
|
1168
|
-
|
|
1169
1026
|
/// Parse ExtractionConfig from Ruby Hash
|
|
1170
1027
|
fn parse_extraction_config(ruby: &Ruby, opts: Option<RHash>) -> Result<ExtractionConfig, Error> {
|
|
1171
1028
|
let mut config = ExtractionConfig::default();
|
|
@@ -1246,13 +1103,6 @@ fn parse_extraction_config(ruby: &Ruby, opts: Option<RHash>) -> Result<Extractio
|
|
|
1246
1103
|
config.html_options = Some(parse_html_options(ruby, html_hash)?);
|
|
1247
1104
|
}
|
|
1248
1105
|
|
|
1249
|
-
if let Some(val) = get_kw(ruby, hash, "pages")
|
|
1250
|
-
&& !val.is_nil()
|
|
1251
|
-
{
|
|
1252
|
-
let pages_hash = RHash::try_convert(val)?;
|
|
1253
|
-
config.pages = Some(parse_page_config(ruby, pages_hash)?);
|
|
1254
|
-
}
|
|
1255
|
-
|
|
1256
1106
|
if let Some(val) = get_kw(ruby, hash, "max_concurrent_extractions") {
|
|
1257
1107
|
let value = usize::try_convert(val)?;
|
|
1258
1108
|
config.max_concurrent_extractions = Some(value);
|
|
@@ -1655,8 +1505,8 @@ fn extraction_result_to_ruby(ruby: &Ruby, result: RustExtractionResult) -> Resul
|
|
|
1655
1505
|
for chunk in chunks {
|
|
1656
1506
|
let chunk_hash = ruby.hash_new();
|
|
1657
1507
|
chunk_hash.aset("content", chunk.content)?;
|
|
1658
|
-
chunk_hash.aset("
|
|
1659
|
-
chunk_hash.aset("
|
|
1508
|
+
chunk_hash.aset("char_start", chunk.metadata.char_start)?;
|
|
1509
|
+
chunk_hash.aset("char_end", chunk.metadata.char_end)?;
|
|
1660
1510
|
if let Some(token_count) = chunk.metadata.token_count {
|
|
1661
1511
|
chunk_hash.aset("token_count", token_count)?;
|
|
1662
1512
|
} else {
|
|
@@ -1664,16 +1514,6 @@ fn extraction_result_to_ruby(ruby: &Ruby, result: RustExtractionResult) -> Resul
|
|
|
1664
1514
|
}
|
|
1665
1515
|
chunk_hash.aset("chunk_index", chunk.metadata.chunk_index)?;
|
|
1666
1516
|
chunk_hash.aset("total_chunks", chunk.metadata.total_chunks)?;
|
|
1667
|
-
if let Some(first_page) = chunk.metadata.first_page {
|
|
1668
|
-
chunk_hash.aset("first_page", first_page as i64)?;
|
|
1669
|
-
} else {
|
|
1670
|
-
chunk_hash.aset("first_page", ruby.qnil().as_value())?;
|
|
1671
|
-
}
|
|
1672
|
-
if let Some(last_page) = chunk.metadata.last_page {
|
|
1673
|
-
chunk_hash.aset("last_page", last_page as i64)?;
|
|
1674
|
-
} else {
|
|
1675
|
-
chunk_hash.aset("last_page", ruby.qnil().as_value())?;
|
|
1676
|
-
}
|
|
1677
1517
|
if let Some(embedding) = chunk.embedding {
|
|
1678
1518
|
let embedding_array = ruby.ary_new();
|
|
1679
1519
|
for value in embedding {
|
|
@@ -1750,92 +1590,6 @@ fn extraction_result_to_ruby(ruby: &Ruby, result: RustExtractionResult) -> Resul
|
|
|
1750
1590
|
set_hash_entry(ruby, &hash, "images", ruby.qnil().as_value())?;
|
|
1751
1591
|
}
|
|
1752
1592
|
|
|
1753
|
-
if let Some(page_content_list) = result.pages {
|
|
1754
|
-
let pages_array = ruby.ary_new();
|
|
1755
|
-
for page_content in page_content_list {
|
|
1756
|
-
let page_hash = ruby.hash_new();
|
|
1757
|
-
page_hash.aset("page_number", page_content.page_number as i64)?;
|
|
1758
|
-
page_hash.aset("content", page_content.content)?;
|
|
1759
|
-
|
|
1760
|
-
let tables_array = ruby.ary_new();
|
|
1761
|
-
for table in page_content.tables {
|
|
1762
|
-
let table_hash = ruby.hash_new();
|
|
1763
|
-
|
|
1764
|
-
let cells_array = ruby.ary_new();
|
|
1765
|
-
for row in table.cells.clone() {
|
|
1766
|
-
let row_array = ruby.ary_from_vec(row);
|
|
1767
|
-
cells_array.push(row_array)?;
|
|
1768
|
-
}
|
|
1769
|
-
table_hash.aset("cells", cells_array)?;
|
|
1770
|
-
table_hash.aset("markdown", table.markdown.clone())?;
|
|
1771
|
-
table_hash.aset("page_number", table.page_number as i64)?;
|
|
1772
|
-
|
|
1773
|
-
tables_array.push(table_hash)?;
|
|
1774
|
-
}
|
|
1775
|
-
page_hash.aset("tables", tables_array)?;
|
|
1776
|
-
|
|
1777
|
-
let images_array = ruby.ary_new();
|
|
1778
|
-
for image in page_content.images {
|
|
1779
|
-
let image_hash = ruby.hash_new();
|
|
1780
|
-
let data_value = ruby.str_from_slice(&image.data).into_value_with(ruby);
|
|
1781
|
-
image_hash.aset("data", data_value)?;
|
|
1782
|
-
image_hash.aset("format", image.format.clone())?;
|
|
1783
|
-
image_hash.aset("image_index", image.image_index as i64)?;
|
|
1784
|
-
if let Some(page) = image.page_number {
|
|
1785
|
-
image_hash.aset("page_number", page as i64)?;
|
|
1786
|
-
} else {
|
|
1787
|
-
image_hash.aset("page_number", ruby.qnil().as_value())?;
|
|
1788
|
-
}
|
|
1789
|
-
if let Some(width) = image.width {
|
|
1790
|
-
image_hash.aset("width", width as i64)?;
|
|
1791
|
-
} else {
|
|
1792
|
-
image_hash.aset("width", ruby.qnil().as_value())?;
|
|
1793
|
-
}
|
|
1794
|
-
if let Some(height) = image.height {
|
|
1795
|
-
image_hash.aset("height", height as i64)?;
|
|
1796
|
-
} else {
|
|
1797
|
-
image_hash.aset("height", ruby.qnil().as_value())?;
|
|
1798
|
-
}
|
|
1799
|
-
if let Some(colorspace) = &image.colorspace {
|
|
1800
|
-
image_hash.aset("colorspace", colorspace.clone())?;
|
|
1801
|
-
} else {
|
|
1802
|
-
image_hash.aset("colorspace", ruby.qnil().as_value())?;
|
|
1803
|
-
}
|
|
1804
|
-
if let Some(bits) = image.bits_per_component {
|
|
1805
|
-
image_hash.aset("bits_per_component", bits as i64)?;
|
|
1806
|
-
} else {
|
|
1807
|
-
image_hash.aset("bits_per_component", ruby.qnil().as_value())?;
|
|
1808
|
-
}
|
|
1809
|
-
image_hash.aset(
|
|
1810
|
-
"is_mask",
|
|
1811
|
-
if image.is_mask {
|
|
1812
|
-
ruby.qtrue().as_value()
|
|
1813
|
-
} else {
|
|
1814
|
-
ruby.qfalse().as_value()
|
|
1815
|
-
},
|
|
1816
|
-
)?;
|
|
1817
|
-
if let Some(description) = &image.description {
|
|
1818
|
-
image_hash.aset("description", description.clone())?;
|
|
1819
|
-
} else {
|
|
1820
|
-
image_hash.aset("description", ruby.qnil().as_value())?;
|
|
1821
|
-
}
|
|
1822
|
-
if let Some(ocr_result) = &image.ocr_result {
|
|
1823
|
-
let nested = extraction_result_to_ruby(ruby, (**ocr_result).clone())?;
|
|
1824
|
-
image_hash.aset("ocr_result", nested.into_value_with(ruby))?;
|
|
1825
|
-
} else {
|
|
1826
|
-
image_hash.aset("ocr_result", ruby.qnil().as_value())?;
|
|
1827
|
-
}
|
|
1828
|
-
images_array.push(image_hash)?;
|
|
1829
|
-
}
|
|
1830
|
-
page_hash.aset("images", images_array)?;
|
|
1831
|
-
|
|
1832
|
-
pages_array.push(page_hash)?;
|
|
1833
|
-
}
|
|
1834
|
-
set_hash_entry(ruby, &hash, "pages", pages_array.into_value_with(ruby))?;
|
|
1835
|
-
} else {
|
|
1836
|
-
set_hash_entry(ruby, &hash, "pages", ruby.qnil().as_value())?;
|
|
1837
|
-
}
|
|
1838
|
-
|
|
1839
1593
|
Ok(hash)
|
|
1840
1594
|
}
|
|
1841
1595
|
|
|
@@ -1880,14 +1634,13 @@ fn extract_file_sync(args: &[Value]) -> Result<RHash, Error> {
|
|
|
1880
1634
|
///
|
|
1881
1635
|
fn extract_bytes_sync(args: &[Value]) -> Result<RHash, Error> {
|
|
1882
1636
|
let ruby = Ruby::get().expect("Ruby not initialized");
|
|
1883
|
-
let args = scan_args::<(
|
|
1637
|
+
let args = scan_args::<(String, String), (), (), (), RHash, ()>(args)?;
|
|
1884
1638
|
let (data, mime_type) = args.required;
|
|
1885
1639
|
let opts = Some(args.keywords);
|
|
1886
1640
|
|
|
1887
1641
|
let config = parse_extraction_config(&ruby, opts)?;
|
|
1888
1642
|
|
|
1889
|
-
let
|
|
1890
|
-
let result = kreuzberg::extract_bytes_sync(bytes, &mime_type, &config).map_err(kreuzberg_error)?;
|
|
1643
|
+
let result = kreuzberg::extract_bytes_sync(data.as_bytes(), &mime_type, &config).map_err(kreuzberg_error)?;
|
|
1891
1644
|
|
|
1892
1645
|
extraction_result_to_ruby(&ruby, result)
|
|
1893
1646
|
}
|
|
@@ -1961,7 +1714,7 @@ fn extract_file(args: &[Value]) -> Result<RHash, Error> {
|
|
|
1961
1714
|
///
|
|
1962
1715
|
fn extract_bytes(args: &[Value]) -> Result<RHash, Error> {
|
|
1963
1716
|
let ruby = Ruby::get().expect("Ruby not initialized");
|
|
1964
|
-
let args = scan_args::<(
|
|
1717
|
+
let args = scan_args::<(String, String), (), (), (), RHash, ()>(args)?;
|
|
1965
1718
|
let (data, mime_type) = args.required;
|
|
1966
1719
|
let opts = Some(args.keywords);
|
|
1967
1720
|
|
|
@@ -1970,9 +1723,8 @@ fn extract_bytes(args: &[Value]) -> Result<RHash, Error> {
|
|
|
1970
1723
|
let runtime =
|
|
1971
1724
|
tokio::runtime::Runtime::new().map_err(|e| runtime_error(format!("Failed to create Tokio runtime: {}", e)))?;
|
|
1972
1725
|
|
|
1973
|
-
let bytes = unsafe { data.as_slice() };
|
|
1974
1726
|
let result = runtime
|
|
1975
|
-
.block_on(async { kreuzberg::extract_bytes(
|
|
1727
|
+
.block_on(async { kreuzberg::extract_bytes(data.as_bytes(), &mime_type, &config).await })
|
|
1976
1728
|
.map_err(kreuzberg_error)?;
|
|
1977
1729
|
|
|
1978
1730
|
extraction_result_to_ruby(&ruby, result)
|
|
@@ -2029,10 +1781,7 @@ fn batch_extract_bytes_sync(args: &[Value]) -> Result<RArray, Error> {
|
|
|
2029
1781
|
|
|
2030
1782
|
let config = parse_extraction_config(&ruby, opts)?;
|
|
2031
1783
|
|
|
2032
|
-
let bytes_vec: Vec<
|
|
2033
|
-
.into_iter()
|
|
2034
|
-
.map(RString::try_convert)
|
|
2035
|
-
.collect::<Result<_, _>>()?;
|
|
1784
|
+
let bytes_vec: Vec<String> = bytes_array.to_vec::<String>()?;
|
|
2036
1785
|
let mime_types: Vec<String> = mime_types_array.to_vec::<String>()?;
|
|
2037
1786
|
|
|
2038
1787
|
if bytes_vec.len() != mime_types.len() {
|
|
@@ -2043,10 +1792,10 @@ fn batch_extract_bytes_sync(args: &[Value]) -> Result<RArray, Error> {
|
|
|
2043
1792
|
)));
|
|
2044
1793
|
}
|
|
2045
1794
|
|
|
2046
|
-
let contents: Vec<(
|
|
1795
|
+
let contents: Vec<(&[u8], &str)> = bytes_vec
|
|
2047
1796
|
.iter()
|
|
2048
1797
|
.zip(mime_types.iter())
|
|
2049
|
-
.map(|(bytes, mime)| (
|
|
1798
|
+
.map(|(bytes, mime)| (bytes.as_bytes(), mime.as_str()))
|
|
2050
1799
|
.collect();
|
|
2051
1800
|
|
|
2052
1801
|
let results = kreuzberg::batch_extract_bytes_sync(contents, &config).map_err(kreuzberg_error)?;
|
|
@@ -2074,10 +1823,7 @@ fn batch_extract_bytes(args: &[Value]) -> Result<RArray, Error> {
|
|
|
2074
1823
|
|
|
2075
1824
|
let config = parse_extraction_config(&ruby, opts)?;
|
|
2076
1825
|
|
|
2077
|
-
let bytes_vec: Vec<
|
|
2078
|
-
.into_iter()
|
|
2079
|
-
.map(RString::try_convert)
|
|
2080
|
-
.collect::<Result<_, _>>()?;
|
|
1826
|
+
let bytes_vec: Vec<String> = bytes_array.to_vec::<String>()?;
|
|
2081
1827
|
let mime_types: Vec<String> = mime_types_array.to_vec::<String>()?;
|
|
2082
1828
|
|
|
2083
1829
|
if bytes_vec.len() != mime_types.len() {
|
|
@@ -2088,10 +1834,10 @@ fn batch_extract_bytes(args: &[Value]) -> Result<RArray, Error> {
|
|
|
2088
1834
|
)));
|
|
2089
1835
|
}
|
|
2090
1836
|
|
|
2091
|
-
let contents: Vec<(
|
|
1837
|
+
let contents: Vec<(&[u8], &str)> = bytes_vec
|
|
2092
1838
|
.iter()
|
|
2093
1839
|
.zip(mime_types.iter())
|
|
2094
|
-
.map(|(bytes, mime)| (
|
|
1840
|
+
.map(|(bytes, mime)| (bytes.as_bytes(), mime.as_str()))
|
|
2095
1841
|
.collect();
|
|
2096
1842
|
|
|
2097
1843
|
let runtime =
|
|
@@ -2251,6 +1997,9 @@ fn register_post_processor(args: &[Value]) -> Result<(), Error> {
|
|
|
2251
1997
|
let processor = self.processor.value();
|
|
2252
1998
|
let result_clone = result.clone();
|
|
2253
1999
|
|
|
2000
|
+
// Use block_in_place to avoid GVL deadlocks (same pattern as Python PostProcessor)
|
|
2001
|
+
// See crates/kreuzberg-py/README.md:151-158 for explanation
|
|
2002
|
+
// CRITICAL: spawn_blocking causes GVL deadlocks, must use block_in_place
|
|
2254
2003
|
let updated_result = tokio::task::block_in_place(|| {
|
|
2255
2004
|
let ruby = Ruby::get().expect("Ruby not initialized");
|
|
2256
2005
|
let result_hash = extraction_result_to_ruby(&ruby, result_clone.clone()).map_err(|e| {
|
|
@@ -2457,6 +2206,9 @@ fn register_validator(args: &[Value]) -> Result<(), Error> {
|
|
|
2457
2206
|
let validator = self.validator.value();
|
|
2458
2207
|
let result_clone = result.clone();
|
|
2459
2208
|
|
|
2209
|
+
// Use block_in_place to avoid GVL deadlocks (same pattern as Python Validator)
|
|
2210
|
+
// See crates/kreuzberg-py/README.md:151-158 for explanation
|
|
2211
|
+
// CRITICAL: spawn_blocking causes GVL deadlocks, must use block_in_place
|
|
2460
2212
|
tokio::task::block_in_place(|| {
|
|
2461
2213
|
let ruby = Ruby::get().expect("Ruby not initialized");
|
|
2462
2214
|
let result_hash =
|
|
@@ -2593,7 +2345,6 @@ fn register_ocr_backend(name: String, backend: Value) -> Result<(), Error> {
|
|
|
2593
2345
|
detected_languages: None,
|
|
2594
2346
|
chunks: None,
|
|
2595
2347
|
images: None,
|
|
2596
|
-
pages: None,
|
|
2597
2348
|
})
|
|
2598
2349
|
}
|
|
2599
2350
|
|
|
@@ -2864,7 +2615,6 @@ fn get_extensions_for_mime_native(mime_type: String) -> Result<Vec<String>, Erro
|
|
|
2864
2615
|
kreuzberg::get_extensions_for_mime(&mime_type).map_err(kreuzberg_error)
|
|
2865
2616
|
}
|
|
2866
2617
|
|
|
2867
|
-
#[cfg(feature = "embeddings")]
|
|
2868
2618
|
/// List all available embedding preset names.
|
|
2869
2619
|
///
|
|
2870
2620
|
/// Returns an array of preset names that can be used with get_embedding_preset.
|
|
@@ -2890,7 +2640,6 @@ fn list_embedding_presets(ruby: &Ruby) -> Result<RArray, Error> {
|
|
|
2890
2640
|
Ok(array)
|
|
2891
2641
|
}
|
|
2892
2642
|
|
|
2893
|
-
#[cfg(feature = "embeddings")]
|
|
2894
2643
|
/// Get a specific embedding preset by name.
|
|
2895
2644
|
///
|
|
2896
2645
|
/// Returns a preset configuration hash, or nil if the preset name is not found.
|
|
@@ -2931,6 +2680,8 @@ fn get_embedding_preset(ruby: &Ruby, name: String) -> Result<Value, Error> {
|
|
|
2931
2680
|
set_hash_entry(ruby, &hash, "chunk_size", preset.chunk_size.into_value_with(ruby))?;
|
|
2932
2681
|
set_hash_entry(ruby, &hash, "overlap", preset.overlap.into_value_with(ruby))?;
|
|
2933
2682
|
|
|
2683
|
+
// Note: When embeddings feature is enabled in kreuzberg, the model field is EmbeddingModel
|
|
2684
|
+
// Since Ruby bindings typically build with all features, we use the model field and format it.
|
|
2934
2685
|
let model_name = format!("{:?}", preset.model);
|
|
2935
2686
|
|
|
2936
2687
|
set_hash_entry(ruby, &hash, "model_name", ruby.str_new(&model_name).as_value())?;
|
|
@@ -2943,562 +2694,6 @@ fn get_embedding_preset(ruby: &Ruby, name: String) -> Result<Value, Error> {
|
|
|
2943
2694
|
}
|
|
2944
2695
|
}
|
|
2945
2696
|
|
|
2946
|
-
/// Get the last error code from FFI
|
|
2947
|
-
///
|
|
2948
|
-
/// Returns an i32 error code indicating the type of error that occurred:
|
|
2949
|
-
/// - 0: Success (no error)
|
|
2950
|
-
/// - 1: GenericError
|
|
2951
|
-
/// - 2: Panic
|
|
2952
|
-
/// - 3: InvalidArgument
|
|
2953
|
-
/// - 4: IoError
|
|
2954
|
-
/// - 5: ParsingError
|
|
2955
|
-
/// - 6: OcrError
|
|
2956
|
-
/// - 7: MissingDependency
|
|
2957
|
-
///
|
|
2958
|
-
/// @return [Integer] The error code
|
|
2959
|
-
fn last_error_code() -> i32 {
|
|
2960
|
-
get_error_code()
|
|
2961
|
-
}
|
|
2962
|
-
|
|
2963
|
-
/// Get the last panic context from FFI as a JSON string
|
|
2964
|
-
///
|
|
2965
|
-
/// Returns a JSON string containing panic context if the last error was a panic,
|
|
2966
|
-
/// or nil if no panic context is available.
|
|
2967
|
-
///
|
|
2968
|
-
/// The JSON structure contains:
|
|
2969
|
-
/// - file: Source file where panic occurred
|
|
2970
|
-
/// - line: Line number
|
|
2971
|
-
/// - function: Function name
|
|
2972
|
-
/// - message: Panic message
|
|
2973
|
-
/// - timestamp_secs: Unix timestamp
|
|
2974
|
-
///
|
|
2975
|
-
/// @return [String, nil] JSON string with panic context or nil
|
|
2976
|
-
fn last_panic_context_json(ruby: &Ruby) -> Value {
|
|
2977
|
-
match get_panic_context() {
|
|
2978
|
-
Some(json) => ruby.str_new(&json).as_value(),
|
|
2979
|
-
None => ruby.qnil().as_value(),
|
|
2980
|
-
}
|
|
2981
|
-
}
|
|
2982
|
-
|
|
2983
|
-
/// Validates a binarization method string
|
|
2984
|
-
///
|
|
2985
|
-
/// @param method [String] The binarization method (e.g., "otsu", "adaptive", "sauvola")
|
|
2986
|
-
/// @return [Integer] 1 if valid, 0 if invalid (error message available via Kreuzberg::_last_error_code_native)
|
|
2987
|
-
fn validate_binarization_method(method: String) -> Result<i32, Error> {
|
|
2988
|
-
let c_method = std::ffi::CString::new(method).map_err(|_| runtime_error("Invalid method string"))?;
|
|
2989
|
-
|
|
2990
|
-
Ok(unsafe { kreuzberg_validate_binarization_method(c_method.as_ptr()) })
|
|
2991
|
-
}
|
|
2992
|
-
|
|
2993
|
-
/// Validates an OCR backend string
|
|
2994
|
-
///
|
|
2995
|
-
/// @param backend [String] The OCR backend (e.g., "tesseract", "easyocr", "paddleocr")
|
|
2996
|
-
/// @return [Integer] 1 if valid, 0 if invalid
|
|
2997
|
-
fn validate_ocr_backend(backend: String) -> Result<i32, Error> {
|
|
2998
|
-
let c_backend = std::ffi::CString::new(backend).map_err(|_| runtime_error("Invalid backend string"))?;
|
|
2999
|
-
|
|
3000
|
-
Ok(unsafe { kreuzberg_validate_ocr_backend(c_backend.as_ptr()) })
|
|
3001
|
-
}
|
|
3002
|
-
|
|
3003
|
-
/// Validates a language code (ISO 639-1 or 639-3)
|
|
3004
|
-
///
|
|
3005
|
-
/// @param code [String] The language code (e.g., "en", "eng", "de", "deu")
|
|
3006
|
-
/// @return [Integer] 1 if valid, 0 if invalid
|
|
3007
|
-
fn validate_language_code(code: String) -> Result<i32, Error> {
|
|
3008
|
-
let c_code = std::ffi::CString::new(code).map_err(|_| runtime_error("Invalid language code string"))?;
|
|
3009
|
-
|
|
3010
|
-
Ok(unsafe { kreuzberg_validate_language_code(c_code.as_ptr()) })
|
|
3011
|
-
}
|
|
3012
|
-
|
|
3013
|
-
/// Validates a token reduction level
|
|
3014
|
-
///
|
|
3015
|
-
/// @param level [String] The token reduction level (e.g., "off", "light", "moderate", "aggressive", "maximum")
|
|
3016
|
-
/// @return [Integer] 1 if valid, 0 if invalid
|
|
3017
|
-
fn validate_token_reduction_level(level: String) -> Result<i32, Error> {
|
|
3018
|
-
let c_level = std::ffi::CString::new(level).map_err(|_| runtime_error("Invalid token reduction level string"))?;
|
|
3019
|
-
|
|
3020
|
-
Ok(unsafe { kreuzberg_validate_token_reduction_level(c_level.as_ptr()) })
|
|
3021
|
-
}
|
|
3022
|
-
|
|
3023
|
-
/// Validates a tesseract PSM (Page Segmentation Mode) value
|
|
3024
|
-
///
|
|
3025
|
-
/// @param psm [Integer] The PSM value (0-13)
|
|
3026
|
-
/// @return [Integer] 1 if valid, 0 if invalid
|
|
3027
|
-
fn validate_tesseract_psm(psm: i32) -> Result<i32, Error> {
|
|
3028
|
-
Ok(kreuzberg_validate_tesseract_psm(psm))
|
|
3029
|
-
}
|
|
3030
|
-
|
|
3031
|
-
/// Validates a tesseract OEM (OCR Engine Mode) value
|
|
3032
|
-
///
|
|
3033
|
-
/// @param oem [Integer] The OEM value (0-3)
|
|
3034
|
-
/// @return [Integer] 1 if valid, 0 if invalid
|
|
3035
|
-
fn validate_tesseract_oem(oem: i32) -> Result<i32, Error> {
|
|
3036
|
-
Ok(kreuzberg_validate_tesseract_oem(oem))
|
|
3037
|
-
}
|
|
3038
|
-
|
|
3039
|
-
/// Validates an output format string
|
|
3040
|
-
///
|
|
3041
|
-
/// @param format [String] The output format (e.g., "text", "markdown")
|
|
3042
|
-
/// @return [Integer] 1 if valid, 0 if invalid
|
|
3043
|
-
fn validate_output_format(format: String) -> Result<i32, Error> {
|
|
3044
|
-
let c_format = std::ffi::CString::new(format).map_err(|_| runtime_error("Invalid format string"))?;
|
|
3045
|
-
|
|
3046
|
-
Ok(unsafe { kreuzberg_validate_output_format(c_format.as_ptr()) })
|
|
3047
|
-
}
|
|
3048
|
-
|
|
3049
|
-
/// Validates a confidence threshold value
|
|
3050
|
-
///
|
|
3051
|
-
/// @param confidence [Float] The confidence value (0.0-1.0)
|
|
3052
|
-
/// @return [Integer] 1 if valid, 0 if invalid
|
|
3053
|
-
fn validate_confidence(confidence: f64) -> Result<i32, Error> {
|
|
3054
|
-
Ok(kreuzberg_validate_confidence(confidence))
|
|
3055
|
-
}
|
|
3056
|
-
|
|
3057
|
-
/// Validates a DPI (dots per inch) value
|
|
3058
|
-
///
|
|
3059
|
-
/// @param dpi [Integer] The DPI value
|
|
3060
|
-
/// @return [Integer] 1 if valid, 0 if invalid
|
|
3061
|
-
fn validate_dpi(dpi: i32) -> Result<i32, Error> {
|
|
3062
|
-
Ok(kreuzberg_validate_dpi(dpi))
|
|
3063
|
-
}
|
|
3064
|
-
|
|
3065
|
-
/// Validates chunking parameters
|
|
3066
|
-
///
|
|
3067
|
-
/// @param max_chars [Integer] Maximum characters per chunk
|
|
3068
|
-
/// @param max_overlap [Integer] Maximum overlap between chunks
|
|
3069
|
-
/// @return [Integer] 1 if valid, 0 if invalid
|
|
3070
|
-
fn validate_chunking_params(max_chars: usize, max_overlap: usize) -> Result<i32, Error> {
|
|
3071
|
-
Ok(kreuzberg_validate_chunking_params(max_chars, max_overlap))
|
|
3072
|
-
}
|
|
3073
|
-
|
|
3074
|
-
/// Gets valid binarization methods as a JSON string
|
|
3075
|
-
///
|
|
3076
|
-
/// @return [String] JSON array of valid binarization methods
|
|
3077
|
-
fn get_valid_binarization_methods(_ruby: &Ruby) -> Result<String, Error> {
|
|
3078
|
-
let ptr = kreuzberg_get_valid_binarization_methods();
|
|
3079
|
-
if ptr.is_null() {
|
|
3080
|
-
return Err(runtime_error("Failed to get valid binarization methods"));
|
|
3081
|
-
}
|
|
3082
|
-
|
|
3083
|
-
let c_str = unsafe { std::ffi::CStr::from_ptr(ptr) };
|
|
3084
|
-
let result = c_str
|
|
3085
|
-
.to_str()
|
|
3086
|
-
.map_err(|_| runtime_error("Invalid UTF-8 in binarization methods"))?
|
|
3087
|
-
.to_string();
|
|
3088
|
-
|
|
3089
|
-
unsafe {
|
|
3090
|
-
kreuzberg_free_string(ptr as *mut c_char);
|
|
3091
|
-
}
|
|
3092
|
-
|
|
3093
|
-
Ok(result)
|
|
3094
|
-
}
|
|
3095
|
-
|
|
3096
|
-
/// Gets valid language codes as a JSON string
|
|
3097
|
-
///
|
|
3098
|
-
/// @return [String] JSON array of valid language codes
|
|
3099
|
-
fn get_valid_language_codes(_ruby: &Ruby) -> Result<String, Error> {
|
|
3100
|
-
let ptr = kreuzberg_get_valid_language_codes();
|
|
3101
|
-
if ptr.is_null() {
|
|
3102
|
-
return Err(runtime_error("Failed to get valid language codes"));
|
|
3103
|
-
}
|
|
3104
|
-
|
|
3105
|
-
let c_str = unsafe { std::ffi::CStr::from_ptr(ptr) };
|
|
3106
|
-
let result = c_str
|
|
3107
|
-
.to_str()
|
|
3108
|
-
.map_err(|_| runtime_error("Invalid UTF-8 in language codes"))?
|
|
3109
|
-
.to_string();
|
|
3110
|
-
|
|
3111
|
-
unsafe {
|
|
3112
|
-
kreuzberg_free_string(ptr as *mut c_char);
|
|
3113
|
-
}
|
|
3114
|
-
|
|
3115
|
-
Ok(result)
|
|
3116
|
-
}
|
|
3117
|
-
|
|
3118
|
-
/// Gets valid OCR backends as a JSON string
|
|
3119
|
-
///
|
|
3120
|
-
/// @return [String] JSON array of valid OCR backends
|
|
3121
|
-
fn get_valid_ocr_backends(_ruby: &Ruby) -> Result<String, Error> {
|
|
3122
|
-
let ptr = kreuzberg_get_valid_ocr_backends();
|
|
3123
|
-
if ptr.is_null() {
|
|
3124
|
-
return Err(runtime_error("Failed to get valid OCR backends"));
|
|
3125
|
-
}
|
|
3126
|
-
|
|
3127
|
-
let c_str = unsafe { std::ffi::CStr::from_ptr(ptr) };
|
|
3128
|
-
let result = c_str
|
|
3129
|
-
.to_str()
|
|
3130
|
-
.map_err(|_| runtime_error("Invalid UTF-8 in OCR backends"))?
|
|
3131
|
-
.to_string();
|
|
3132
|
-
|
|
3133
|
-
unsafe {
|
|
3134
|
-
kreuzberg_free_string(ptr as *mut c_char);
|
|
3135
|
-
}
|
|
3136
|
-
|
|
3137
|
-
Ok(result)
|
|
3138
|
-
}
|
|
3139
|
-
|
|
3140
|
-
/// Gets valid token reduction levels as a JSON string
|
|
3141
|
-
///
|
|
3142
|
-
/// @return [String] JSON array of valid token reduction levels
|
|
3143
|
-
fn get_valid_token_reduction_levels(_ruby: &Ruby) -> Result<String, Error> {
|
|
3144
|
-
let ptr = kreuzberg_get_valid_token_reduction_levels();
|
|
3145
|
-
if ptr.is_null() {
|
|
3146
|
-
return Err(runtime_error("Failed to get valid token reduction levels"));
|
|
3147
|
-
}
|
|
3148
|
-
|
|
3149
|
-
let c_str = unsafe { std::ffi::CStr::from_ptr(ptr) };
|
|
3150
|
-
let result = c_str
|
|
3151
|
-
.to_str()
|
|
3152
|
-
.map_err(|_| runtime_error("Invalid UTF-8 in token reduction levels"))?
|
|
3153
|
-
.to_string();
|
|
3154
|
-
|
|
3155
|
-
unsafe {
|
|
3156
|
-
kreuzberg_free_string(ptr as *mut c_char);
|
|
3157
|
-
}
|
|
3158
|
-
|
|
3159
|
-
Ok(result)
|
|
3160
|
-
}
|
|
3161
|
-
|
|
3162
|
-
/// Serialize a config to JSON string
|
|
3163
|
-
/// @param config_json [String] JSON string representing the config
|
|
3164
|
-
/// @return [String] Serialized JSON config
|
|
3165
|
-
fn config_to_json_wrapper(_ruby: &Ruby, config_json: String) -> Result<String, Error> {
|
|
3166
|
-
let c_json =
|
|
3167
|
-
std::ffi::CString::new(config_json).map_err(|e| runtime_error(format!("Invalid config JSON: {}", e)))?;
|
|
3168
|
-
|
|
3169
|
-
let config_ptr = unsafe { kreuzberg_config_from_json(c_json.as_ptr()) };
|
|
3170
|
-
if config_ptr.is_null() {
|
|
3171
|
-
return Err(runtime_error("Failed to parse config from JSON"));
|
|
3172
|
-
}
|
|
3173
|
-
|
|
3174
|
-
let json_ptr = unsafe { kreuzberg_config_to_json(config_ptr) };
|
|
3175
|
-
let result = if json_ptr.is_null() {
|
|
3176
|
-
Err(runtime_error("Failed to serialize config to JSON"))
|
|
3177
|
-
} else {
|
|
3178
|
-
let c_str = unsafe { std::ffi::CStr::from_ptr(json_ptr) };
|
|
3179
|
-
let json = c_str
|
|
3180
|
-
.to_str()
|
|
3181
|
-
.map_err(|_| runtime_error("Invalid UTF-8 in serialized config"))?
|
|
3182
|
-
.to_string();
|
|
3183
|
-
unsafe {
|
|
3184
|
-
kreuzberg_free_string(json_ptr as *mut c_char);
|
|
3185
|
-
}
|
|
3186
|
-
Ok(json)
|
|
3187
|
-
};
|
|
3188
|
-
|
|
3189
|
-
unsafe {
|
|
3190
|
-
kreuzberg_config_free(config_ptr);
|
|
3191
|
-
}
|
|
3192
|
-
result
|
|
3193
|
-
}
|
|
3194
|
-
|
|
3195
|
-
/// Get a field from config
|
|
3196
|
-
/// @param config_json [String] JSON string representing the config
|
|
3197
|
-
/// @param field_name [String] Field name (supports dot notation)
|
|
3198
|
-
/// @return [Object] Parsed JSON value, or nil if field doesn't exist
|
|
3199
|
-
fn config_get_field_wrapper(ruby: &Ruby, config_json: String, field_name: String) -> Result<Value, Error> {
|
|
3200
|
-
let c_json =
|
|
3201
|
-
std::ffi::CString::new(config_json).map_err(|e| runtime_error(format!("Invalid config JSON: {}", e)))?;
|
|
3202
|
-
let c_field =
|
|
3203
|
-
std::ffi::CString::new(field_name).map_err(|e| runtime_error(format!("Invalid field name: {}", e)))?;
|
|
3204
|
-
|
|
3205
|
-
let config_ptr = unsafe { kreuzberg_config_from_json(c_json.as_ptr()) };
|
|
3206
|
-
if config_ptr.is_null() {
|
|
3207
|
-
return Err(runtime_error("Failed to parse config from JSON"));
|
|
3208
|
-
}
|
|
3209
|
-
|
|
3210
|
-
let field_ptr = unsafe { kreuzberg_config_get_field(config_ptr, c_field.as_ptr()) };
|
|
3211
|
-
let result = if field_ptr.is_null() {
|
|
3212
|
-
Ok(ruby.qnil().as_value())
|
|
3213
|
-
} else {
|
|
3214
|
-
let c_str = unsafe { std::ffi::CStr::from_ptr(field_ptr) };
|
|
3215
|
-
let json_str = c_str
|
|
3216
|
-
.to_str()
|
|
3217
|
-
.map_err(|_| runtime_error("Invalid UTF-8 in field value"))?;
|
|
3218
|
-
let json_value: serde_json::Value =
|
|
3219
|
-
serde_json::from_str(json_str).map_err(|e| runtime_error(format!("Failed to parse field value: {}", e)))?;
|
|
3220
|
-
unsafe {
|
|
3221
|
-
kreuzberg_free_string(field_ptr as *mut c_char);
|
|
3222
|
-
}
|
|
3223
|
-
json_value_to_ruby(ruby, &json_value)
|
|
3224
|
-
};
|
|
3225
|
-
|
|
3226
|
-
unsafe {
|
|
3227
|
-
kreuzberg_config_free(config_ptr);
|
|
3228
|
-
}
|
|
3229
|
-
result
|
|
3230
|
-
}
|
|
3231
|
-
|
|
3232
|
-
/// Merge two configs
|
|
3233
|
-
/// @param base_json [String] Base config JSON
|
|
3234
|
-
/// @param override_json [String] Override config JSON
|
|
3235
|
-
/// @return [String] Merged config JSON
|
|
3236
|
-
fn config_merge_wrapper(_ruby: &Ruby, base_json: String, override_json: String) -> Result<String, Error> {
|
|
3237
|
-
let c_base =
|
|
3238
|
-
std::ffi::CString::new(base_json).map_err(|e| runtime_error(format!("Invalid base config JSON: {}", e)))?;
|
|
3239
|
-
let c_override = std::ffi::CString::new(override_json)
|
|
3240
|
-
.map_err(|e| runtime_error(format!("Invalid override config JSON: {}", e)))?;
|
|
3241
|
-
|
|
3242
|
-
let base_ptr = unsafe { kreuzberg_config_from_json(c_base.as_ptr()) };
|
|
3243
|
-
if base_ptr.is_null() {
|
|
3244
|
-
return Err(runtime_error("Failed to parse base config from JSON"));
|
|
3245
|
-
}
|
|
3246
|
-
|
|
3247
|
-
let override_ptr = unsafe { kreuzberg_config_from_json(c_override.as_ptr()) };
|
|
3248
|
-
if override_ptr.is_null() {
|
|
3249
|
-
unsafe {
|
|
3250
|
-
kreuzberg_config_free(base_ptr);
|
|
3251
|
-
}
|
|
3252
|
-
return Err(runtime_error("Failed to parse override config from JSON"));
|
|
3253
|
-
}
|
|
3254
|
-
|
|
3255
|
-
let merge_result = unsafe { kreuzberg_config_merge(base_ptr, override_ptr) };
|
|
3256
|
-
|
|
3257
|
-
let result = if merge_result == 0 {
|
|
3258
|
-
Err(runtime_error("Failed to merge configs"))
|
|
3259
|
-
} else {
|
|
3260
|
-
let json_ptr = unsafe { kreuzberg_config_to_json(base_ptr) };
|
|
3261
|
-
if json_ptr.is_null() {
|
|
3262
|
-
Err(runtime_error("Failed to serialize merged config"))
|
|
3263
|
-
} else {
|
|
3264
|
-
let c_str = unsafe { std::ffi::CStr::from_ptr(json_ptr) };
|
|
3265
|
-
let json = c_str
|
|
3266
|
-
.to_str()
|
|
3267
|
-
.map_err(|_| runtime_error("Invalid UTF-8 in merged config"))?
|
|
3268
|
-
.to_string();
|
|
3269
|
-
unsafe {
|
|
3270
|
-
kreuzberg_free_string(json_ptr as *mut c_char);
|
|
3271
|
-
}
|
|
3272
|
-
Ok(json)
|
|
3273
|
-
}
|
|
3274
|
-
};
|
|
3275
|
-
|
|
3276
|
-
unsafe {
|
|
3277
|
-
kreuzberg_config_free(base_ptr);
|
|
3278
|
-
kreuzberg_config_free(override_ptr);
|
|
3279
|
-
}
|
|
3280
|
-
result
|
|
3281
|
-
}
|
|
3282
|
-
|
|
3283
|
-
/// Get page count from result
|
|
3284
|
-
/// @param result_ptr [Integer] Opaque pointer to ExtractionResult (as integer)
|
|
3285
|
-
/// @return [Integer] Page count, or -1 on error
|
|
3286
|
-
fn result_page_count(_ruby: &Ruby, result_ptr: i64) -> Result<i32, Error> {
|
|
3287
|
-
if result_ptr == 0 {
|
|
3288
|
-
return Err(runtime_error("Invalid result pointer"));
|
|
3289
|
-
}
|
|
3290
|
-
|
|
3291
|
-
let page_count = unsafe { kreuzberg_result_get_page_count(result_ptr as *const RustExtractionResult) };
|
|
3292
|
-
|
|
3293
|
-
Ok(page_count)
|
|
3294
|
-
}
|
|
3295
|
-
|
|
3296
|
-
/// Get chunk count from result
|
|
3297
|
-
/// @param result_ptr [Integer] Opaque pointer to ExtractionResult (as integer)
|
|
3298
|
-
/// @return [Integer] Chunk count, or -1 on error
|
|
3299
|
-
fn result_chunk_count(_ruby: &Ruby, result_ptr: i64) -> Result<i32, Error> {
|
|
3300
|
-
if result_ptr == 0 {
|
|
3301
|
-
return Err(runtime_error("Invalid result pointer"));
|
|
3302
|
-
}
|
|
3303
|
-
|
|
3304
|
-
let chunk_count = unsafe { kreuzberg_result_get_chunk_count(result_ptr as *const RustExtractionResult) };
|
|
3305
|
-
|
|
3306
|
-
Ok(chunk_count)
|
|
3307
|
-
}
|
|
3308
|
-
|
|
3309
|
-
/// Get detected language from result
|
|
3310
|
-
/// @param result_ptr [Integer] Opaque pointer to ExtractionResult (as integer)
|
|
3311
|
-
/// @return [String, nil] Detected language code, or nil if not detected
|
|
3312
|
-
fn result_detected_language(_ruby: &Ruby, result_ptr: i64) -> Result<Value, Error> {
|
|
3313
|
-
if result_ptr == 0 {
|
|
3314
|
-
return Err(runtime_error("Invalid result pointer"));
|
|
3315
|
-
}
|
|
3316
|
-
|
|
3317
|
-
let lang_ptr = unsafe { kreuzberg_result_get_detected_language(result_ptr as *const RustExtractionResult) };
|
|
3318
|
-
|
|
3319
|
-
if lang_ptr.is_null() {
|
|
3320
|
-
return Ok(_ruby.qnil().as_value());
|
|
3321
|
-
}
|
|
3322
|
-
|
|
3323
|
-
let c_str = unsafe { std::ffi::CStr::from_ptr(lang_ptr) };
|
|
3324
|
-
let lang = c_str
|
|
3325
|
-
.to_str()
|
|
3326
|
-
.map_err(|_| runtime_error("Invalid UTF-8 in detected language"))?
|
|
3327
|
-
.to_string();
|
|
3328
|
-
|
|
3329
|
-
unsafe {
|
|
3330
|
-
kreuzberg_free_string(lang_ptr as *mut c_char);
|
|
3331
|
-
}
|
|
3332
|
-
|
|
3333
|
-
Ok(_ruby.str_new(&lang).into_value_with(_ruby))
|
|
3334
|
-
}
|
|
3335
|
-
|
|
3336
|
-
/// Get metadata field from result
|
|
3337
|
-
/// @param result_ptr [Integer] Opaque pointer to ExtractionResult (as integer)
|
|
3338
|
-
/// @param field_name [String] Field name (supports dot notation)
|
|
3339
|
-
/// @return [Object, nil] Parsed JSON value, or nil if field doesn't exist
|
|
3340
|
-
fn result_metadata_field(ruby: &Ruby, result_ptr: i64, field_name: String) -> Result<Value, Error> {
|
|
3341
|
-
if result_ptr == 0 {
|
|
3342
|
-
return Err(runtime_error("Invalid result pointer"));
|
|
3343
|
-
}
|
|
3344
|
-
|
|
3345
|
-
let c_field =
|
|
3346
|
-
std::ffi::CString::new(field_name).map_err(|e| runtime_error(format!("Invalid field name: {}", e)))?;
|
|
3347
|
-
|
|
3348
|
-
let field = unsafe { kreuzberg_result_get_metadata_field(result_ptr as *const RustExtractionResult, c_field.as_ptr()) };
|
|
3349
|
-
|
|
3350
|
-
if field.is_null != 0 {
|
|
3351
|
-
return Ok(ruby.qnil().as_value());
|
|
3352
|
-
}
|
|
3353
|
-
|
|
3354
|
-
if field.json_value.is_null() {
|
|
3355
|
-
return Ok(ruby.qnil().as_value());
|
|
3356
|
-
}
|
|
3357
|
-
|
|
3358
|
-
let c_str = unsafe { std::ffi::CStr::from_ptr(field.json_value) };
|
|
3359
|
-
let json_str = c_str
|
|
3360
|
-
.to_str()
|
|
3361
|
-
.map_err(|_| runtime_error("Invalid UTF-8 in field value"))?;
|
|
3362
|
-
let json_value: serde_json::Value =
|
|
3363
|
-
serde_json::from_str(json_str).map_err(|e| runtime_error(format!("Failed to parse field value: {}", e)))?;
|
|
3364
|
-
|
|
3365
|
-
unsafe {
|
|
3366
|
-
kreuzberg_free_string(field.json_value);
|
|
3367
|
-
}
|
|
3368
|
-
|
|
3369
|
-
json_value_to_ruby(ruby, &json_value)
|
|
3370
|
-
}
|
|
3371
|
-
|
|
3372
|
-
/// Get structured error details from FFI
|
|
3373
|
-
/// @return [Hash] Error details with keys: :message, :error_code, :error_type, :source_file, :source_function, :source_line, :context_info, :is_panic
|
|
3374
|
-
fn get_error_details_native(ruby: &Ruby) -> Result<Value, Error> {
|
|
3375
|
-
let details = kreuzberg_get_error_details();
|
|
3376
|
-
|
|
3377
|
-
let hash = ruby.hash_new();
|
|
3378
|
-
|
|
3379
|
-
unsafe {
|
|
3380
|
-
let message = if !details.message.is_null() {
|
|
3381
|
-
let c_str = std::ffi::CStr::from_ptr(details.message);
|
|
3382
|
-
let msg = c_str.to_str().unwrap_or("").to_string();
|
|
3383
|
-
kreuzberg_free_string(details.message);
|
|
3384
|
-
msg
|
|
3385
|
-
} else {
|
|
3386
|
-
String::new()
|
|
3387
|
-
};
|
|
3388
|
-
|
|
3389
|
-
let error_type = if !details.error_type.is_null() {
|
|
3390
|
-
let c_str = std::ffi::CStr::from_ptr(details.error_type);
|
|
3391
|
-
let ty = c_str.to_str().unwrap_or("unknown").to_string();
|
|
3392
|
-
kreuzberg_free_string(details.error_type);
|
|
3393
|
-
ty
|
|
3394
|
-
} else {
|
|
3395
|
-
"unknown".to_string()
|
|
3396
|
-
};
|
|
3397
|
-
|
|
3398
|
-
let source_file = if !details.source_file.is_null() {
|
|
3399
|
-
let c_str = std::ffi::CStr::from_ptr(details.source_file);
|
|
3400
|
-
let file = c_str.to_str().ok().map(|s| s.to_string());
|
|
3401
|
-
kreuzberg_free_string(details.source_file);
|
|
3402
|
-
file
|
|
3403
|
-
} else {
|
|
3404
|
-
None
|
|
3405
|
-
};
|
|
3406
|
-
|
|
3407
|
-
let source_function = if !details.source_function.is_null() {
|
|
3408
|
-
let c_str = std::ffi::CStr::from_ptr(details.source_function);
|
|
3409
|
-
let func = c_str.to_str().ok().map(|s| s.to_string());
|
|
3410
|
-
kreuzberg_free_string(details.source_function);
|
|
3411
|
-
func
|
|
3412
|
-
} else {
|
|
3413
|
-
None
|
|
3414
|
-
};
|
|
3415
|
-
|
|
3416
|
-
let context_info = if !details.context_info.is_null() {
|
|
3417
|
-
let c_str = std::ffi::CStr::from_ptr(details.context_info);
|
|
3418
|
-
let ctx = c_str.to_str().ok().map(|s| s.to_string());
|
|
3419
|
-
kreuzberg_free_string(details.context_info);
|
|
3420
|
-
ctx
|
|
3421
|
-
} else {
|
|
3422
|
-
None
|
|
3423
|
-
};
|
|
3424
|
-
|
|
3425
|
-
hash.aset(ruby.to_symbol("message"), ruby.str_new(&message).as_value())?;
|
|
3426
|
-
hash.aset(ruby.to_symbol("error_code"), details.error_code.into_value_with(ruby))?;
|
|
3427
|
-
hash.aset(ruby.to_symbol("error_type"), ruby.str_new(&error_type).as_value())?;
|
|
3428
|
-
|
|
3429
|
-
if let Some(file) = source_file {
|
|
3430
|
-
hash.aset(ruby.to_symbol("source_file"), ruby.str_new(&file).as_value())?;
|
|
3431
|
-
} else {
|
|
3432
|
-
hash.aset(ruby.to_symbol("source_file"), ruby.qnil().as_value())?;
|
|
3433
|
-
}
|
|
3434
|
-
|
|
3435
|
-
if let Some(func) = source_function {
|
|
3436
|
-
hash.aset(ruby.to_symbol("source_function"), ruby.str_new(&func).as_value())?;
|
|
3437
|
-
} else {
|
|
3438
|
-
hash.aset(ruby.to_symbol("source_function"), ruby.qnil().as_value())?;
|
|
3439
|
-
}
|
|
3440
|
-
|
|
3441
|
-
hash.aset(ruby.to_symbol("source_line"), details.source_line.into_value_with(ruby))?;
|
|
3442
|
-
|
|
3443
|
-
if let Some(ctx) = context_info {
|
|
3444
|
-
hash.aset(ruby.to_symbol("context_info"), ruby.str_new(&ctx).as_value())?;
|
|
3445
|
-
} else {
|
|
3446
|
-
hash.aset(ruby.to_symbol("context_info"), ruby.qnil().as_value())?;
|
|
3447
|
-
}
|
|
3448
|
-
|
|
3449
|
-
hash.aset(
|
|
3450
|
-
ruby.to_symbol("is_panic"),
|
|
3451
|
-
(details.is_panic != 0).into_value_with(ruby),
|
|
3452
|
-
)?;
|
|
3453
|
-
}
|
|
3454
|
-
|
|
3455
|
-
Ok(hash.into_value_with(ruby))
|
|
3456
|
-
}
|
|
3457
|
-
|
|
3458
|
-
/// Classify an error based on an error message string
|
|
3459
|
-
/// @param message [String] The error message to classify
|
|
3460
|
-
/// @return [Integer] Error code (0-7)
|
|
3461
|
-
fn classify_error_native(ruby: &Ruby, message: String) -> Result<Value, Error> {
|
|
3462
|
-
let c_message =
|
|
3463
|
-
std::ffi::CString::new(message).map_err(|e| runtime_error(format!("Invalid error message: {}", e)))?;
|
|
3464
|
-
|
|
3465
|
-
let code = unsafe { kreuzberg_classify_error(c_message.as_ptr()) };
|
|
3466
|
-
|
|
3467
|
-
Ok(code.into_value_with(ruby))
|
|
3468
|
-
}
|
|
3469
|
-
|
|
3470
|
-
/// Get the human-readable name of an error code
|
|
3471
|
-
/// @param code [Integer] Numeric error code (0-7)
|
|
3472
|
-
/// @return [String] Human-readable error code name
|
|
3473
|
-
fn error_code_name_native(ruby: &Ruby, code: u32) -> Result<Value, Error> {
|
|
3474
|
-
let name_ptr = kreuzberg_error_code_name(code);
|
|
3475
|
-
|
|
3476
|
-
if name_ptr.is_null() {
|
|
3477
|
-
return Ok(ruby.str_new("unknown").as_value());
|
|
3478
|
-
}
|
|
3479
|
-
|
|
3480
|
-
let c_str = unsafe { std::ffi::CStr::from_ptr(name_ptr) };
|
|
3481
|
-
let name = c_str.to_str().unwrap_or("unknown").to_string();
|
|
3482
|
-
|
|
3483
|
-
Ok(ruby.str_new(&name).as_value())
|
|
3484
|
-
}
|
|
3485
|
-
|
|
3486
|
-
/// Get the description of an error code
|
|
3487
|
-
/// @param code [Integer] Numeric error code (0-7)
|
|
3488
|
-
/// @return [String] Description of the error code
|
|
3489
|
-
fn error_code_description_native(ruby: &Ruby, code: u32) -> Result<Value, Error> {
|
|
3490
|
-
let desc_ptr = kreuzberg_error_code_description(code);
|
|
3491
|
-
|
|
3492
|
-
if desc_ptr.is_null() {
|
|
3493
|
-
return Ok(ruby.str_new("Unknown error code").as_value());
|
|
3494
|
-
}
|
|
3495
|
-
|
|
3496
|
-
let c_str = unsafe { std::ffi::CStr::from_ptr(desc_ptr) };
|
|
3497
|
-
let desc = c_str.to_str().unwrap_or("Unknown error code").to_string();
|
|
3498
|
-
|
|
3499
|
-
Ok(ruby.str_new(&desc).as_value())
|
|
3500
|
-
}
|
|
3501
|
-
|
|
3502
2697
|
/// Initialize the Kreuzberg Ruby module
|
|
3503
2698
|
#[magnus::init]
|
|
3504
2699
|
fn init(ruby: &Ruby) -> Result<(), Error> {
|
|
@@ -3547,66 +2742,8 @@ fn init(ruby: &Ruby) -> Result<(), Error> {
|
|
|
3547
2742
|
module.define_module_function("get_extensions_for_mime", function!(get_extensions_for_mime_native, 1))?;
|
|
3548
2743
|
module.define_module_function("validate_mime_type", function!(validate_mime_type_native, 1))?;
|
|
3549
2744
|
|
|
3550
|
-
|
|
3551
|
-
|
|
3552
|
-
module.define_module_function("list_embedding_presets", function!(list_embedding_presets, 0))?;
|
|
3553
|
-
module.define_module_function("get_embedding_preset", function!(get_embedding_preset, 1))?;
|
|
3554
|
-
}
|
|
3555
|
-
|
|
3556
|
-
module.define_module_function("_last_error_code_native", function!(last_error_code, 0))?;
|
|
3557
|
-
module.define_module_function("_last_panic_context_json_native", function!(last_panic_context_json, 0))?;
|
|
3558
|
-
|
|
3559
|
-
module.define_module_function(
|
|
3560
|
-
"_validate_binarization_method_native",
|
|
3561
|
-
function!(validate_binarization_method, 1),
|
|
3562
|
-
)?;
|
|
3563
|
-
module.define_module_function("_validate_ocr_backend_native", function!(validate_ocr_backend, 1))?;
|
|
3564
|
-
module.define_module_function("_validate_language_code_native", function!(validate_language_code, 1))?;
|
|
3565
|
-
module.define_module_function(
|
|
3566
|
-
"_validate_token_reduction_level_native",
|
|
3567
|
-
function!(validate_token_reduction_level, 1),
|
|
3568
|
-
)?;
|
|
3569
|
-
module.define_module_function("_validate_tesseract_psm_native", function!(validate_tesseract_psm, 1))?;
|
|
3570
|
-
module.define_module_function("_validate_tesseract_oem_native", function!(validate_tesseract_oem, 1))?;
|
|
3571
|
-
module.define_module_function("_validate_output_format_native", function!(validate_output_format, 1))?;
|
|
3572
|
-
module.define_module_function("_validate_confidence_native", function!(validate_confidence, 1))?;
|
|
3573
|
-
module.define_module_function("_validate_dpi_native", function!(validate_dpi, 1))?;
|
|
3574
|
-
module.define_module_function(
|
|
3575
|
-
"_validate_chunking_params_native",
|
|
3576
|
-
function!(validate_chunking_params, 2),
|
|
3577
|
-
)?;
|
|
3578
|
-
module.define_module_function(
|
|
3579
|
-
"_get_valid_binarization_methods_native",
|
|
3580
|
-
function!(get_valid_binarization_methods, 0),
|
|
3581
|
-
)?;
|
|
3582
|
-
module.define_module_function(
|
|
3583
|
-
"_get_valid_language_codes_native",
|
|
3584
|
-
function!(get_valid_language_codes, 0),
|
|
3585
|
-
)?;
|
|
3586
|
-
module.define_module_function("_get_valid_ocr_backends_native", function!(get_valid_ocr_backends, 0))?;
|
|
3587
|
-
module.define_module_function(
|
|
3588
|
-
"_get_valid_token_reduction_levels_native",
|
|
3589
|
-
function!(get_valid_token_reduction_levels, 0),
|
|
3590
|
-
)?;
|
|
3591
|
-
|
|
3592
|
-
module.define_module_function("_config_to_json_native", function!(config_to_json_wrapper, 1))?;
|
|
3593
|
-
module.define_module_function("_config_get_field_native", function!(config_get_field_wrapper, 2))?;
|
|
3594
|
-
module.define_module_function("_config_merge_native", function!(config_merge_wrapper, 2))?;
|
|
3595
|
-
module.define_module_function("_result_page_count_native", function!(result_page_count, 1))?;
|
|
3596
|
-
module.define_module_function("_result_chunk_count_native", function!(result_chunk_count, 1))?;
|
|
3597
|
-
module.define_module_function(
|
|
3598
|
-
"_result_detected_language_native",
|
|
3599
|
-
function!(result_detected_language, 1),
|
|
3600
|
-
)?;
|
|
3601
|
-
module.define_module_function("_result_metadata_field_native", function!(result_metadata_field, 2))?;
|
|
3602
|
-
|
|
3603
|
-
module.define_module_function("_get_error_details_native", function!(get_error_details_native, 0))?;
|
|
3604
|
-
module.define_module_function("_classify_error_native", function!(classify_error_native, 1))?;
|
|
3605
|
-
module.define_module_function("_error_code_name_native", function!(error_code_name_native, 1))?;
|
|
3606
|
-
module.define_module_function(
|
|
3607
|
-
"_error_code_description_native",
|
|
3608
|
-
function!(error_code_description_native, 1),
|
|
3609
|
-
)?;
|
|
2745
|
+
module.define_module_function("list_embedding_presets", function!(list_embedding_presets, 0))?;
|
|
2746
|
+
module.define_module_function("get_embedding_preset", function!(get_embedding_preset, 1))?;
|
|
3610
2747
|
|
|
3611
2748
|
Ok(())
|
|
3612
2749
|
}
|