kreuzberg 4.0.0.pre.rc.29 → 4.0.0.rc1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +0 -6
- data/.rubocop.yaml +534 -1
- data/Gemfile +2 -1
- data/Gemfile.lock +28 -116
- data/README.md +269 -629
- data/Rakefile +0 -9
- data/Steepfile +4 -8
- data/examples/async_patterns.rb +58 -1
- data/ext/kreuzberg_rb/extconf.rb +5 -35
- data/ext/kreuzberg_rb/native/Cargo.toml +16 -55
- data/ext/kreuzberg_rb/native/build.rs +14 -12
- data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
- data/ext/kreuzberg_rb/native/include/strings.h +2 -2
- data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
- data/ext/kreuzberg_rb/native/src/lib.rs +34 -897
- data/extconf.rb +6 -38
- data/kreuzberg.gemspec +20 -114
- data/lib/kreuzberg/api_proxy.rb +18 -2
- data/lib/kreuzberg/cache_api.rb +0 -22
- data/lib/kreuzberg/cli.rb +10 -2
- data/lib/kreuzberg/cli_proxy.rb +10 -0
- data/lib/kreuzberg/config.rb +22 -274
- data/lib/kreuzberg/errors.rb +7 -73
- data/lib/kreuzberg/extraction_api.rb +8 -237
- data/lib/kreuzberg/mcp_proxy.rb +11 -2
- data/lib/kreuzberg/ocr_backend_protocol.rb +73 -0
- data/lib/kreuzberg/post_processor_protocol.rb +71 -0
- data/lib/kreuzberg/result.rb +33 -151
- data/lib/kreuzberg/setup_lib_path.rb +2 -22
- data/lib/kreuzberg/validator_protocol.rb +73 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +13 -27
- data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
- data/sig/kreuzberg.rbs +12 -105
- data/spec/binding/cache_spec.rb +22 -22
- data/spec/binding/cli_proxy_spec.rb +4 -2
- data/spec/binding/cli_spec.rb +11 -12
- data/spec/binding/config_spec.rb +0 -74
- data/spec/binding/config_validation_spec.rb +6 -100
- data/spec/binding/error_handling_spec.rb +97 -283
- data/spec/binding/plugins/ocr_backend_spec.rb +8 -8
- data/spec/binding/plugins/postprocessor_spec.rb +11 -11
- data/spec/binding/plugins/validator_spec.rb +13 -12
- data/spec/examples.txt +104 -0
- data/spec/fixtures/config.toml +1 -0
- data/spec/fixtures/config.yaml +1 -0
- data/spec/fixtures/invalid_config.toml +1 -0
- data/spec/smoke/package_spec.rb +3 -2
- data/spec/spec_helper.rb +3 -1
- data/vendor/kreuzberg/Cargo.toml +67 -192
- data/vendor/kreuzberg/README.md +9 -97
- data/vendor/kreuzberg/build.rs +194 -516
- data/vendor/kreuzberg/src/api/handlers.rs +9 -130
- data/vendor/kreuzberg/src/api/mod.rs +3 -18
- data/vendor/kreuzberg/src/api/server.rs +71 -236
- data/vendor/kreuzberg/src/api/types.rs +7 -43
- data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
- data/vendor/kreuzberg/src/cache/mod.rs +3 -27
- data/vendor/kreuzberg/src/chunking/mod.rs +79 -1705
- data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
- data/vendor/kreuzberg/src/core/config.rs +23 -905
- data/vendor/kreuzberg/src/core/extractor.rs +106 -403
- data/vendor/kreuzberg/src/core/io.rs +2 -4
- data/vendor/kreuzberg/src/core/mime.rs +12 -2
- data/vendor/kreuzberg/src/core/mod.rs +3 -22
- data/vendor/kreuzberg/src/core/pipeline.rs +78 -395
- data/vendor/kreuzberg/src/embeddings.rs +21 -169
- data/vendor/kreuzberg/src/error.rs +2 -2
- data/vendor/kreuzberg/src/extraction/archive.rs +31 -36
- data/vendor/kreuzberg/src/extraction/docx.rs +1 -365
- data/vendor/kreuzberg/src/extraction/email.rs +11 -12
- data/vendor/kreuzberg/src/extraction/excel.rs +129 -138
- data/vendor/kreuzberg/src/extraction/html.rs +170 -1447
- data/vendor/kreuzberg/src/extraction/image.rs +14 -138
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +3 -13
- data/vendor/kreuzberg/src/extraction/mod.rs +5 -21
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
- data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
- data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
- data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
- data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
- data/vendor/kreuzberg/src/extraction/pptx.rs +94 -196
- data/vendor/kreuzberg/src/extraction/structured.rs +4 -5
- data/vendor/kreuzberg/src/extraction/table.rs +1 -2
- data/vendor/kreuzberg/src/extraction/text.rs +10 -18
- data/vendor/kreuzberg/src/extractors/archive.rs +0 -22
- data/vendor/kreuzberg/src/extractors/docx.rs +148 -69
- data/vendor/kreuzberg/src/extractors/email.rs +9 -37
- data/vendor/kreuzberg/src/extractors/excel.rs +40 -81
- data/vendor/kreuzberg/src/extractors/html.rs +173 -182
- data/vendor/kreuzberg/src/extractors/image.rs +8 -32
- data/vendor/kreuzberg/src/extractors/mod.rs +10 -171
- data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
- data/vendor/kreuzberg/src/extractors/pdf.rs +64 -329
- data/vendor/kreuzberg/src/extractors/pptx.rs +34 -79
- data/vendor/kreuzberg/src/extractors/structured.rs +0 -16
- data/vendor/kreuzberg/src/extractors/text.rs +7 -30
- data/vendor/kreuzberg/src/extractors/xml.rs +8 -27
- data/vendor/kreuzberg/src/keywords/processor.rs +1 -9
- data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
- data/vendor/kreuzberg/src/language_detection/mod.rs +51 -94
- data/vendor/kreuzberg/src/lib.rs +5 -17
- data/vendor/kreuzberg/src/mcp/mod.rs +1 -4
- data/vendor/kreuzberg/src/mcp/server.rs +21 -145
- data/vendor/kreuzberg/src/ocr/mod.rs +0 -2
- data/vendor/kreuzberg/src/ocr/processor.rs +8 -19
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +0 -2
- data/vendor/kreuzberg/src/pdf/error.rs +1 -93
- data/vendor/kreuzberg/src/pdf/metadata.rs +100 -263
- data/vendor/kreuzberg/src/pdf/mod.rs +2 -33
- data/vendor/kreuzberg/src/pdf/rendering.rs +12 -12
- data/vendor/kreuzberg/src/pdf/table.rs +64 -61
- data/vendor/kreuzberg/src/pdf/text.rs +24 -416
- data/vendor/kreuzberg/src/plugins/extractor.rs +8 -40
- data/vendor/kreuzberg/src/plugins/mod.rs +0 -3
- data/vendor/kreuzberg/src/plugins/ocr.rs +14 -22
- data/vendor/kreuzberg/src/plugins/processor.rs +1 -10
- data/vendor/kreuzberg/src/plugins/registry.rs +0 -15
- data/vendor/kreuzberg/src/plugins/validator.rs +8 -20
- data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
- data/vendor/kreuzberg/src/text/mod.rs +0 -8
- data/vendor/kreuzberg/src/text/quality.rs +15 -28
- data/vendor/kreuzberg/src/text/string_utils.rs +10 -22
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +50 -86
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +16 -37
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +1 -2
- data/vendor/kreuzberg/src/types.rs +67 -907
- data/vendor/kreuzberg/src/utils/mod.rs +0 -14
- data/vendor/kreuzberg/src/utils/quality.rs +3 -12
- data/vendor/kreuzberg/tests/api_tests.rs +0 -506
- data/vendor/kreuzberg/tests/archive_integration.rs +0 -2
- data/vendor/kreuzberg/tests/batch_orchestration.rs +12 -57
- data/vendor/kreuzberg/tests/batch_processing.rs +8 -32
- data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +8 -40
- data/vendor/kreuzberg/tests/config_features.rs +1 -33
- data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -16
- data/vendor/kreuzberg/tests/core_integration.rs +9 -35
- data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
- data/vendor/kreuzberg/tests/email_integration.rs +1 -3
- data/vendor/kreuzberg/tests/error_handling.rs +34 -43
- data/vendor/kreuzberg/tests/format_integration.rs +1 -7
- data/vendor/kreuzberg/tests/helpers/mod.rs +0 -60
- data/vendor/kreuzberg/tests/image_integration.rs +0 -2
- data/vendor/kreuzberg/tests/mime_detection.rs +16 -17
- data/vendor/kreuzberg/tests/ocr_configuration.rs +0 -4
- data/vendor/kreuzberg/tests/ocr_errors.rs +0 -22
- data/vendor/kreuzberg/tests/ocr_quality.rs +0 -2
- data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
- data/vendor/kreuzberg/tests/pdf_integration.rs +0 -2
- data/vendor/kreuzberg/tests/pipeline_integration.rs +2 -36
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +0 -5
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -17
- data/vendor/kreuzberg/tests/plugin_system.rs +0 -6
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -2
- data/vendor/kreuzberg/tests/security_validation.rs +1 -13
- data/vendor/kreuzberg/tests/test_fastembed.rs +23 -45
- metadata +25 -171
- data/.rubocop.yml +0 -543
- data/ext/kreuzberg_rb/native/.cargo/config.toml +0 -23
- data/ext/kreuzberg_rb/native/Cargo.lock +0 -7619
- data/lib/kreuzberg/error_context.rb +0 -136
- data/lib/kreuzberg/types.rb +0 -170
- data/lib/libpdfium.so +0 -0
- data/spec/binding/async_operations_spec.rb +0 -473
- data/spec/binding/batch_operations_spec.rb +0 -595
- data/spec/binding/batch_spec.rb +0 -359
- data/spec/binding/config_result_spec.rb +0 -377
- data/spec/binding/embeddings_spec.rb +0 -816
- data/spec/binding/error_recovery_spec.rb +0 -488
- data/spec/binding/font_config_spec.rb +0 -220
- data/spec/binding/images_spec.rb +0 -738
- data/spec/binding/keywords_extraction_spec.rb +0 -600
- data/spec/binding/metadata_types_spec.rb +0 -1228
- data/spec/binding/pages_extraction_spec.rb +0 -471
- data/spec/binding/tables_spec.rb +0 -641
- data/spec/unit/config/chunking_config_spec.rb +0 -213
- data/spec/unit/config/embedding_config_spec.rb +0 -343
- data/spec/unit/config/extraction_config_spec.rb +0 -438
- data/spec/unit/config/font_config_spec.rb +0 -285
- data/spec/unit/config/hierarchy_config_spec.rb +0 -314
- data/spec/unit/config/image_extraction_config_spec.rb +0 -209
- data/spec/unit/config/image_preprocessing_config_spec.rb +0 -249
- data/spec/unit/config/keyword_config_spec.rb +0 -229
- data/spec/unit/config/language_detection_config_spec.rb +0 -258
- data/spec/unit/config/ocr_config_spec.rb +0 -171
- data/spec/unit/config/page_config_spec.rb +0 -221
- data/spec/unit/config/pdf_config_spec.rb +0 -267
- data/spec/unit/config/postprocessor_config_spec.rb +0 -290
- data/spec/unit/config/tesseract_config_spec.rb +0 -181
- data/spec/unit/config/token_reduction_config_spec.rb +0 -251
- data/test/metadata_types_test.rb +0 -959
- data/vendor/Cargo.toml +0 -61
- data/vendor/kreuzberg/examples/bench_fixes.rs +0 -71
- data/vendor/kreuzberg/examples/test_pdfium_fork.rs +0 -62
- data/vendor/kreuzberg/src/chunking/processor.rs +0 -219
- data/vendor/kreuzberg/src/core/batch_optimizations.rs +0 -385
- data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
- data/vendor/kreuzberg/src/core/formats.rs +0 -235
- data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
- data/vendor/kreuzberg/src/extraction/capacity.rs +0 -263
- data/vendor/kreuzberg/src/extraction/markdown.rs +0 -216
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -284
- data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -470
- data/vendor/kreuzberg/src/extractors/docbook.rs +0 -504
- data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -492
- data/vendor/kreuzberg/src/extractors/jats.rs +0 -1054
- data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -368
- data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
- data/vendor/kreuzberg/src/extractors/markdown.rs +0 -701
- data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
- data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
- data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -529
- data/vendor/kreuzberg/src/extractors/rst.rs +0 -577
- data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
- data/vendor/kreuzberg/src/extractors/security.rs +0 -484
- data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
- data/vendor/kreuzberg/src/extractors/typst.rs +0 -651
- data/vendor/kreuzberg/src/language_detection/processor.rs +0 -218
- data/vendor/kreuzberg/src/ocr/language_registry.rs +0 -520
- data/vendor/kreuzberg/src/panic_context.rs +0 -154
- data/vendor/kreuzberg/src/pdf/bindings.rs +0 -306
- data/vendor/kreuzberg/src/pdf/bundled.rs +0 -408
- data/vendor/kreuzberg/src/pdf/fonts.rs +0 -358
- data/vendor/kreuzberg/src/pdf/hierarchy.rs +0 -903
- data/vendor/kreuzberg/src/text/quality_processor.rs +0 -231
- data/vendor/kreuzberg/src/text/utf8_validation.rs +0 -193
- data/vendor/kreuzberg/src/utils/pool.rs +0 -503
- data/vendor/kreuzberg/src/utils/pool_sizing.rs +0 -364
- data/vendor/kreuzberg/src/utils/string_pool.rs +0 -761
- data/vendor/kreuzberg/tests/api_embed.rs +0 -360
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
- data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +0 -471
- data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +0 -289
- data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +0 -154
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
- data/vendor/kreuzberg/tests/config_integration_test.rs +0 -753
- data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +0 -294
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -500
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
- data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
- data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
- data/vendor/kreuzberg/tests/ocr_language_registry.rs +0 -191
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -674
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
- data/vendor/kreuzberg/tests/page_markers.rs +0 -297
- data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +0 -301
- data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +0 -589
- data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +0 -301
- data/vendor/kreuzberg/tests/pdf_text_merging.rs +0 -475
- data/vendor/kreuzberg/tests/pdfium_linking.rs +0 -340
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -694
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -775
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1260
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -648
- data/vendor/kreuzberg-ffi/Cargo.toml +0 -67
- data/vendor/kreuzberg-ffi/README.md +0 -851
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +0 -227
- data/vendor/kreuzberg-ffi/build.rs +0 -168
- data/vendor/kreuzberg-ffi/cbindgen.toml +0 -37
- data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +0 -12
- data/vendor/kreuzberg-ffi/kreuzberg.h +0 -3012
- data/vendor/kreuzberg-ffi/src/batch_streaming.rs +0 -588
- data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
- data/vendor/kreuzberg-ffi/src/error.rs +0 -901
- data/vendor/kreuzberg-ffi/src/extraction.rs +0 -555
- data/vendor/kreuzberg-ffi/src/helpers.rs +0 -879
- data/vendor/kreuzberg-ffi/src/lib.rs +0 -977
- data/vendor/kreuzberg-ffi/src/memory.rs +0 -493
- data/vendor/kreuzberg-ffi/src/mime.rs +0 -329
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +0 -265
- data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +0 -442
- data/vendor/kreuzberg-ffi/src/plugins/mod.rs +0 -14
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +0 -628
- data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +0 -438
- data/vendor/kreuzberg-ffi/src/plugins/validator.rs +0 -329
- data/vendor/kreuzberg-ffi/src/result.rs +0 -510
- data/vendor/kreuzberg-ffi/src/result_pool.rs +0 -639
- data/vendor/kreuzberg-ffi/src/result_view.rs +0 -773
- data/vendor/kreuzberg-ffi/src/string_intern.rs +0 -568
- data/vendor/kreuzberg-ffi/src/types.rs +0 -363
- data/vendor/kreuzberg-ffi/src/util.rs +0 -210
- data/vendor/kreuzberg-ffi/src/validation.rs +0 -848
- data/vendor/kreuzberg-ffi/tests.disabled/README.md +0 -48
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +0 -299
- data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +0 -346
- data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +0 -232
- data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +0 -470
- data/vendor/kreuzberg-tesseract/.commitlintrc.json +0 -13
- data/vendor/kreuzberg-tesseract/.crate-ignore +0 -2
- data/vendor/kreuzberg-tesseract/Cargo.lock +0 -2933
- data/vendor/kreuzberg-tesseract/Cargo.toml +0 -57
- data/vendor/kreuzberg-tesseract/LICENSE +0 -22
- data/vendor/kreuzberg-tesseract/README.md +0 -399
- data/vendor/kreuzberg-tesseract/build.rs +0 -1127
- data/vendor/kreuzberg-tesseract/patches/README.md +0 -71
- data/vendor/kreuzberg-tesseract/patches/tesseract.diff +0 -199
- data/vendor/kreuzberg-tesseract/src/api.rs +0 -1371
- data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +0 -77
- data/vendor/kreuzberg-tesseract/src/enums.rs +0 -297
- data/vendor/kreuzberg-tesseract/src/error.rs +0 -81
- data/vendor/kreuzberg-tesseract/src/lib.rs +0 -145
- data/vendor/kreuzberg-tesseract/src/monitor.rs +0 -57
- data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +0 -197
- data/vendor/kreuzberg-tesseract/src/page_iterator.rs +0 -253
- data/vendor/kreuzberg-tesseract/src/result_iterator.rs +0 -286
- data/vendor/kreuzberg-tesseract/src/result_renderer.rs +0 -183
- data/vendor/kreuzberg-tesseract/tests/integration_test.rs +0 -211
|
@@ -1,848 +0,0 @@
|
|
|
1
|
-
//! Configuration validation FFI module.
|
|
2
|
-
//!
|
|
3
|
-
//! Exposes validation functions from `kreuzberg::core::config_validation` through C FFI,
|
|
4
|
-
//! allowing all language bindings (Python, Java, TypeScript, Ruby, Go, C#) to validate
|
|
5
|
-
//! configuration values without duplicating validation logic.
|
|
6
|
-
//!
|
|
7
|
-
//! # Return Values
|
|
8
|
-
//!
|
|
9
|
-
//! All validator functions return:
|
|
10
|
-
//! - `1` if the value is valid
|
|
11
|
-
//! - `0` if the value is invalid (with error message set via `set_last_error()`)
|
|
12
|
-
//!
|
|
13
|
-
//! # String Functions
|
|
14
|
-
//!
|
|
15
|
-
//! List functions return JSON-encoded strings as `*mut c_char` that MUST be freed by the caller
|
|
16
|
-
//! using `kreuzberg_free_string()`.
|
|
17
|
-
//!
|
|
18
|
-
//! # Examples (in C)
|
|
19
|
-
//!
|
|
20
|
-
//! ```c
|
|
21
|
-
//! // Validate a string parameter
|
|
22
|
-
//! if (kreuzberg_validate_binarization_method("otsu") == 1) {
|
|
23
|
-
//! printf("Valid!\n");
|
|
24
|
-
//! } else {
|
|
25
|
-
//! printf("Error: %s\n", kreuzberg_get_last_error_message());
|
|
26
|
-
//! }
|
|
27
|
-
//!
|
|
28
|
-
//! // Get valid options
|
|
29
|
-
//! char* methods = kreuzberg_get_valid_binarization_methods();
|
|
30
|
-
//! // Use it...
|
|
31
|
-
//! kreuzberg_free_string(methods);
|
|
32
|
-
//! ```
|
|
33
|
-
|
|
34
|
-
use std::ffi::CStr;
|
|
35
|
-
use std::os::raw::c_char;
|
|
36
|
-
|
|
37
|
-
use kreuzberg::core::config_validation::{
|
|
38
|
-
validate_binarization_method, validate_chunking_params, validate_confidence, validate_dpi, validate_language_code,
|
|
39
|
-
validate_ocr_backend, validate_output_format, validate_tesseract_oem, validate_tesseract_psm,
|
|
40
|
-
validate_token_reduction_level,
|
|
41
|
-
};
|
|
42
|
-
|
|
43
|
-
use crate::set_last_error;
|
|
44
|
-
|
|
45
|
-
const VALID_BINARIZATION_METHODS: &[&str] = &["otsu", "adaptive", "sauvola"];
|
|
46
|
-
const VALID_TOKEN_REDUCTION_LEVELS: &[&str] = &["off", "light", "moderate", "aggressive", "maximum"];
|
|
47
|
-
const VALID_OCR_BACKENDS: &[&str] = &["tesseract", "easyocr", "paddleocr"];
|
|
48
|
-
const VALID_LANGUAGE_CODES: &[&str] = &[
|
|
49
|
-
"en", "de", "fr", "es", "it", "pt", "nl", "pl", "ru", "zh", "ja", "ko", "bg", "cs", "da", "el", "et", "fi", "hu",
|
|
50
|
-
"lt", "lv", "ro", "sk", "sl", "sv", "uk", "ar", "hi", "th", "tr", "vi", "eng", "deu", "fra", "spa", "ita", "por",
|
|
51
|
-
"nld", "pol", "rus", "zho", "jpn", "kor", "ces", "dan", "ell", "est", "fin", "hun", "lit", "lav", "ron", "slk",
|
|
52
|
-
"slv", "swe", "tur",
|
|
53
|
-
];
|
|
54
|
-
|
|
55
|
-
/// Validates a binarization method string.
|
|
56
|
-
///
|
|
57
|
-
/// # Arguments
|
|
58
|
-
///
|
|
59
|
-
/// * `method` - C string containing the binarization method (e.g., "otsu", "adaptive", "sauvola")
|
|
60
|
-
///
|
|
61
|
-
/// # Returns
|
|
62
|
-
///
|
|
63
|
-
/// - `1` if valid
|
|
64
|
-
/// - `0` if invalid (error message available via `kreuzberg_get_last_error_message()`)
|
|
65
|
-
///
|
|
66
|
-
/// # Safety
|
|
67
|
-
///
|
|
68
|
-
/// * `method` must be a valid pointer to a null-terminated UTF-8 string
|
|
69
|
-
/// * `method` cannot be NULL
|
|
70
|
-
/// * The string must be valid for the duration of the call
|
|
71
|
-
///
|
|
72
|
-
/// # C Signature
|
|
73
|
-
///
|
|
74
|
-
/// ```c
|
|
75
|
-
/// int32_t kreuzberg_validate_binarization_method(const char* method);
|
|
76
|
-
/// ```
|
|
77
|
-
#[unsafe(no_mangle)]
|
|
78
|
-
pub unsafe extern "C" fn kreuzberg_validate_binarization_method(method: *const c_char) -> i32 {
|
|
79
|
-
if method.is_null() {
|
|
80
|
-
set_last_error("method cannot be NULL".to_string());
|
|
81
|
-
return 0;
|
|
82
|
-
}
|
|
83
|
-
|
|
84
|
-
let method_str = match unsafe { CStr::from_ptr(method) }.to_str() {
|
|
85
|
-
Ok(s) => s,
|
|
86
|
-
Err(_) => {
|
|
87
|
-
set_last_error("Invalid UTF-8 in method".to_string());
|
|
88
|
-
return 0;
|
|
89
|
-
}
|
|
90
|
-
};
|
|
91
|
-
|
|
92
|
-
match validate_binarization_method(method_str) {
|
|
93
|
-
Ok(()) => 1,
|
|
94
|
-
Err(e) => {
|
|
95
|
-
set_last_error(e.to_string());
|
|
96
|
-
0
|
|
97
|
-
}
|
|
98
|
-
}
|
|
99
|
-
}
|
|
100
|
-
|
|
101
|
-
/// Validates an OCR backend string.
|
|
102
|
-
///
|
|
103
|
-
/// # Arguments
|
|
104
|
-
///
|
|
105
|
-
/// * `backend` - C string containing the OCR backend (e.g., "tesseract", "easyocr", "paddleocr")
|
|
106
|
-
///
|
|
107
|
-
/// # Returns
|
|
108
|
-
///
|
|
109
|
-
/// - `1` if valid
|
|
110
|
-
/// - `0` if invalid (error message available via `kreuzberg_get_last_error_message()`)
|
|
111
|
-
///
|
|
112
|
-
/// # Safety
|
|
113
|
-
///
|
|
114
|
-
/// * `backend` must be a valid pointer to a null-terminated UTF-8 string
|
|
115
|
-
/// * `backend` cannot be NULL
|
|
116
|
-
/// * The string must be valid for the duration of the call
|
|
117
|
-
///
|
|
118
|
-
/// # C Signature
|
|
119
|
-
///
|
|
120
|
-
/// ```c
|
|
121
|
-
/// int32_t kreuzberg_validate_ocr_backend(const char* backend);
|
|
122
|
-
/// ```
|
|
123
|
-
#[unsafe(no_mangle)]
|
|
124
|
-
pub unsafe extern "C" fn kreuzberg_validate_ocr_backend(backend: *const c_char) -> i32 {
|
|
125
|
-
if backend.is_null() {
|
|
126
|
-
set_last_error("backend cannot be NULL".to_string());
|
|
127
|
-
return 0;
|
|
128
|
-
}
|
|
129
|
-
|
|
130
|
-
let backend_str = match unsafe { CStr::from_ptr(backend) }.to_str() {
|
|
131
|
-
Ok(s) => s,
|
|
132
|
-
Err(_) => {
|
|
133
|
-
set_last_error("Invalid UTF-8 in backend".to_string());
|
|
134
|
-
return 0;
|
|
135
|
-
}
|
|
136
|
-
};
|
|
137
|
-
|
|
138
|
-
match validate_ocr_backend(backend_str) {
|
|
139
|
-
Ok(()) => 1,
|
|
140
|
-
Err(e) => {
|
|
141
|
-
set_last_error(e.to_string());
|
|
142
|
-
0
|
|
143
|
-
}
|
|
144
|
-
}
|
|
145
|
-
}
|
|
146
|
-
|
|
147
|
-
/// Validates a language code (ISO 639-1 or 639-3 format).
|
|
148
|
-
///
|
|
149
|
-
/// Accepts both 2-letter codes (e.g., "en", "de") and 3-letter codes (e.g., "eng", "deu").
|
|
150
|
-
///
|
|
151
|
-
/// # Arguments
|
|
152
|
-
///
|
|
153
|
-
/// * `code` - C string containing the language code
|
|
154
|
-
///
|
|
155
|
-
/// # Returns
|
|
156
|
-
///
|
|
157
|
-
/// - `1` if valid
|
|
158
|
-
/// - `0` if invalid (error message available via `kreuzberg_get_last_error_message()`)
|
|
159
|
-
///
|
|
160
|
-
/// # Safety
|
|
161
|
-
///
|
|
162
|
-
/// * `code` must be a valid pointer to a null-terminated UTF-8 string
|
|
163
|
-
/// * `code` cannot be NULL
|
|
164
|
-
/// * The string must be valid for the duration of the call
|
|
165
|
-
///
|
|
166
|
-
/// # C Signature
|
|
167
|
-
///
|
|
168
|
-
/// ```c
|
|
169
|
-
/// int32_t kreuzberg_validate_language_code(const char* code);
|
|
170
|
-
/// ```
|
|
171
|
-
#[unsafe(no_mangle)]
|
|
172
|
-
pub unsafe extern "C" fn kreuzberg_validate_language_code(code: *const c_char) -> i32 {
|
|
173
|
-
if code.is_null() {
|
|
174
|
-
set_last_error("code cannot be NULL".to_string());
|
|
175
|
-
return 0;
|
|
176
|
-
}
|
|
177
|
-
|
|
178
|
-
let code_str = match unsafe { CStr::from_ptr(code) }.to_str() {
|
|
179
|
-
Ok(s) => s,
|
|
180
|
-
Err(_) => {
|
|
181
|
-
set_last_error("Invalid UTF-8 in code".to_string());
|
|
182
|
-
return 0;
|
|
183
|
-
}
|
|
184
|
-
};
|
|
185
|
-
|
|
186
|
-
match validate_language_code(code_str) {
|
|
187
|
-
Ok(()) => 1,
|
|
188
|
-
Err(e) => {
|
|
189
|
-
set_last_error(e.to_string());
|
|
190
|
-
0
|
|
191
|
-
}
|
|
192
|
-
}
|
|
193
|
-
}
|
|
194
|
-
|
|
195
|
-
/// Validates a token reduction level string.
|
|
196
|
-
///
|
|
197
|
-
/// # Arguments
|
|
198
|
-
///
|
|
199
|
-
/// * `level` - C string containing the token reduction level (e.g., "off", "light", "moderate")
|
|
200
|
-
///
|
|
201
|
-
/// # Returns
|
|
202
|
-
///
|
|
203
|
-
/// - `1` if valid
|
|
204
|
-
/// - `0` if invalid (error message available via `kreuzberg_get_last_error_message()`)
|
|
205
|
-
///
|
|
206
|
-
/// # Safety
|
|
207
|
-
///
|
|
208
|
-
/// * `level` must be a valid pointer to a null-terminated UTF-8 string
|
|
209
|
-
/// * `level` cannot be NULL
|
|
210
|
-
/// * The string must be valid for the duration of the call
|
|
211
|
-
///
|
|
212
|
-
/// # C Signature
|
|
213
|
-
///
|
|
214
|
-
/// ```c
|
|
215
|
-
/// int32_t kreuzberg_validate_token_reduction_level(const char* level);
|
|
216
|
-
/// ```
|
|
217
|
-
#[unsafe(no_mangle)]
|
|
218
|
-
pub unsafe extern "C" fn kreuzberg_validate_token_reduction_level(level: *const c_char) -> i32 {
|
|
219
|
-
if level.is_null() {
|
|
220
|
-
set_last_error("level cannot be NULL".to_string());
|
|
221
|
-
return 0;
|
|
222
|
-
}
|
|
223
|
-
|
|
224
|
-
let level_str = match unsafe { CStr::from_ptr(level) }.to_str() {
|
|
225
|
-
Ok(s) => s,
|
|
226
|
-
Err(_) => {
|
|
227
|
-
set_last_error("Invalid UTF-8 in level".to_string());
|
|
228
|
-
return 0;
|
|
229
|
-
}
|
|
230
|
-
};
|
|
231
|
-
|
|
232
|
-
match validate_token_reduction_level(level_str) {
|
|
233
|
-
Ok(()) => 1,
|
|
234
|
-
Err(e) => {
|
|
235
|
-
set_last_error(e.to_string());
|
|
236
|
-
0
|
|
237
|
-
}
|
|
238
|
-
}
|
|
239
|
-
}
|
|
240
|
-
|
|
241
|
-
/// Validates a tesseract Page Segmentation Mode (PSM) value.
|
|
242
|
-
///
|
|
243
|
-
/// # Arguments
|
|
244
|
-
///
|
|
245
|
-
/// * `psm` - PSM value (valid range: 0-13)
|
|
246
|
-
///
|
|
247
|
-
/// # Returns
|
|
248
|
-
///
|
|
249
|
-
/// - `1` if valid
|
|
250
|
-
/// - `0` if invalid (error message available via `kreuzberg_get_last_error_message()`)
|
|
251
|
-
///
|
|
252
|
-
/// # C Signature
|
|
253
|
-
///
|
|
254
|
-
/// ```c
|
|
255
|
-
/// int32_t kreuzberg_validate_tesseract_psm(int32_t psm);
|
|
256
|
-
/// ```
|
|
257
|
-
#[unsafe(no_mangle)]
|
|
258
|
-
pub extern "C" fn kreuzberg_validate_tesseract_psm(psm: i32) -> i32 {
|
|
259
|
-
match validate_tesseract_psm(psm) {
|
|
260
|
-
Ok(()) => 1,
|
|
261
|
-
Err(e) => {
|
|
262
|
-
set_last_error(e.to_string());
|
|
263
|
-
0
|
|
264
|
-
}
|
|
265
|
-
}
|
|
266
|
-
}
|
|
267
|
-
|
|
268
|
-
/// Validates a tesseract OCR Engine Mode (OEM) value.
|
|
269
|
-
///
|
|
270
|
-
/// # Arguments
|
|
271
|
-
///
|
|
272
|
-
/// * `oem` - OEM value (valid range: 0-3)
|
|
273
|
-
///
|
|
274
|
-
/// # Returns
|
|
275
|
-
///
|
|
276
|
-
/// - `1` if valid
|
|
277
|
-
/// - `0` if invalid (error message available via `kreuzberg_get_last_error_message()`)
|
|
278
|
-
///
|
|
279
|
-
/// # C Signature
|
|
280
|
-
///
|
|
281
|
-
/// ```c
|
|
282
|
-
/// int32_t kreuzberg_validate_tesseract_oem(int32_t oem);
|
|
283
|
-
/// ```
|
|
284
|
-
#[unsafe(no_mangle)]
|
|
285
|
-
pub extern "C" fn kreuzberg_validate_tesseract_oem(oem: i32) -> i32 {
|
|
286
|
-
match validate_tesseract_oem(oem) {
|
|
287
|
-
Ok(()) => 1,
|
|
288
|
-
Err(e) => {
|
|
289
|
-
set_last_error(e.to_string());
|
|
290
|
-
0
|
|
291
|
-
}
|
|
292
|
-
}
|
|
293
|
-
}
|
|
294
|
-
|
|
295
|
-
/// Validates a tesseract output format string.
|
|
296
|
-
///
|
|
297
|
-
/// # Arguments
|
|
298
|
-
///
|
|
299
|
-
/// * `format` - C string containing the output format (e.g., "text", "markdown")
|
|
300
|
-
///
|
|
301
|
-
/// # Returns
|
|
302
|
-
///
|
|
303
|
-
/// - `1` if valid
|
|
304
|
-
/// - `0` if invalid (error message available via `kreuzberg_get_last_error_message()`)
|
|
305
|
-
///
|
|
306
|
-
/// # Safety
|
|
307
|
-
///
|
|
308
|
-
/// * `format` must be a valid pointer to a null-terminated UTF-8 string
|
|
309
|
-
/// * `format` cannot be NULL
|
|
310
|
-
/// * The string must be valid for the duration of the call
|
|
311
|
-
///
|
|
312
|
-
/// # C Signature
|
|
313
|
-
///
|
|
314
|
-
/// ```c
|
|
315
|
-
/// int32_t kreuzberg_validate_output_format(const char* format);
|
|
316
|
-
/// ```
|
|
317
|
-
#[unsafe(no_mangle)]
|
|
318
|
-
pub unsafe extern "C" fn kreuzberg_validate_output_format(format: *const c_char) -> i32 {
|
|
319
|
-
if format.is_null() {
|
|
320
|
-
set_last_error("format cannot be NULL".to_string());
|
|
321
|
-
return 0;
|
|
322
|
-
}
|
|
323
|
-
|
|
324
|
-
let format_str = match unsafe { CStr::from_ptr(format) }.to_str() {
|
|
325
|
-
Ok(s) => s,
|
|
326
|
-
Err(_) => {
|
|
327
|
-
set_last_error("Invalid UTF-8 in format".to_string());
|
|
328
|
-
return 0;
|
|
329
|
-
}
|
|
330
|
-
};
|
|
331
|
-
|
|
332
|
-
match validate_output_format(format_str) {
|
|
333
|
-
Ok(()) => 1,
|
|
334
|
-
Err(e) => {
|
|
335
|
-
set_last_error(e.to_string());
|
|
336
|
-
0
|
|
337
|
-
}
|
|
338
|
-
}
|
|
339
|
-
}
|
|
340
|
-
|
|
341
|
-
/// Validates a confidence threshold value.
|
|
342
|
-
///
|
|
343
|
-
/// Confidence thresholds must be between 0.0 and 1.0 inclusive.
|
|
344
|
-
///
|
|
345
|
-
/// # Arguments
|
|
346
|
-
///
|
|
347
|
-
/// * `confidence` - Confidence threshold value
|
|
348
|
-
///
|
|
349
|
-
/// # Returns
|
|
350
|
-
///
|
|
351
|
-
/// - `1` if valid
|
|
352
|
-
/// - `0` if invalid (error message available via `kreuzberg_get_last_error_message()`)
|
|
353
|
-
///
|
|
354
|
-
/// # C Signature
|
|
355
|
-
///
|
|
356
|
-
/// ```c
|
|
357
|
-
/// int32_t kreuzberg_validate_confidence(double confidence);
|
|
358
|
-
/// ```
|
|
359
|
-
#[unsafe(no_mangle)]
|
|
360
|
-
pub extern "C" fn kreuzberg_validate_confidence(confidence: f64) -> i32 {
|
|
361
|
-
match validate_confidence(confidence) {
|
|
362
|
-
Ok(()) => 1,
|
|
363
|
-
Err(e) => {
|
|
364
|
-
set_last_error(e.to_string());
|
|
365
|
-
0
|
|
366
|
-
}
|
|
367
|
-
}
|
|
368
|
-
}
|
|
369
|
-
|
|
370
|
-
/// Validates a DPI (dots per inch) value.
|
|
371
|
-
///
|
|
372
|
-
/// DPI must be a positive integer, typically 72-600.
|
|
373
|
-
///
|
|
374
|
-
/// # Arguments
|
|
375
|
-
///
|
|
376
|
-
/// * `dpi` - DPI value
|
|
377
|
-
///
|
|
378
|
-
/// # Returns
|
|
379
|
-
///
|
|
380
|
-
/// - `1` if valid
|
|
381
|
-
/// - `0` if invalid (error message available via `kreuzberg_get_last_error_message()`)
|
|
382
|
-
///
|
|
383
|
-
/// # C Signature
|
|
384
|
-
///
|
|
385
|
-
/// ```c
|
|
386
|
-
/// int32_t kreuzberg_validate_dpi(int32_t dpi);
|
|
387
|
-
/// ```
|
|
388
|
-
#[unsafe(no_mangle)]
|
|
389
|
-
pub extern "C" fn kreuzberg_validate_dpi(dpi: i32) -> i32 {
|
|
390
|
-
match validate_dpi(dpi) {
|
|
391
|
-
Ok(()) => 1,
|
|
392
|
-
Err(e) => {
|
|
393
|
-
set_last_error(e.to_string());
|
|
394
|
-
0
|
|
395
|
-
}
|
|
396
|
-
}
|
|
397
|
-
}
|
|
398
|
-
|
|
399
|
-
/// Validates chunking parameters.
|
|
400
|
-
///
|
|
401
|
-
/// Checks that `max_chars > 0` and `max_overlap < max_chars`.
|
|
402
|
-
///
|
|
403
|
-
/// # Arguments
|
|
404
|
-
///
|
|
405
|
-
/// * `max_chars` - Maximum characters per chunk
|
|
406
|
-
/// * `max_overlap` - Maximum overlap between chunks
|
|
407
|
-
///
|
|
408
|
-
/// # Returns
|
|
409
|
-
///
|
|
410
|
-
/// - `1` if valid
|
|
411
|
-
/// - `0` if invalid (error message available via `kreuzberg_get_last_error_message()`)
|
|
412
|
-
///
|
|
413
|
-
/// # C Signature
|
|
414
|
-
///
|
|
415
|
-
/// ```c
|
|
416
|
-
/// int32_t kreuzberg_validate_chunking_params(size_t max_chars, size_t max_overlap);
|
|
417
|
-
/// ```
|
|
418
|
-
#[unsafe(no_mangle)]
|
|
419
|
-
pub extern "C" fn kreuzberg_validate_chunking_params(max_chars: usize, max_overlap: usize) -> i32 {
|
|
420
|
-
match validate_chunking_params(max_chars, max_overlap) {
|
|
421
|
-
Ok(()) => 1,
|
|
422
|
-
Err(e) => {
|
|
423
|
-
set_last_error(e.to_string());
|
|
424
|
-
0
|
|
425
|
-
}
|
|
426
|
-
}
|
|
427
|
-
}
|
|
428
|
-
|
|
429
|
-
/// Returns valid binarization methods as a JSON array string.
|
|
430
|
-
///
|
|
431
|
-
/// The returned string MUST be freed by the caller using `kreuzberg_free_string()`.
|
|
432
|
-
///
|
|
433
|
-
/// # Returns
|
|
434
|
-
///
|
|
435
|
-
/// A pointer to a dynamically allocated C string containing a JSON array of valid methods.
|
|
436
|
-
/// Returns NULL if memory allocation fails (error message set via `set_last_error()`).
|
|
437
|
-
///
|
|
438
|
-
/// # Example
|
|
439
|
-
///
|
|
440
|
-
/// The returned JSON string looks like: `["otsu","adaptive","sauvola"]`
|
|
441
|
-
///
|
|
442
|
-
/// # C Signature
|
|
443
|
-
///
|
|
444
|
-
/// ```c
|
|
445
|
-
/// char* kreuzberg_get_valid_binarization_methods(void);
|
|
446
|
-
/// ```
|
|
447
|
-
#[unsafe(no_mangle)]
|
|
448
|
-
pub extern "C" fn kreuzberg_get_valid_binarization_methods() -> *mut c_char {
|
|
449
|
-
let json = format!(
|
|
450
|
-
"[{}]",
|
|
451
|
-
VALID_BINARIZATION_METHODS
|
|
452
|
-
.iter()
|
|
453
|
-
.map(|m| format!("\"{}\"", m))
|
|
454
|
-
.collect::<Vec<_>>()
|
|
455
|
-
.join(",")
|
|
456
|
-
);
|
|
457
|
-
|
|
458
|
-
match std::ffi::CString::new(json) {
|
|
459
|
-
Ok(c_str) => c_str.into_raw(),
|
|
460
|
-
Err(e) => {
|
|
461
|
-
set_last_error(format!("Failed to allocate string: {}", e));
|
|
462
|
-
std::ptr::null_mut()
|
|
463
|
-
}
|
|
464
|
-
}
|
|
465
|
-
}
|
|
466
|
-
|
|
467
|
-
/// Returns valid language codes as a JSON array string.
|
|
468
|
-
///
|
|
469
|
-
/// The returned string MUST be freed by the caller using `kreuzberg_free_string()`.
|
|
470
|
-
///
|
|
471
|
-
/// # Returns
|
|
472
|
-
///
|
|
473
|
-
/// A pointer to a dynamically allocated C string containing a JSON array of valid codes.
|
|
474
|
-
/// Returns NULL if memory allocation fails (error message set via `set_last_error()`).
|
|
475
|
-
///
|
|
476
|
-
/// # C Signature
|
|
477
|
-
///
|
|
478
|
-
/// ```c
|
|
479
|
-
/// char* kreuzberg_get_valid_language_codes(void);
|
|
480
|
-
/// ```
|
|
481
|
-
#[unsafe(no_mangle)]
|
|
482
|
-
pub extern "C" fn kreuzberg_get_valid_language_codes() -> *mut c_char {
|
|
483
|
-
let json = format!(
|
|
484
|
-
"[{}]",
|
|
485
|
-
VALID_LANGUAGE_CODES
|
|
486
|
-
.iter()
|
|
487
|
-
.map(|c| format!("\"{}\"", c))
|
|
488
|
-
.collect::<Vec<_>>()
|
|
489
|
-
.join(",")
|
|
490
|
-
);
|
|
491
|
-
|
|
492
|
-
match std::ffi::CString::new(json) {
|
|
493
|
-
Ok(c_str) => c_str.into_raw(),
|
|
494
|
-
Err(e) => {
|
|
495
|
-
set_last_error(format!("Failed to allocate string: {}", e));
|
|
496
|
-
std::ptr::null_mut()
|
|
497
|
-
}
|
|
498
|
-
}
|
|
499
|
-
}
|
|
500
|
-
|
|
501
|
-
/// Returns valid OCR backends as a JSON array string.
|
|
502
|
-
///
|
|
503
|
-
/// The returned string MUST be freed by the caller using `kreuzberg_free_string()`.
|
|
504
|
-
///
|
|
505
|
-
/// # Returns
|
|
506
|
-
///
|
|
507
|
-
/// A pointer to a dynamically allocated C string containing a JSON array of valid backends.
|
|
508
|
-
/// Returns NULL if memory allocation fails (error message set via `set_last_error()`).
|
|
509
|
-
///
|
|
510
|
-
/// # C Signature
|
|
511
|
-
///
|
|
512
|
-
/// ```c
|
|
513
|
-
/// char* kreuzberg_get_valid_ocr_backends(void);
|
|
514
|
-
/// ```
|
|
515
|
-
#[unsafe(no_mangle)]
|
|
516
|
-
pub extern "C" fn kreuzberg_get_valid_ocr_backends() -> *mut c_char {
|
|
517
|
-
let json = format!(
|
|
518
|
-
"[{}]",
|
|
519
|
-
VALID_OCR_BACKENDS
|
|
520
|
-
.iter()
|
|
521
|
-
.map(|b| format!("\"{}\"", b))
|
|
522
|
-
.collect::<Vec<_>>()
|
|
523
|
-
.join(",")
|
|
524
|
-
);
|
|
525
|
-
|
|
526
|
-
match std::ffi::CString::new(json) {
|
|
527
|
-
Ok(c_str) => c_str.into_raw(),
|
|
528
|
-
Err(e) => {
|
|
529
|
-
set_last_error(format!("Failed to allocate string: {}", e));
|
|
530
|
-
std::ptr::null_mut()
|
|
531
|
-
}
|
|
532
|
-
}
|
|
533
|
-
}
|
|
534
|
-
|
|
535
|
-
/// Returns valid token reduction levels as a JSON array string.
|
|
536
|
-
///
|
|
537
|
-
/// The returned string MUST be freed by the caller using `kreuzberg_free_string()`.
|
|
538
|
-
///
|
|
539
|
-
/// # Returns
|
|
540
|
-
///
|
|
541
|
-
/// A pointer to a dynamically allocated C string containing a JSON array of valid levels.
|
|
542
|
-
/// Returns NULL if memory allocation fails (error message set via `set_last_error()`).
|
|
543
|
-
///
|
|
544
|
-
/// # C Signature
|
|
545
|
-
///
|
|
546
|
-
/// ```c
|
|
547
|
-
/// char* kreuzberg_get_valid_token_reduction_levels(void);
|
|
548
|
-
/// ```
|
|
549
|
-
#[unsafe(no_mangle)]
|
|
550
|
-
pub extern "C" fn kreuzberg_get_valid_token_reduction_levels() -> *mut c_char {
|
|
551
|
-
let json = format!(
|
|
552
|
-
"[{}]",
|
|
553
|
-
VALID_TOKEN_REDUCTION_LEVELS
|
|
554
|
-
.iter()
|
|
555
|
-
.map(|l| format!("\"{}\"", l))
|
|
556
|
-
.collect::<Vec<_>>()
|
|
557
|
-
.join(",")
|
|
558
|
-
);
|
|
559
|
-
|
|
560
|
-
match std::ffi::CString::new(json) {
|
|
561
|
-
Ok(c_str) => c_str.into_raw(),
|
|
562
|
-
Err(e) => {
|
|
563
|
-
set_last_error(format!("Failed to allocate string: {}", e));
|
|
564
|
-
std::ptr::null_mut()
|
|
565
|
-
}
|
|
566
|
-
}
|
|
567
|
-
}
|
|
568
|
-
|
|
569
|
-
/// Unit tests for the C-ABI validation wrappers and the `kreuzberg_get_valid_*`
/// JSON listing functions.
#[cfg(test)]
mod tests {
    use super::*;

    /// Takes ownership of a string returned by a `kreuzberg_get_valid_*` function
    /// and returns its contents, releasing the native allocation in the process.
    ///
    /// Reclaiming the allocation before any content assertion runs means a failing
    /// assertion can no longer leak the string.
    ///
    /// Panics if the pointer is NULL or the text is not valid UTF-8.
    fn take_json(json_ptr: *mut c_char) -> String {
        assert!(!json_ptr.is_null(), "Should return non-null pointer");

        // SAFETY: the `kreuzberg_get_valid_*` functions create their non-null result
        // with `CString::into_raw`, so the pointer is a valid NUL-terminated string
        // that is reclaimed exactly once here.
        let owned = unsafe { std::ffi::CString::from_raw(json_ptr) };
        owned.into_string().expect("Should be valid UTF-8")
    }

    /// Asserts that `json` is bracketed like a JSON array and holds every `expected`
    /// entry as a complete quoted element.
    ///
    /// Matching the quoted form avoids false positives from substrings, e.g. `en`
    /// being satisfied by `eng`.
    fn assert_json_array_contains(json: &str, expected: &[&str]) {
        assert!(
            json.starts_with('[') && json.ends_with(']'),
            "not a JSON array: {json}"
        );
        for entry in expected {
            let quoted = format!("\"{entry}\"");
            assert!(json.contains(&quoted), "missing {quoted} in {json}");
        }
    }

    #[test]
    fn test_validate_binarization_method_valid() {
        for method in [c"otsu", c"adaptive", c"sauvola"] {
            // SAFETY: `method` is a NUL-terminated literal that outlives the call.
            let verdict = unsafe { kreuzberg_validate_binarization_method(method.as_ptr()) };
            assert_eq!(verdict, 1, "{method:?} should be valid");
        }
    }

    #[test]
    fn test_validate_binarization_method_invalid() {
        // SAFETY: the argument is a NUL-terminated literal that outlives the call.
        let verdict = unsafe { kreuzberg_validate_binarization_method(c"invalid".as_ptr()) };
        assert_eq!(verdict, 0);
    }

    #[test]
    fn test_validate_binarization_method_null() {
        // SAFETY: this test expects the function to reject NULL without dereferencing it.
        let verdict = unsafe { kreuzberg_validate_binarization_method(std::ptr::null()) };
        assert_eq!(verdict, 0);
    }

    #[test]
    fn test_validate_ocr_backend_valid() {
        for backend in [c"tesseract", c"easyocr", c"paddleocr"] {
            // SAFETY: `backend` is a NUL-terminated literal that outlives the call.
            let verdict = unsafe { kreuzberg_validate_ocr_backend(backend.as_ptr()) };
            assert_eq!(verdict, 1, "{backend:?} should be valid");
        }
    }

    #[test]
    fn test_validate_ocr_backend_invalid() {
        // SAFETY: the argument is a NUL-terminated literal that outlives the call.
        let verdict = unsafe { kreuzberg_validate_ocr_backend(c"invalid_backend".as_ptr()) };
        assert_eq!(verdict, 0);
    }

    #[test]
    fn test_validate_ocr_backend_null() {
        // SAFETY: this test expects the function to reject NULL without dereferencing it.
        let verdict = unsafe { kreuzberg_validate_ocr_backend(std::ptr::null()) };
        assert_eq!(verdict, 0);
    }

    #[test]
    fn test_validate_language_code_valid_2letter() {
        for code in [c"en", c"de", c"fr"] {
            // SAFETY: `code` is a NUL-terminated literal that outlives the call.
            let verdict = unsafe { kreuzberg_validate_language_code(code.as_ptr()) };
            assert_eq!(verdict, 1, "{code:?} should be valid");
        }
    }

    #[test]
    fn test_validate_language_code_valid_3letter() {
        for code in [c"eng", c"deu", c"fra"] {
            // SAFETY: `code` is a NUL-terminated literal that outlives the call.
            let verdict = unsafe { kreuzberg_validate_language_code(code.as_ptr()) };
            assert_eq!(verdict, 1, "{code:?} should be valid");
        }
    }

    #[test]
    fn test_validate_language_code_invalid() {
        for code in [c"invalid", c"xx"] {
            // SAFETY: `code` is a NUL-terminated literal that outlives the call.
            let verdict = unsafe { kreuzberg_validate_language_code(code.as_ptr()) };
            assert_eq!(verdict, 0, "{code:?} should be invalid");
        }
    }

    #[test]
    fn test_validate_language_code_null() {
        // SAFETY: this test expects the function to reject NULL without dereferencing it.
        let verdict = unsafe { kreuzberg_validate_language_code(std::ptr::null()) };
        assert_eq!(verdict, 0);
    }

    #[test]
    fn test_validate_token_reduction_level_valid() {
        for level in [c"off", c"light", c"moderate", c"aggressive", c"maximum"] {
            // SAFETY: `level` is a NUL-terminated literal that outlives the call.
            let verdict = unsafe { kreuzberg_validate_token_reduction_level(level.as_ptr()) };
            assert_eq!(verdict, 1, "{level:?} should be valid");
        }
    }

    #[test]
    fn test_validate_token_reduction_level_invalid() {
        // SAFETY: the argument is a NUL-terminated literal that outlives the call.
        let verdict = unsafe { kreuzberg_validate_token_reduction_level(c"extreme".as_ptr()) };
        assert_eq!(verdict, 0);
    }

    #[test]
    fn test_validate_token_reduction_level_null() {
        // SAFETY: this test expects the function to reject NULL without dereferencing it.
        let verdict = unsafe { kreuzberg_validate_token_reduction_level(std::ptr::null()) };
        assert_eq!(verdict, 0);
    }

    #[test]
    fn test_validate_tesseract_psm_valid() {
        for psm in 0..=13 {
            assert_eq!(kreuzberg_validate_tesseract_psm(psm), 1, "PSM {} should be valid", psm);
        }
    }

    #[test]
    fn test_validate_tesseract_psm_invalid() {
        for psm in [-1, 14, 100] {
            assert_eq!(kreuzberg_validate_tesseract_psm(psm), 0, "PSM {psm} should be invalid");
        }
    }

    #[test]
    fn test_validate_tesseract_oem_valid() {
        for oem in 0..=3 {
            assert_eq!(kreuzberg_validate_tesseract_oem(oem), 1, "OEM {} should be valid", oem);
        }
    }

    #[test]
    fn test_validate_tesseract_oem_invalid() {
        for oem in [-1, 4, 10] {
            assert_eq!(kreuzberg_validate_tesseract_oem(oem), 0, "OEM {oem} should be invalid");
        }
    }

    #[test]
    fn test_validate_output_format_valid() {
        for format in [c"text", c"markdown"] {
            // SAFETY: `format` is a NUL-terminated literal that outlives the call.
            let verdict = unsafe { kreuzberg_validate_output_format(format.as_ptr()) };
            assert_eq!(verdict, 1, "{format:?} should be valid");
        }
    }

    #[test]
    fn test_validate_output_format_invalid() {
        // SAFETY: the argument is a NUL-terminated literal that outlives the call.
        let verdict = unsafe { kreuzberg_validate_output_format(c"json".as_ptr()) };
        assert_eq!(verdict, 0);
    }

    #[test]
    fn test_validate_output_format_null() {
        // SAFETY: this test expects the function to reject NULL without dereferencing it.
        let verdict = unsafe { kreuzberg_validate_output_format(std::ptr::null()) };
        assert_eq!(verdict, 0);
    }

    #[test]
    fn test_validate_confidence_valid() {
        for confidence in [0.0, 0.5, 1.0] {
            assert_eq!(
                kreuzberg_validate_confidence(confidence),
                1,
                "confidence {confidence} should be valid"
            );
        }
    }

    #[test]
    fn test_validate_confidence_invalid() {
        for confidence in [-0.1, 1.1, 2.0] {
            assert_eq!(
                kreuzberg_validate_confidence(confidence),
                0,
                "confidence {confidence} should be invalid"
            );
        }
    }

    #[test]
    fn test_validate_dpi_valid() {
        for dpi in [72, 96, 300, 600] {
            assert_eq!(kreuzberg_validate_dpi(dpi), 1, "DPI {dpi} should be valid");
        }
    }

    #[test]
    fn test_validate_dpi_invalid() {
        for dpi in [0, -1, 2401] {
            assert_eq!(kreuzberg_validate_dpi(dpi), 0, "DPI {dpi} should be invalid");
        }
    }

    #[test]
    fn test_validate_chunking_params_valid() {
        for (max_chars, max_overlap) in [(1000, 200), (500, 50), (1, 0)] {
            assert_eq!(
                kreuzberg_validate_chunking_params(max_chars, max_overlap),
                1,
                "({max_chars}, {max_overlap}) should be valid"
            );
        }
    }

    #[test]
    fn test_validate_chunking_params_invalid_zero_chars() {
        assert_eq!(kreuzberg_validate_chunking_params(0, 100), 0);
    }

    #[test]
    fn test_validate_chunking_params_invalid_overlap() {
        // Overlap equal to, and larger than, the chunk size must both be rejected.
        for (max_chars, max_overlap) in [(100, 100), (100, 150)] {
            assert_eq!(
                kreuzberg_validate_chunking_params(max_chars, max_overlap),
                0,
                "({max_chars}, {max_overlap}) should be invalid"
            );
        }
    }

    #[test]
    fn test_get_valid_binarization_methods() {
        let json = take_json(kreuzberg_get_valid_binarization_methods());
        assert_json_array_contains(&json, &["otsu", "adaptive", "sauvola"]);
    }

    #[test]
    fn test_get_valid_language_codes() {
        let json = take_json(kreuzberg_get_valid_language_codes());
        assert_json_array_contains(&json, &["en", "de", "eng", "deu"]);
    }

    #[test]
    fn test_get_valid_ocr_backends() {
        let json = take_json(kreuzberg_get_valid_ocr_backends());
        assert_json_array_contains(&json, &["tesseract", "easyocr", "paddleocr"]);
    }

    #[test]
    fn test_get_valid_token_reduction_levels() {
        let json = take_json(kreuzberg_get_valid_token_reduction_levels());
        assert_json_array_contains(&json, &["off", "light", "moderate", "aggressive", "maximum"]);
    }
}
|