kreuzberg 4.1.1 → 4.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +4 -4
- data/README.md +8 -5
- data/ext/kreuzberg_rb/native/Cargo.toml +2 -2
- data/ext/kreuzberg_rb/native/libpdfium.so +0 -0
- data/ext/kreuzberg_rb/native/src/config/types.rs +23 -13
- data/kreuzberg.gemspec +14 -2
- data/lib/kreuzberg/api_proxy.rb +0 -1
- data/lib/kreuzberg/cli_proxy.rb +0 -1
- data/lib/kreuzberg/config.rb +70 -35
- data/lib/kreuzberg/mcp_proxy.rb +0 -1
- data/lib/kreuzberg/version.rb +1 -1
- data/sig/kreuzberg.rbs +5 -1
- data/spec/binding/batch_operations_spec.rb +80 -0
- data/spec/binding/metadata_types_spec.rb +77 -57
- data/spec/serialization_spec.rb +134 -0
- data/spec/unit/config/output_format_spec.rb +380 -0
- data/vendor/Cargo.toml +1 -1
- data/vendor/kreuzberg/Cargo.toml +3 -3
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/embeddings.rs +4 -4
- data/vendor/kreuzberg/src/mcp/format.rs +237 -39
- data/vendor/kreuzberg/src/mcp/params.rs +26 -33
- data/vendor/kreuzberg/src/mcp/server.rs +6 -3
- data/vendor/kreuzberg/src/mcp/tools/extraction.rs +16 -23
- data/vendor/kreuzberg/tests/api_chunk.rs +40 -30
- data/vendor/kreuzberg/tests/api_consistency.rs +349 -0
- data/vendor/kreuzberg/tests/api_embed.rs +84 -50
- data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +8 -2
- data/vendor/kreuzberg/tests/api_tests.rs +298 -139
- data/vendor/kreuzberg/tests/archive_integration.rs +63 -56
- data/vendor/kreuzberg/tests/batch_orchestration.rs +22 -14
- data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +13 -13
- data/vendor/kreuzberg/tests/batch_processing.rs +13 -9
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +10 -10
- data/vendor/kreuzberg/tests/concurrency_stress.rs +10 -6
- data/vendor/kreuzberg/tests/config_behavioral.rs +414 -0
- data/vendor/kreuzberg/tests/config_features.rs +19 -15
- data/vendor/kreuzberg/tests/config_integration_test.rs +68 -68
- data/vendor/kreuzberg/tests/config_loading_tests.rs +71 -62
- data/vendor/kreuzberg/tests/contract_mcp.rs +314 -0
- data/vendor/kreuzberg/tests/core_integration.rs +55 -53
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +23 -23
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +15 -14
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +4 -4
- data/vendor/kreuzberg/tests/email_integration.rs +7 -7
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/error_handling.rs +13 -11
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
- data/vendor/kreuzberg/tests/instrumentation_test.rs +18 -13
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +17 -17
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +9 -9
- data/vendor/kreuzberg/tests/keywords_integration.rs +25 -25
- data/vendor/kreuzberg/tests/keywords_quality.rs +9 -9
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +1 -1
- data/vendor/kreuzberg/tests/mcp_integration.rs +849 -0
- data/vendor/kreuzberg/tests/mime_detection.rs +72 -41
- data/vendor/kreuzberg/tests/ocr_errors.rs +10 -4
- data/vendor/kreuzberg/tests/ocr_language_registry.rs +1 -1
- data/vendor/kreuzberg/tests/ocr_stress.rs +3 -3
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +6 -6
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/page_markers.rs +1 -1
- data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +6 -6
- data/vendor/kreuzberg/tests/pdf_text_merging.rs +2 -2
- data/vendor/kreuzberg/tests/pipeline_integration.rs +77 -61
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +97 -77
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +78 -61
- data/vendor/kreuzberg/tests/plugin_system.rs +49 -46
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +109 -97
- data/vendor/kreuzberg/tests/pptx_regression_tests.rs +40 -30
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +26 -23
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +1 -1
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +18 -18
- data/vendor/kreuzberg/tests/security_validation.rs +20 -19
- data/vendor/kreuzberg/tests/serialization_integration.rs +112 -0
- data/vendor/kreuzberg/tests/stopwords_integration_test.rs +36 -36
- data/vendor/kreuzberg/tests/test_fastembed.rs +8 -8
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +9 -9
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +12 -9
- data/vendor/kreuzberg-tesseract/Cargo.toml +3 -3
- data/vendor/kreuzberg-tesseract/build.rs +4 -4
- data/vendor/kreuzberg-tesseract/src/lib.rs +6 -6
- data/vendor/kreuzberg-tesseract/tests/integration_test.rs +3 -3
- metadata +13 -2
|
@@ -6,16 +6,16 @@
|
|
|
6
6
|
#![allow(clippy::not_unsafe_ptr_arg_deref)]
|
|
7
7
|
#![allow(clippy::cmp_null)]
|
|
8
8
|
|
|
9
|
-
//! # tesseract
|
|
9
|
+
//! # kreuzberg-tesseract
|
|
10
10
|
//!
|
|
11
|
-
//! `tesseract
|
|
11
|
+
//! `kreuzberg-tesseract` provides safe Rust bindings for Tesseract OCR with built-in compilation
|
|
12
12
|
//! of Tesseract and Leptonica libraries. This crate aims to make OCR functionality
|
|
13
13
|
//! easily accessible in Rust projects while handling the complexity of interfacing
|
|
14
14
|
//! with the underlying C++ libraries.
|
|
15
15
|
//!
|
|
16
16
|
//! ## Usage
|
|
17
17
|
//!
|
|
18
|
-
//! Here's a basic example of how to use `tesseract
|
|
18
|
+
//! Here's a basic example of how to use `kreuzberg-tesseract`:
|
|
19
19
|
//!
|
|
20
20
|
//! ```rust
|
|
21
21
|
//! use std::path::PathBuf;
|
|
@@ -28,16 +28,16 @@
|
|
|
28
28
|
//! PathBuf::from(home_dir)
|
|
29
29
|
//! .join("Library")
|
|
30
30
|
//! .join("Application Support")
|
|
31
|
-
//! .join("tesseract
|
|
31
|
+
//! .join("kreuzberg-tesseract")
|
|
32
32
|
//! .join("tessdata")
|
|
33
33
|
//! } else if cfg!(target_os = "linux") {
|
|
34
34
|
//! let home_dir = std::env::var("HOME").expect("HOME environment variable not set");
|
|
35
35
|
//! PathBuf::from(home_dir)
|
|
36
|
-
//! .join(".tesseract
|
|
36
|
+
//! .join(".kreuzberg-tesseract")
|
|
37
37
|
//! .join("tessdata")
|
|
38
38
|
//! } else if cfg!(target_os = "windows") {
|
|
39
39
|
//! PathBuf::from(std::env::var("APPDATA").expect("APPDATA environment variable not set"))
|
|
40
|
-
//! .join("tesseract
|
|
40
|
+
//! .join("kreuzberg-tesseract")
|
|
41
41
|
//! .join("tessdata")
|
|
42
42
|
//! } else {
|
|
43
43
|
//! panic!("Unsupported operating system");
|
|
@@ -7,7 +7,7 @@ fn get_default_tessdata_dir() -> PathBuf {
|
|
|
7
7
|
PathBuf::from(home_dir)
|
|
8
8
|
.join("Library")
|
|
9
9
|
.join("Application Support")
|
|
10
|
-
.join("tesseract
|
|
10
|
+
.join("kreuzberg-tesseract")
|
|
11
11
|
.join("tessdata")
|
|
12
12
|
} else if cfg!(target_os = "linux") {
|
|
13
13
|
let system_paths = [
|
|
@@ -20,10 +20,10 @@ fn get_default_tessdata_dir() -> PathBuf {
|
|
|
20
20
|
}
|
|
21
21
|
}
|
|
22
22
|
let home_dir = std::env::var("HOME").expect("HOME environment variable not set");
|
|
23
|
-
PathBuf::from(home_dir).join(".tesseract
|
|
23
|
+
PathBuf::from(home_dir).join(".kreuzberg-tesseract").join("tessdata")
|
|
24
24
|
} else if cfg!(target_os = "windows") {
|
|
25
25
|
PathBuf::from(std::env::var("APPDATA").expect("APPDATA environment variable not set"))
|
|
26
|
-
.join("tesseract
|
|
26
|
+
.join("kreuzberg-tesseract")
|
|
27
27
|
.join("tessdata")
|
|
28
28
|
} else {
|
|
29
29
|
panic!("Unsupported operating system");
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: kreuzberg
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 4.
|
|
4
|
+
version: 4.2.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Na'aman Hirschfeld
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-01-
|
|
11
|
+
date: 2026-01-26 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: bundler
|
|
@@ -209,6 +209,7 @@ files:
|
|
|
209
209
|
- ext/kreuzberg_rb/native/include/msvc_compat/strings.h
|
|
210
210
|
- ext/kreuzberg_rb/native/include/strings.h
|
|
211
211
|
- ext/kreuzberg_rb/native/include/unistd.h
|
|
212
|
+
- ext/kreuzberg_rb/native/libpdfium.so
|
|
212
213
|
- ext/kreuzberg_rb/native/src/batch.rs
|
|
213
214
|
- ext/kreuzberg_rb/native/src/config/mod.rs
|
|
214
215
|
- ext/kreuzberg_rb/native/src/config/types.rs
|
|
@@ -271,6 +272,7 @@ files:
|
|
|
271
272
|
- spec/fixtures/config.toml
|
|
272
273
|
- spec/fixtures/config.yaml
|
|
273
274
|
- spec/fixtures/invalid_config.toml
|
|
275
|
+
- spec/serialization_spec.rb
|
|
274
276
|
- spec/smoke/package_spec.rb
|
|
275
277
|
- spec/spec_helper.rb
|
|
276
278
|
- spec/unit/config/chunking_config_spec.rb
|
|
@@ -283,6 +285,7 @@ files:
|
|
|
283
285
|
- spec/unit/config/keyword_config_spec.rb
|
|
284
286
|
- spec/unit/config/language_detection_config_spec.rb
|
|
285
287
|
- spec/unit/config/ocr_config_spec.rb
|
|
288
|
+
- spec/unit/config/output_format_spec.rb
|
|
286
289
|
- spec/unit/config/page_config_spec.rb
|
|
287
290
|
- spec/unit/config/pdf_config_spec.rb
|
|
288
291
|
- spec/unit/config/postprocessor_config_spec.rb
|
|
@@ -705,6 +708,7 @@ files:
|
|
|
705
708
|
- vendor/kreuzberg/stopwords/zh_stopwords.json
|
|
706
709
|
- vendor/kreuzberg/stopwords/zu_stopwords.json
|
|
707
710
|
- vendor/kreuzberg/tests/api_chunk.rs
|
|
711
|
+
- vendor/kreuzberg/tests/api_consistency.rs
|
|
708
712
|
- vendor/kreuzberg/tests/api_embed.rs
|
|
709
713
|
- vendor/kreuzberg/tests/api_extract_multipart.rs
|
|
710
714
|
- vendor/kreuzberg/tests/api_large_pdf_extraction.rs
|
|
@@ -716,9 +720,11 @@ files:
|
|
|
716
720
|
- vendor/kreuzberg/tests/batch_processing.rs
|
|
717
721
|
- vendor/kreuzberg/tests/bibtex_parity_test.rs
|
|
718
722
|
- vendor/kreuzberg/tests/concurrency_stress.rs
|
|
723
|
+
- vendor/kreuzberg/tests/config_behavioral.rs
|
|
719
724
|
- vendor/kreuzberg/tests/config_features.rs
|
|
720
725
|
- vendor/kreuzberg/tests/config_integration_test.rs
|
|
721
726
|
- vendor/kreuzberg/tests/config_loading_tests.rs
|
|
727
|
+
- vendor/kreuzberg/tests/contract_mcp.rs
|
|
722
728
|
- vendor/kreuzberg/tests/core_integration.rs
|
|
723
729
|
- vendor/kreuzberg/tests/csv_integration.rs
|
|
724
730
|
- vendor/kreuzberg/tests/data/hierarchy_ground_truth.json
|
|
@@ -740,6 +746,7 @@ files:
|
|
|
740
746
|
- vendor/kreuzberg/tests/keywords_quality.rs
|
|
741
747
|
- vendor/kreuzberg/tests/latex_extractor_tests.rs
|
|
742
748
|
- vendor/kreuzberg/tests/markdown_extractor_tests.rs
|
|
749
|
+
- vendor/kreuzberg/tests/mcp_integration.rs
|
|
743
750
|
- vendor/kreuzberg/tests/mime_detection.rs
|
|
744
751
|
- vendor/kreuzberg/tests/ocr_configuration.rs
|
|
745
752
|
- vendor/kreuzberg/tests/ocr_errors.rs
|
|
@@ -766,6 +773,7 @@ files:
|
|
|
766
773
|
- vendor/kreuzberg/tests/rst_extractor_tests.rs
|
|
767
774
|
- vendor/kreuzberg/tests/rtf_extractor_tests.rs
|
|
768
775
|
- vendor/kreuzberg/tests/security_validation.rs
|
|
776
|
+
- vendor/kreuzberg/tests/serialization_integration.rs
|
|
769
777
|
- vendor/kreuzberg/tests/stopwords_integration_test.rs
|
|
770
778
|
- vendor/kreuzberg/tests/test_fastembed.rs
|
|
771
779
|
- vendor/kreuzberg/tests/typst_behavioral_tests.rs
|
|
@@ -791,6 +799,9 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
|
791
799
|
- - ">="
|
|
792
800
|
- !ruby/object:Gem::Version
|
|
793
801
|
version: 3.2.0
|
|
802
|
+
- - "<"
|
|
803
|
+
- !ruby/object:Gem::Version
|
|
804
|
+
version: '5.0'
|
|
794
805
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
795
806
|
requirements:
|
|
796
807
|
- - ">="
|