kreuzberg 4.1.1 → 4.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88)
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +4 -4
  3. data/README.md +8 -5
  4. data/ext/kreuzberg_rb/native/Cargo.toml +2 -2
  5. data/ext/kreuzberg_rb/native/libpdfium.so +0 -0
  6. data/ext/kreuzberg_rb/native/src/config/types.rs +23 -13
  7. data/kreuzberg.gemspec +14 -2
  8. data/lib/kreuzberg/api_proxy.rb +0 -1
  9. data/lib/kreuzberg/cli_proxy.rb +0 -1
  10. data/lib/kreuzberg/config.rb +70 -35
  11. data/lib/kreuzberg/mcp_proxy.rb +0 -1
  12. data/lib/kreuzberg/version.rb +1 -1
  13. data/sig/kreuzberg.rbs +5 -1
  14. data/spec/binding/batch_operations_spec.rb +80 -0
  15. data/spec/binding/metadata_types_spec.rb +77 -57
  16. data/spec/serialization_spec.rb +134 -0
  17. data/spec/unit/config/output_format_spec.rb +380 -0
  18. data/vendor/Cargo.toml +1 -1
  19. data/vendor/kreuzberg/Cargo.toml +3 -3
  20. data/vendor/kreuzberg/README.md +1 -1
  21. data/vendor/kreuzberg/src/embeddings.rs +4 -4
  22. data/vendor/kreuzberg/src/mcp/format.rs +237 -39
  23. data/vendor/kreuzberg/src/mcp/params.rs +26 -33
  24. data/vendor/kreuzberg/src/mcp/server.rs +6 -3
  25. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +16 -23
  26. data/vendor/kreuzberg/tests/api_chunk.rs +40 -30
  27. data/vendor/kreuzberg/tests/api_consistency.rs +349 -0
  28. data/vendor/kreuzberg/tests/api_embed.rs +84 -50
  29. data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +8 -2
  30. data/vendor/kreuzberg/tests/api_tests.rs +298 -139
  31. data/vendor/kreuzberg/tests/archive_integration.rs +63 -56
  32. data/vendor/kreuzberg/tests/batch_orchestration.rs +22 -14
  33. data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +13 -13
  34. data/vendor/kreuzberg/tests/batch_processing.rs +13 -9
  35. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +10 -10
  36. data/vendor/kreuzberg/tests/concurrency_stress.rs +10 -6
  37. data/vendor/kreuzberg/tests/config_behavioral.rs +414 -0
  38. data/vendor/kreuzberg/tests/config_features.rs +19 -15
  39. data/vendor/kreuzberg/tests/config_integration_test.rs +68 -68
  40. data/vendor/kreuzberg/tests/config_loading_tests.rs +71 -62
  41. data/vendor/kreuzberg/tests/contract_mcp.rs +314 -0
  42. data/vendor/kreuzberg/tests/core_integration.rs +55 -53
  43. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +23 -23
  44. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +15 -14
  45. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +4 -4
  46. data/vendor/kreuzberg/tests/email_integration.rs +7 -7
  47. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +2 -2
  48. data/vendor/kreuzberg/tests/error_handling.rs +13 -11
  49. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +2 -2
  50. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  51. data/vendor/kreuzberg/tests/instrumentation_test.rs +18 -13
  52. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +17 -17
  53. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +9 -9
  54. data/vendor/kreuzberg/tests/keywords_integration.rs +25 -25
  55. data/vendor/kreuzberg/tests/keywords_quality.rs +9 -9
  56. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +2 -2
  57. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +1 -1
  58. data/vendor/kreuzberg/tests/mcp_integration.rs +849 -0
  59. data/vendor/kreuzberg/tests/mime_detection.rs +72 -41
  60. data/vendor/kreuzberg/tests/ocr_errors.rs +10 -4
  61. data/vendor/kreuzberg/tests/ocr_language_registry.rs +1 -1
  62. data/vendor/kreuzberg/tests/ocr_stress.rs +3 -3
  63. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +6 -6
  64. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +2 -2
  65. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +2 -2
  66. data/vendor/kreuzberg/tests/page_markers.rs +1 -1
  67. data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +6 -6
  68. data/vendor/kreuzberg/tests/pdf_text_merging.rs +2 -2
  69. data/vendor/kreuzberg/tests/pipeline_integration.rs +77 -61
  70. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +97 -77
  71. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +78 -61
  72. data/vendor/kreuzberg/tests/plugin_system.rs +49 -46
  73. data/vendor/kreuzberg/tests/plugin_validator_test.rs +109 -97
  74. data/vendor/kreuzberg/tests/pptx_regression_tests.rs +40 -30
  75. data/vendor/kreuzberg/tests/registry_integration_tests.rs +26 -23
  76. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +1 -1
  77. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +18 -18
  78. data/vendor/kreuzberg/tests/security_validation.rs +20 -19
  79. data/vendor/kreuzberg/tests/serialization_integration.rs +112 -0
  80. data/vendor/kreuzberg/tests/stopwords_integration_test.rs +36 -36
  81. data/vendor/kreuzberg/tests/test_fastembed.rs +8 -8
  82. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +9 -9
  83. data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +12 -9
  84. data/vendor/kreuzberg-tesseract/Cargo.toml +3 -3
  85. data/vendor/kreuzberg-tesseract/build.rs +4 -4
  86. data/vendor/kreuzberg-tesseract/src/lib.rs +6 -6
  87. data/vendor/kreuzberg-tesseract/tests/integration_test.rs +3 -3
  88. metadata +13 -2
@@ -6,16 +6,16 @@
6
6
  #![allow(clippy::not_unsafe_ptr_arg_deref)]
7
7
  #![allow(clippy::cmp_null)]
8
8
 
9
- //! # tesseract-rs
9
+ //! # kreuzberg-tesseract
10
10
  //!
11
- //! `tesseract-rs` provides safe Rust bindings for Tesseract OCR with built-in compilation
11
+ //! `kreuzberg-tesseract` provides safe Rust bindings for Tesseract OCR with built-in compilation
12
12
  //! of Tesseract and Leptonica libraries. This crate aims to make OCR functionality
13
13
  //! easily accessible in Rust projects while handling the complexity of interfacing
14
14
  //! with the underlying C++ libraries.
15
15
  //!
16
16
  //! ## Usage
17
17
  //!
18
- //! Here's a basic example of how to use `tesseract-rs`:
18
+ //! Here's a basic example of how to use `kreuzberg-tesseract`:
19
19
  //!
20
20
  //! ```rust
21
21
  //! use std::path::PathBuf;
@@ -28,16 +28,16 @@
28
28
  //! PathBuf::from(home_dir)
29
29
  //! .join("Library")
30
30
  //! .join("Application Support")
31
- //! .join("tesseract-rs")
31
+ //! .join("kreuzberg-tesseract")
32
32
  //! .join("tessdata")
33
33
  //! } else if cfg!(target_os = "linux") {
34
34
  //! let home_dir = std::env::var("HOME").expect("HOME environment variable not set");
35
35
  //! PathBuf::from(home_dir)
36
- //! .join(".tesseract-rs")
36
+ //! .join(".kreuzberg-tesseract")
37
37
  //! .join("tessdata")
38
38
  //! } else if cfg!(target_os = "windows") {
39
39
  //! PathBuf::from(std::env::var("APPDATA").expect("APPDATA environment variable not set"))
40
- //! .join("tesseract-rs")
40
+ //! .join("kreuzberg-tesseract")
41
41
  //! .join("tessdata")
42
42
  //! } else {
43
43
  //! panic!("Unsupported operating system");
@@ -7,7 +7,7 @@ fn get_default_tessdata_dir() -> PathBuf {
7
7
  PathBuf::from(home_dir)
8
8
  .join("Library")
9
9
  .join("Application Support")
10
- .join("tesseract-rs")
10
+ .join("kreuzberg-tesseract")
11
11
  .join("tessdata")
12
12
  } else if cfg!(target_os = "linux") {
13
13
  let system_paths = [
@@ -20,10 +20,10 @@ fn get_default_tessdata_dir() -> PathBuf {
20
20
  }
21
21
  }
22
22
  let home_dir = std::env::var("HOME").expect("HOME environment variable not set");
23
- PathBuf::from(home_dir).join(".tesseract-rs").join("tessdata")
23
+ PathBuf::from(home_dir).join(".kreuzberg-tesseract").join("tessdata")
24
24
  } else if cfg!(target_os = "windows") {
25
25
  PathBuf::from(std::env::var("APPDATA").expect("APPDATA environment variable not set"))
26
- .join("tesseract-rs")
26
+ .join("kreuzberg-tesseract")
27
27
  .join("tessdata")
28
28
  } else {
29
29
  panic!("Unsupported operating system");
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: kreuzberg
3
3
  version: !ruby/object:Gem::Version
4
- version: 4.1.1
4
+ version: 4.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Na'aman Hirschfeld
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2026-01-23 00:00:00.000000000 Z
11
+ date: 2026-01-26 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -209,6 +209,7 @@ files:
209
209
  - ext/kreuzberg_rb/native/include/msvc_compat/strings.h
210
210
  - ext/kreuzberg_rb/native/include/strings.h
211
211
  - ext/kreuzberg_rb/native/include/unistd.h
212
+ - ext/kreuzberg_rb/native/libpdfium.so
212
213
  - ext/kreuzberg_rb/native/src/batch.rs
213
214
  - ext/kreuzberg_rb/native/src/config/mod.rs
214
215
  - ext/kreuzberg_rb/native/src/config/types.rs
@@ -271,6 +272,7 @@ files:
271
272
  - spec/fixtures/config.toml
272
273
  - spec/fixtures/config.yaml
273
274
  - spec/fixtures/invalid_config.toml
275
+ - spec/serialization_spec.rb
274
276
  - spec/smoke/package_spec.rb
275
277
  - spec/spec_helper.rb
276
278
  - spec/unit/config/chunking_config_spec.rb
@@ -283,6 +285,7 @@ files:
283
285
  - spec/unit/config/keyword_config_spec.rb
284
286
  - spec/unit/config/language_detection_config_spec.rb
285
287
  - spec/unit/config/ocr_config_spec.rb
288
+ - spec/unit/config/output_format_spec.rb
286
289
  - spec/unit/config/page_config_spec.rb
287
290
  - spec/unit/config/pdf_config_spec.rb
288
291
  - spec/unit/config/postprocessor_config_spec.rb
@@ -705,6 +708,7 @@ files:
705
708
  - vendor/kreuzberg/stopwords/zh_stopwords.json
706
709
  - vendor/kreuzberg/stopwords/zu_stopwords.json
707
710
  - vendor/kreuzberg/tests/api_chunk.rs
711
+ - vendor/kreuzberg/tests/api_consistency.rs
708
712
  - vendor/kreuzberg/tests/api_embed.rs
709
713
  - vendor/kreuzberg/tests/api_extract_multipart.rs
710
714
  - vendor/kreuzberg/tests/api_large_pdf_extraction.rs
@@ -716,9 +720,11 @@ files:
716
720
  - vendor/kreuzberg/tests/batch_processing.rs
717
721
  - vendor/kreuzberg/tests/bibtex_parity_test.rs
718
722
  - vendor/kreuzberg/tests/concurrency_stress.rs
723
+ - vendor/kreuzberg/tests/config_behavioral.rs
719
724
  - vendor/kreuzberg/tests/config_features.rs
720
725
  - vendor/kreuzberg/tests/config_integration_test.rs
721
726
  - vendor/kreuzberg/tests/config_loading_tests.rs
727
+ - vendor/kreuzberg/tests/contract_mcp.rs
722
728
  - vendor/kreuzberg/tests/core_integration.rs
723
729
  - vendor/kreuzberg/tests/csv_integration.rs
724
730
  - vendor/kreuzberg/tests/data/hierarchy_ground_truth.json
@@ -740,6 +746,7 @@ files:
740
746
  - vendor/kreuzberg/tests/keywords_quality.rs
741
747
  - vendor/kreuzberg/tests/latex_extractor_tests.rs
742
748
  - vendor/kreuzberg/tests/markdown_extractor_tests.rs
749
+ - vendor/kreuzberg/tests/mcp_integration.rs
743
750
  - vendor/kreuzberg/tests/mime_detection.rs
744
751
  - vendor/kreuzberg/tests/ocr_configuration.rs
745
752
  - vendor/kreuzberg/tests/ocr_errors.rs
@@ -766,6 +773,7 @@ files:
766
773
  - vendor/kreuzberg/tests/rst_extractor_tests.rs
767
774
  - vendor/kreuzberg/tests/rtf_extractor_tests.rs
768
775
  - vendor/kreuzberg/tests/security_validation.rs
776
+ - vendor/kreuzberg/tests/serialization_integration.rs
769
777
  - vendor/kreuzberg/tests/stopwords_integration_test.rs
770
778
  - vendor/kreuzberg/tests/test_fastembed.rs
771
779
  - vendor/kreuzberg/tests/typst_behavioral_tests.rs
@@ -791,6 +799,9 @@ required_ruby_version: !ruby/object:Gem::Requirement
791
799
  - - ">="
792
800
  - !ruby/object:Gem::Version
793
801
  version: 3.2.0
802
+ - - "<"
803
+ - !ruby/object:Gem::Version
804
+ version: '5.0'
794
805
  required_rubygems_version: !ruby/object:Gem::Requirement
795
806
  requirements:
796
807
  - - ">="