kreuzberg 4.1.2 → 4.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80)
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/libpdfium.so +0 -0
  5. data/kreuzberg.gemspec +13 -1
  6. data/lib/kreuzberg/config.rb +70 -35
  7. data/lib/kreuzberg/version.rb +1 -1
  8. data/sig/kreuzberg.rbs +5 -1
  9. data/spec/binding/batch_operations_spec.rb +80 -0
  10. data/spec/binding/metadata_types_spec.rb +77 -57
  11. data/spec/serialization_spec.rb +134 -0
  12. data/spec/unit/config/output_format_spec.rb +380 -0
  13. data/vendor/Cargo.toml +1 -1
  14. data/vendor/kreuzberg/Cargo.toml +1 -1
  15. data/vendor/kreuzberg/README.md +1 -1
  16. data/vendor/kreuzberg/src/embeddings.rs +4 -4
  17. data/vendor/kreuzberg/src/mcp/format.rs +237 -39
  18. data/vendor/kreuzberg/src/mcp/params.rs +26 -33
  19. data/vendor/kreuzberg/src/mcp/server.rs +6 -3
  20. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +16 -23
  21. data/vendor/kreuzberg/tests/api_chunk.rs +40 -30
  22. data/vendor/kreuzberg/tests/api_consistency.rs +349 -0
  23. data/vendor/kreuzberg/tests/api_embed.rs +84 -50
  24. data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +8 -2
  25. data/vendor/kreuzberg/tests/api_tests.rs +298 -139
  26. data/vendor/kreuzberg/tests/archive_integration.rs +63 -56
  27. data/vendor/kreuzberg/tests/batch_orchestration.rs +22 -14
  28. data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +13 -13
  29. data/vendor/kreuzberg/tests/batch_processing.rs +13 -9
  30. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +10 -10
  31. data/vendor/kreuzberg/tests/concurrency_stress.rs +10 -6
  32. data/vendor/kreuzberg/tests/config_behavioral.rs +414 -0
  33. data/vendor/kreuzberg/tests/config_features.rs +19 -15
  34. data/vendor/kreuzberg/tests/config_integration_test.rs +68 -68
  35. data/vendor/kreuzberg/tests/config_loading_tests.rs +71 -62
  36. data/vendor/kreuzberg/tests/contract_mcp.rs +314 -0
  37. data/vendor/kreuzberg/tests/core_integration.rs +55 -53
  38. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +23 -23
  39. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +15 -14
  40. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +4 -4
  41. data/vendor/kreuzberg/tests/email_integration.rs +7 -7
  42. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +2 -2
  43. data/vendor/kreuzberg/tests/error_handling.rs +13 -11
  44. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +2 -2
  45. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  46. data/vendor/kreuzberg/tests/instrumentation_test.rs +18 -13
  47. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +17 -17
  48. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +9 -9
  49. data/vendor/kreuzberg/tests/keywords_integration.rs +25 -25
  50. data/vendor/kreuzberg/tests/keywords_quality.rs +9 -9
  51. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +2 -2
  52. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +1 -1
  53. data/vendor/kreuzberg/tests/mcp_integration.rs +849 -0
  54. data/vendor/kreuzberg/tests/mime_detection.rs +72 -41
  55. data/vendor/kreuzberg/tests/ocr_errors.rs +10 -4
  56. data/vendor/kreuzberg/tests/ocr_language_registry.rs +1 -1
  57. data/vendor/kreuzberg/tests/ocr_stress.rs +3 -3
  58. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +6 -6
  59. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +2 -2
  60. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +2 -2
  61. data/vendor/kreuzberg/tests/page_markers.rs +1 -1
  62. data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +6 -6
  63. data/vendor/kreuzberg/tests/pdf_text_merging.rs +2 -2
  64. data/vendor/kreuzberg/tests/pipeline_integration.rs +77 -61
  65. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +97 -77
  66. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +78 -61
  67. data/vendor/kreuzberg/tests/plugin_system.rs +49 -46
  68. data/vendor/kreuzberg/tests/plugin_validator_test.rs +109 -97
  69. data/vendor/kreuzberg/tests/pptx_regression_tests.rs +40 -30
  70. data/vendor/kreuzberg/tests/registry_integration_tests.rs +26 -23
  71. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +1 -1
  72. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +18 -18
  73. data/vendor/kreuzberg/tests/security_validation.rs +20 -19
  74. data/vendor/kreuzberg/tests/serialization_integration.rs +112 -0
  75. data/vendor/kreuzberg/tests/stopwords_integration_test.rs +36 -36
  76. data/vendor/kreuzberg/tests/test_fastembed.rs +8 -8
  77. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +9 -9
  78. data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +12 -9
  79. data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
  80. metadata +10 -2
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: kreuzberg
3
3
  version: !ruby/object:Gem::Version
4
- version: 4.1.2
4
+ version: 4.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Na'aman Hirschfeld
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2026-01-25 00:00:00.000000000 Z
11
+ date: 2026-01-26 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -209,6 +209,7 @@ files:
209
209
  - ext/kreuzberg_rb/native/include/msvc_compat/strings.h
210
210
  - ext/kreuzberg_rb/native/include/strings.h
211
211
  - ext/kreuzberg_rb/native/include/unistd.h
212
+ - ext/kreuzberg_rb/native/libpdfium.so
212
213
  - ext/kreuzberg_rb/native/src/batch.rs
213
214
  - ext/kreuzberg_rb/native/src/config/mod.rs
214
215
  - ext/kreuzberg_rb/native/src/config/types.rs
@@ -271,6 +272,7 @@ files:
271
272
  - spec/fixtures/config.toml
272
273
  - spec/fixtures/config.yaml
273
274
  - spec/fixtures/invalid_config.toml
275
+ - spec/serialization_spec.rb
274
276
  - spec/smoke/package_spec.rb
275
277
  - spec/spec_helper.rb
276
278
  - spec/unit/config/chunking_config_spec.rb
@@ -283,6 +285,7 @@ files:
283
285
  - spec/unit/config/keyword_config_spec.rb
284
286
  - spec/unit/config/language_detection_config_spec.rb
285
287
  - spec/unit/config/ocr_config_spec.rb
288
+ - spec/unit/config/output_format_spec.rb
286
289
  - spec/unit/config/page_config_spec.rb
287
290
  - spec/unit/config/pdf_config_spec.rb
288
291
  - spec/unit/config/postprocessor_config_spec.rb
@@ -705,6 +708,7 @@ files:
705
708
  - vendor/kreuzberg/stopwords/zh_stopwords.json
706
709
  - vendor/kreuzberg/stopwords/zu_stopwords.json
707
710
  - vendor/kreuzberg/tests/api_chunk.rs
711
+ - vendor/kreuzberg/tests/api_consistency.rs
708
712
  - vendor/kreuzberg/tests/api_embed.rs
709
713
  - vendor/kreuzberg/tests/api_extract_multipart.rs
710
714
  - vendor/kreuzberg/tests/api_large_pdf_extraction.rs
@@ -716,9 +720,11 @@ files:
716
720
  - vendor/kreuzberg/tests/batch_processing.rs
717
721
  - vendor/kreuzberg/tests/bibtex_parity_test.rs
718
722
  - vendor/kreuzberg/tests/concurrency_stress.rs
723
+ - vendor/kreuzberg/tests/config_behavioral.rs
719
724
  - vendor/kreuzberg/tests/config_features.rs
720
725
  - vendor/kreuzberg/tests/config_integration_test.rs
721
726
  - vendor/kreuzberg/tests/config_loading_tests.rs
727
+ - vendor/kreuzberg/tests/contract_mcp.rs
722
728
  - vendor/kreuzberg/tests/core_integration.rs
723
729
  - vendor/kreuzberg/tests/csv_integration.rs
724
730
  - vendor/kreuzberg/tests/data/hierarchy_ground_truth.json
@@ -740,6 +746,7 @@ files:
740
746
  - vendor/kreuzberg/tests/keywords_quality.rs
741
747
  - vendor/kreuzberg/tests/latex_extractor_tests.rs
742
748
  - vendor/kreuzberg/tests/markdown_extractor_tests.rs
749
+ - vendor/kreuzberg/tests/mcp_integration.rs
743
750
  - vendor/kreuzberg/tests/mime_detection.rs
744
751
  - vendor/kreuzberg/tests/ocr_configuration.rs
745
752
  - vendor/kreuzberg/tests/ocr_errors.rs
@@ -766,6 +773,7 @@ files:
766
773
  - vendor/kreuzberg/tests/rst_extractor_tests.rs
767
774
  - vendor/kreuzberg/tests/rtf_extractor_tests.rs
768
775
  - vendor/kreuzberg/tests/security_validation.rs
776
+ - vendor/kreuzberg/tests/serialization_integration.rs
769
777
  - vendor/kreuzberg/tests/stopwords_integration_test.rs
770
778
  - vendor/kreuzberg/tests/test_fastembed.rs
771
779
  - vendor/kreuzberg/tests/typst_behavioral_tests.rs