kreuzberg 4.0.0.pre.rc.15 → 4.0.0.pre.rc.16

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +5 -0
  3. data/Gemfile.lock +2 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +1 -1
  5. data/ext/kreuzberg_rb/native/Cargo.toml +1 -1
  6. data/ext/kreuzberg_rb/native/src/lib.rs +682 -9
  7. data/lib/kreuzberg/config.rb +111 -8
  8. data/lib/kreuzberg/error_context.rb +76 -0
  9. data/lib/kreuzberg/result.rb +78 -0
  10. data/lib/kreuzberg/version.rb +1 -1
  11. data/sig/kreuzberg.rbs +8 -0
  12. data/spec/binding/batch_spec.rb +374 -0
  13. data/spec/binding/config_result_spec.rb +377 -0
  14. data/spec/binding/config_validation_spec.rb +98 -0
  15. data/vendor/Cargo.toml +1 -1
  16. data/vendor/kreuzberg/Cargo.toml +15 -2
  17. data/vendor/kreuzberg/benches/token_reduction.rs +135 -0
  18. data/vendor/kreuzberg/src/chunking/mod.rs +464 -28
  19. data/vendor/kreuzberg/src/core/batch_optimizations.rs +304 -0
  20. data/vendor/kreuzberg/src/core/config_validation.rs +662 -0
  21. data/vendor/kreuzberg/src/core/extractor.rs +19 -2
  22. data/vendor/kreuzberg/src/core/formats.rs +251 -0
  23. data/vendor/kreuzberg/src/core/mod.rs +12 -0
  24. data/vendor/kreuzberg/src/core/pipeline.rs +103 -32
  25. data/vendor/kreuzberg/src/extraction/archive.rs +18 -6
  26. data/vendor/kreuzberg/src/extraction/docx.rs +7 -3
  27. data/vendor/kreuzberg/src/extraction/email.rs +15 -11
  28. data/vendor/kreuzberg/src/extraction/excel.rs +24 -5
  29. data/vendor/kreuzberg/src/extraction/html.rs +9 -1
  30. data/vendor/kreuzberg/src/extraction/markdown.rs +5 -2
  31. data/vendor/kreuzberg/src/extraction/pptx.rs +8 -6
  32. data/vendor/kreuzberg/src/extraction/structured.rs +5 -4
  33. data/vendor/kreuzberg/src/extraction/table.rs +3 -1
  34. data/vendor/kreuzberg/src/extraction/text.rs +27 -10
  35. data/vendor/kreuzberg/src/extractors/html.rs +2 -1
  36. data/vendor/kreuzberg/src/extractors/pdf.rs +74 -42
  37. data/vendor/kreuzberg/src/lib.rs +2 -2
  38. data/vendor/kreuzberg/src/ocr/language_registry.rs +526 -0
  39. data/vendor/kreuzberg/src/ocr/mod.rs +2 -0
  40. data/vendor/kreuzberg/src/pdf/bindings.rs +202 -19
  41. data/vendor/kreuzberg/src/pdf/bundled.rs +12 -3
  42. data/vendor/kreuzberg/src/pdf/metadata.rs +8 -0
  43. data/vendor/kreuzberg/src/pdf/rendering.rs +4 -0
  44. data/vendor/kreuzberg/src/pdf/text.rs +164 -30
  45. data/vendor/kreuzberg/src/text/mod.rs +2 -0
  46. data/vendor/kreuzberg/src/text/quality_processor.rs +37 -12
  47. data/vendor/kreuzberg/src/text/string_utils.rs +27 -10
  48. data/vendor/kreuzberg/src/text/token_reduction/core.rs +37 -5
  49. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +24 -10
  50. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +2 -1
  51. data/vendor/kreuzberg/src/text/utf8_validation.rs +197 -0
  52. data/vendor/kreuzberg/src/types.rs +380 -6
  53. data/vendor/kreuzberg/src/utils/mod.rs +11 -0
  54. data/vendor/kreuzberg/src/utils/pool.rs +364 -0
  55. data/vendor/kreuzberg/src/utils/quality.rs +12 -3
  56. data/vendor/kreuzberg/src/utils/string_pool.rs +424 -0
  57. data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +169 -0
  58. data/vendor/kreuzberg/tests/ocr_language_registry.rs +207 -0
  59. data/vendor/kreuzberg/tests/pipeline_integration.rs +3 -1
  60. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +17 -0
  61. data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
  62. metadata +13 -1
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 1ac94696cb48598d98ae55f75c69c59e1d248577b965a3921e21998ee33d2352
4
- data.tar.gz: 684e9f74a5f0d5c2c52677fec3cec493707b084dc77815396b237864dfeded90
3
+ metadata.gz: fe3add89c26722e26baf090f7b9a0c32671c449be6a34ea4285a5f6d15548b72
4
+ data.tar.gz: 49147ceab3fddc3161ff0df55f7c535134d63da7ce2577aad905c91179e875f3
5
5
  SHA512:
6
- metadata.gz: 6ed0b13217aad741e169850f155a28f921a37a41ffa95fb12a733798b49625f7a9db030eae90ddf00ee3e367b5a563a426fa301c8a16604c8ad5ca3ba78432fc
7
- data.tar.gz: 6c3acf2fb24f573a65e81fdac91f3735a6e2335c340d79a453d73fb43b63b807a8b9e93bbbba38a8c55550be72f0f503513b142238a1c3965e279d8ed522b3ae
6
+ metadata.gz: 5f2e0ab3d3dd4c975a99dcbf4a2e81347673eb74687034f8ef72cc3ece6561fbbed70811edc7363c911385c2d7c2eb0be2d8fa990872845458a3d3f5f019422c
7
+ data.tar.gz: 530bf825eb92e9a3df838ab14ec68277b17e575833ecdf0af11e32d8749e101e4fe68195841524c5e6b41c31a2330076e9da4507f5edf047d8670ad26c9dd928
data/.rubocop.yml CHANGED
@@ -52,6 +52,7 @@ Metrics/AbcSize:
52
52
  Exclude:
53
53
  - 'spec/**/*'
54
54
  - 'examples/**/*'
55
+ - 'lib/kreuzberg/config.rb'
55
56
 
56
57
  Naming/FileName:
57
58
  Enabled: true
@@ -99,6 +100,10 @@ Metrics/PerceivedComplexity:
99
100
  Exclude:
100
101
  - 'lib/kreuzberg/config.rb'
101
102
 
103
+ Metrics/ClassLength:
104
+ Exclude:
105
+ - 'lib/kreuzberg/config.rb'
106
+
102
107
  RSpec/RepeatedExampleGroupBody:
103
108
  Enabled: false
104
109
 
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- kreuzberg (4.0.0.pre.rc.15)
4
+ kreuzberg (4.0.0.pre.rc.16)
5
5
 
6
6
  GEM
7
7
  remote: https://rubygems.org/
@@ -137,6 +137,7 @@ GEM
137
137
 
138
138
  PLATFORMS
139
139
  arm64-darwin-24
140
+ arm64-darwin-25
140
141
  x86_64-linux
141
142
 
142
143
  DEPENDENCIES
data/ext/kreuzberg_rb/native/Cargo.lock CHANGED
@@ -2354,7 +2354,7 @@ dependencies = [
2354
2354
 
2355
2355
  [[package]]
2356
2356
  name = "kreuzberg-rb"
2357
- version = "4.0.0-rc.15"
2357
+ version = "4.0.0-rc.16"
2358
2358
  dependencies = [
2359
2359
  "async-trait",
2360
2360
  "html-to-markdown-rs",
data/ext/kreuzberg_rb/native/Cargo.toml CHANGED
@@ -7,7 +7,7 @@ rb-sys = { path = "../../../vendor/rb-sys" }
7
7
 
8
8
  [package]
9
9
  name = "kreuzberg-rb"
10
- version = "4.0.0-rc.15"
10
+ version = "4.0.0-rc.16"
11
11
  edition = "2024"
12
12
  rust-version = "1.91"
13
13
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]