kreuzberg 4.0.0.pre.rc.15 → 4.0.0.pre.rc.16
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +5 -0
- data/Gemfile.lock +2 -1
- data/ext/kreuzberg_rb/native/Cargo.lock +1 -1
- data/ext/kreuzberg_rb/native/Cargo.toml +1 -1
- data/ext/kreuzberg_rb/native/src/lib.rs +682 -9
- data/lib/kreuzberg/config.rb +111 -8
- data/lib/kreuzberg/error_context.rb +76 -0
- data/lib/kreuzberg/result.rb +78 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/sig/kreuzberg.rbs +8 -0
- data/spec/binding/batch_spec.rb +374 -0
- data/spec/binding/config_result_spec.rb +377 -0
- data/spec/binding/config_validation_spec.rb +98 -0
- data/vendor/Cargo.toml +1 -1
- data/vendor/kreuzberg/Cargo.toml +15 -2
- data/vendor/kreuzberg/benches/token_reduction.rs +135 -0
- data/vendor/kreuzberg/src/chunking/mod.rs +464 -28
- data/vendor/kreuzberg/src/core/batch_optimizations.rs +304 -0
- data/vendor/kreuzberg/src/core/config_validation.rs +662 -0
- data/vendor/kreuzberg/src/core/extractor.rs +19 -2
- data/vendor/kreuzberg/src/core/formats.rs +251 -0
- data/vendor/kreuzberg/src/core/mod.rs +12 -0
- data/vendor/kreuzberg/src/core/pipeline.rs +103 -32
- data/vendor/kreuzberg/src/extraction/archive.rs +18 -6
- data/vendor/kreuzberg/src/extraction/docx.rs +7 -3
- data/vendor/kreuzberg/src/extraction/email.rs +15 -11
- data/vendor/kreuzberg/src/extraction/excel.rs +24 -5
- data/vendor/kreuzberg/src/extraction/html.rs +9 -1
- data/vendor/kreuzberg/src/extraction/markdown.rs +5 -2
- data/vendor/kreuzberg/src/extraction/pptx.rs +8 -6
- data/vendor/kreuzberg/src/extraction/structured.rs +5 -4
- data/vendor/kreuzberg/src/extraction/table.rs +3 -1
- data/vendor/kreuzberg/src/extraction/text.rs +27 -10
- data/vendor/kreuzberg/src/extractors/html.rs +2 -1
- data/vendor/kreuzberg/src/extractors/pdf.rs +74 -42
- data/vendor/kreuzberg/src/lib.rs +2 -2
- data/vendor/kreuzberg/src/ocr/language_registry.rs +526 -0
- data/vendor/kreuzberg/src/ocr/mod.rs +2 -0
- data/vendor/kreuzberg/src/pdf/bindings.rs +202 -19
- data/vendor/kreuzberg/src/pdf/bundled.rs +12 -3
- data/vendor/kreuzberg/src/pdf/metadata.rs +8 -0
- data/vendor/kreuzberg/src/pdf/rendering.rs +4 -0
- data/vendor/kreuzberg/src/pdf/text.rs +164 -30
- data/vendor/kreuzberg/src/text/mod.rs +2 -0
- data/vendor/kreuzberg/src/text/quality_processor.rs +37 -12
- data/vendor/kreuzberg/src/text/string_utils.rs +27 -10
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +37 -5
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +24 -10
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +2 -1
- data/vendor/kreuzberg/src/text/utf8_validation.rs +197 -0
- data/vendor/kreuzberg/src/types.rs +380 -6
- data/vendor/kreuzberg/src/utils/mod.rs +11 -0
- data/vendor/kreuzberg/src/utils/pool.rs +364 -0
- data/vendor/kreuzberg/src/utils/quality.rs +12 -3
- data/vendor/kreuzberg/src/utils/string_pool.rs +424 -0
- data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +169 -0
- data/vendor/kreuzberg/tests/ocr_language_registry.rs +207 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +3 -1
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +17 -0
- data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
- metadata +13 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: fe3add89c26722e26baf090f7b9a0c32671c449be6a34ea4285a5f6d15548b72
|
|
4
|
+
data.tar.gz: 49147ceab3fddc3161ff0df55f7c535134d63da7ce2577aad905c91179e875f3
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 5f2e0ab3d3dd4c975a99dcbf4a2e81347673eb74687034f8ef72cc3ece6561fbbed70811edc7363c911385c2d7c2eb0be2d8fa990872845458a3d3f5f019422c
|
|
7
|
+
data.tar.gz: 530bf825eb92e9a3df838ab14ec68277b17e575833ecdf0af11e32d8749e101e4fe68195841524c5e6b41c31a2330076e9da4507f5edf047d8670ad26c9dd928
|
data/.rubocop.yml
CHANGED
|
@@ -52,6 +52,7 @@ Metrics/AbcSize:
|
|
|
52
52
|
Exclude:
|
|
53
53
|
- 'spec/**/*'
|
|
54
54
|
- 'examples/**/*'
|
|
55
|
+
- 'lib/kreuzberg/config.rb'
|
|
55
56
|
|
|
56
57
|
Naming/FileName:
|
|
57
58
|
Enabled: true
|
|
@@ -99,6 +100,10 @@ Metrics/PerceivedComplexity:
|
|
|
99
100
|
Exclude:
|
|
100
101
|
- 'lib/kreuzberg/config.rb'
|
|
101
102
|
|
|
103
|
+
Metrics/ClassLength:
|
|
104
|
+
Exclude:
|
|
105
|
+
- 'lib/kreuzberg/config.rb'
|
|
106
|
+
|
|
102
107
|
RSpec/RepeatedExampleGroupBody:
|
|
103
108
|
Enabled: false
|
|
104
109
|
|
data/Gemfile.lock
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
PATH
|
|
2
2
|
remote: .
|
|
3
3
|
specs:
|
|
4
|
-
kreuzberg (4.0.0.pre.rc.15)
|
|
4
|
+
kreuzberg (4.0.0.pre.rc.16)
|
|
5
5
|
|
|
6
6
|
GEM
|
|
7
7
|
remote: https://rubygems.org/
|
|
@@ -137,6 +137,7 @@ GEM
|
|
|
137
137
|
|
|
138
138
|
PLATFORMS
|
|
139
139
|
arm64-darwin-24
|
|
140
|
+
arm64-darwin-25
|
|
140
141
|
x86_64-linux
|
|
141
142
|
|
|
142
143
|
DEPENDENCIES
|