kreuzberg 4.3.5-aarch64-linux
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +14 -0
- data/.rspec +3 -0
- data/.rubocop.yaml +1 -0
- data/.rubocop.yml +543 -0
- data/Gemfile +8 -0
- data/Gemfile.lock +260 -0
- data/README.md +399 -0
- data/Rakefile +34 -0
- data/Steepfile +51 -0
- data/examples/async_patterns.rb +283 -0
- data/extconf.rb +60 -0
- data/kreuzberg.gemspec +253 -0
- data/lib/kreuzberg/api_proxy.rb +125 -0
- data/lib/kreuzberg/cache_api.rb +67 -0
- data/lib/kreuzberg/cli.rb +57 -0
- data/lib/kreuzberg/cli_proxy.rb +118 -0
- data/lib/kreuzberg/config.rb +1241 -0
- data/lib/kreuzberg/djot_content.rb +225 -0
- data/lib/kreuzberg/document_structure.rb +204 -0
- data/lib/kreuzberg/error_context.rb +136 -0
- data/lib/kreuzberg/errors.rb +116 -0
- data/lib/kreuzberg/extraction_api.rb +329 -0
- data/lib/kreuzberg/mcp_proxy.rb +176 -0
- data/lib/kreuzberg/ocr_backend_protocol.rb +40 -0
- data/lib/kreuzberg/post_processor_protocol.rb +15 -0
- data/lib/kreuzberg/result.rb +712 -0
- data/lib/kreuzberg/setup_lib_path.rb +99 -0
- data/lib/kreuzberg/types.rb +414 -0
- data/lib/kreuzberg/validator_protocol.rb +16 -0
- data/lib/kreuzberg/version.rb +5 -0
- data/lib/kreuzberg.rb +102 -0
- data/lib/kreuzberg_rb.so +0 -0
- data/lib/libpdfium.so +0 -0
- data/sig/kreuzberg/internal.rbs +184 -0
- data/sig/kreuzberg.rbs +1337 -0
- data/spec/binding/async_operations_spec.rb +473 -0
- data/spec/binding/batch_operations_spec.rb +677 -0
- data/spec/binding/batch_spec.rb +360 -0
- data/spec/binding/cache_spec.rb +227 -0
- data/spec/binding/cli_proxy_spec.rb +85 -0
- data/spec/binding/cli_spec.rb +55 -0
- data/spec/binding/config_result_spec.rb +377 -0
- data/spec/binding/config_spec.rb +419 -0
- data/spec/binding/config_validation_spec.rb +377 -0
- data/spec/binding/embeddings_spec.rb +816 -0
- data/spec/binding/error_handling_spec.rb +399 -0
- data/spec/binding/error_recovery_spec.rb +488 -0
- data/spec/binding/errors_spec.rb +66 -0
- data/spec/binding/font_config_spec.rb +220 -0
- data/spec/binding/images_spec.rb +732 -0
- data/spec/binding/keywords_extraction_spec.rb +600 -0
- data/spec/binding/metadata_types_spec.rb +1253 -0
- data/spec/binding/pages_extraction_spec.rb +550 -0
- data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
- data/spec/binding/plugins/postprocessor_spec.rb +269 -0
- data/spec/binding/plugins/validator_spec.rb +273 -0
- data/spec/binding/tables_spec.rb +650 -0
- data/spec/fixtures/config.toml +38 -0
- data/spec/fixtures/config.yaml +41 -0
- data/spec/fixtures/invalid_config.toml +3 -0
- data/spec/serialization_spec.rb +134 -0
- data/spec/smoke/package_spec.rb +177 -0
- data/spec/spec_helper.rb +40 -0
- data/spec/unit/config/chunking_config_spec.rb +213 -0
- data/spec/unit/config/embedding_config_spec.rb +343 -0
- data/spec/unit/config/extraction_config_spec.rb +434 -0
- data/spec/unit/config/font_config_spec.rb +285 -0
- data/spec/unit/config/hierarchy_config_spec.rb +314 -0
- data/spec/unit/config/image_extraction_config_spec.rb +209 -0
- data/spec/unit/config/image_preprocessing_config_spec.rb +230 -0
- data/spec/unit/config/keyword_config_spec.rb +229 -0
- data/spec/unit/config/language_detection_config_spec.rb +258 -0
- data/spec/unit/config/ocr_config_spec.rb +171 -0
- data/spec/unit/config/output_format_spec.rb +380 -0
- data/spec/unit/config/page_config_spec.rb +221 -0
- data/spec/unit/config/pdf_config_spec.rb +267 -0
- data/spec/unit/config/postprocessor_config_spec.rb +290 -0
- data/spec/unit/config/tesseract_config_spec.rb +181 -0
- data/spec/unit/config/token_reduction_config_spec.rb +251 -0
- data/test/metadata_types_test.rb +959 -0
- metadata +292 -0
|
@@ -0,0 +1,251 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
# Unit specs for Kreuzberg::Config::TokenReduction.
#
# The examples cover construction defaults, coercion of +mode+ and
# +preserve_important_words+, +#to_h+ serialization, rejection of unknown
# modes, and nesting inside Kreuzberg::Config::Extraction.
RSpec.describe Kreuzberg::Config::TokenReduction do
  describe '#initialize' do
    it 'creates config with default values' do
      settings = described_class.new

      expect(settings.mode).to eq('off')
      expect(settings.preserve_important_words).to be(true)
    end

    it 'creates config with custom mode' do
      settings = described_class.new(mode: 'light')

      expect(settings.mode).to eq('light')
      expect(settings.preserve_important_words).to be(true)
    end

    it 'creates config with custom preserve setting' do
      settings = described_class.new(mode: 'aggressive', preserve_important_words: false)

      expect(settings.mode).to eq('aggressive')
      expect(settings.preserve_important_words).to be(false)
    end

    it 'converts mode symbol to string' do
      settings = described_class.new(mode: :moderate)

      expect(settings.mode).to eq('moderate')
      expect(settings.mode).to be_a(String)
    end

    it 'converts preserve_important_words to boolean' do
      settings = described_class.new(preserve_important_words: 1)

      expect(settings.preserve_important_words).to be(true)
    end
  end

  describe '#to_h' do
    it 'serializes to hash with all values' do
      serialized = described_class.new(mode: 'light', preserve_important_words: false).to_h

      expect(serialized).to be_a(Hash)
      expect(serialized[:mode]).to eq('light')
      expect(serialized[:preserve_important_words]).to be(false)
    end

    it 'always includes all keys in hash' do
      serialized = described_class.new.to_h

      expect(serialized.keys).to contain_exactly(:mode, :preserve_important_words)
    end
  end

  describe 'validation' do
    # One example per mode the constructor is expected to accept.
    %w[off light moderate aggressive maximum].each do |valid_mode|
      it "accepts #{valid_mode} mode" do
        expect { described_class.new(mode: valid_mode) }.not_to raise_error
      end
    end

    it 'rejects invalid mode' do
      expect { described_class.new(mode: 'invalid_mode') }
        .to raise_error(ArgumentError, /Invalid token reduction mode/)
    end

    it 'lists valid modes in error message' do
      # The pattern requires the mode names to appear in this exact order.
      expect { described_class.new(mode: 'unknown') }
        .to raise_error(ArgumentError, /off.*light.*moderate.*aggressive.*maximum/)
    end
  end

  describe 'keyword arguments' do
    it 'accepts all keyword arguments' do
      settings = described_class.new(mode: 'maximum', preserve_important_words: false)

      expect(settings.mode).to eq('maximum')
      expect(settings.preserve_important_words).to be(false)
    end
  end

  # NOTE(review): these examples compare individual attributes, not the config
  # objects themselves — confirm whether object-level equality should be asserted.
  describe 'equality' do
    it 'compares configs by value' do
      left = described_class.new(mode: 'light', preserve_important_words: true)
      right = described_class.new(mode: 'light', preserve_important_words: true)

      expect(left.mode).to eq(right.mode)
      expect(left.preserve_important_words).to eq(right.preserve_important_words)
    end

    it 'detects differences in mode' do
      left = described_class.new(mode: 'light')
      right = described_class.new(mode: 'aggressive')

      expect(left.mode).not_to eq(right.mode)
    end

    it 'detects differences in preserve_important_words' do
      left = described_class.new(preserve_important_words: true)
      right = described_class.new(preserve_important_words: false)

      expect(left.preserve_important_words).not_to eq(right.preserve_important_words)
    end
  end

  describe 'nested config integration' do
    it 'can be nested in Extraction config' do
      nested = described_class.new(mode: 'light')
      extraction_config = Kreuzberg::Config::Extraction.new(token_reduction: nested)

      expect(extraction_config.token_reduction).to be_a(described_class)
      expect(extraction_config.token_reduction.mode).to eq('light')
    end

    it 'accepts hash in Extraction config' do
      extraction_config = Kreuzberg::Config::Extraction.new(
        token_reduction: { mode: 'moderate', preserve_important_words: true }
      )

      # The plain hash is expected to come back as a TokenReduction instance.
      expect(extraction_config.token_reduction).to be_a(described_class)
      expect(extraction_config.token_reduction.mode).to eq('moderate')
      expect(extraction_config.token_reduction.preserve_important_words).to be(true)
    end
  end

  describe 'symbol vs string key handling' do
    it 'converts symbol mode to string' do
      settings = described_class.new(mode: :aggressive)

      expect(settings.mode).to eq('aggressive')
      expect(settings.mode).to be_a(String)
    end

    it 'accepts string mode' do
      settings = described_class.new(mode: 'light')

      expect(settings.mode).to eq('light')
      expect(settings.mode).to be_a(String)
    end
  end

  describe 'boolean conversion' do
    it 'converts truthy preserve_important_words to true' do
      settings = described_class.new(preserve_important_words: 1)

      expect(settings.preserve_important_words).to be(true)
    end

    it 'converts false preserve_important_words to false' do
      settings = described_class.new(preserve_important_words: false)

      expect(settings.preserve_important_words).to be(false)
    end

    it 'converts string yes to true' do
      settings = described_class.new(preserve_important_words: 'yes')

      expect(settings.preserve_important_words).to be(true)
    end
  end

  # These examples only assert that the mode string round-trips through the
  # config; the reduction behaviour itself is not exercised here.
  describe 'reduction modes' do
    {
      'off' => 'off mode disables reduction',
      'light' => 'light mode provides light reduction',
      'moderate' => 'moderate mode provides balanced reduction',
      'aggressive' => 'aggressive mode reduces more tokens',
      'maximum' => 'maximum mode is most aggressive'
    }.each do |mode_name, example_title|
      it example_title do
        settings = described_class.new(mode: mode_name)

        expect(settings.mode).to eq(mode_name)
      end
    end
  end

  describe 'default behavior' do
    it 'defaults to off mode for safety' do
      settings = described_class.new

      expect(settings.mode).to eq('off')
    end

    it 'defaults to preserving important words' do
      settings = described_class.new

      expect(settings.preserve_important_words).to be(true)
    end

    it 'can enable reduction with light mode' do
      settings = described_class.new(mode: 'light')

      expect(settings.mode).to eq('light')
    end
  end
end
|