kreuzberg 4.3.5-aarch64-linux

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82)
  1. checksums.yaml +7 -0
  2. data/.gitignore +14 -0
  3. data/.rspec +3 -0
  4. data/.rubocop.yaml +1 -0
  5. data/.rubocop.yml +543 -0
  6. data/Gemfile +8 -0
  7. data/Gemfile.lock +260 -0
  8. data/README.md +399 -0
  9. data/Rakefile +34 -0
  10. data/Steepfile +51 -0
  11. data/examples/async_patterns.rb +283 -0
  12. data/extconf.rb +60 -0
  13. data/kreuzberg.gemspec +253 -0
  14. data/lib/kreuzberg/api_proxy.rb +125 -0
  15. data/lib/kreuzberg/cache_api.rb +67 -0
  16. data/lib/kreuzberg/cli.rb +57 -0
  17. data/lib/kreuzberg/cli_proxy.rb +118 -0
  18. data/lib/kreuzberg/config.rb +1241 -0
  19. data/lib/kreuzberg/djot_content.rb +225 -0
  20. data/lib/kreuzberg/document_structure.rb +204 -0
  21. data/lib/kreuzberg/error_context.rb +136 -0
  22. data/lib/kreuzberg/errors.rb +116 -0
  23. data/lib/kreuzberg/extraction_api.rb +329 -0
  24. data/lib/kreuzberg/mcp_proxy.rb +176 -0
  25. data/lib/kreuzberg/ocr_backend_protocol.rb +40 -0
  26. data/lib/kreuzberg/post_processor_protocol.rb +15 -0
  27. data/lib/kreuzberg/result.rb +712 -0
  28. data/lib/kreuzberg/setup_lib_path.rb +99 -0
  29. data/lib/kreuzberg/types.rb +414 -0
  30. data/lib/kreuzberg/validator_protocol.rb +16 -0
  31. data/lib/kreuzberg/version.rb +5 -0
  32. data/lib/kreuzberg.rb +102 -0
  33. data/lib/kreuzberg_rb.so +0 -0
  34. data/lib/libpdfium.so +0 -0
  35. data/sig/kreuzberg/internal.rbs +184 -0
  36. data/sig/kreuzberg.rbs +1337 -0
  37. data/spec/binding/async_operations_spec.rb +473 -0
  38. data/spec/binding/batch_operations_spec.rb +677 -0
  39. data/spec/binding/batch_spec.rb +360 -0
  40. data/spec/binding/cache_spec.rb +227 -0
  41. data/spec/binding/cli_proxy_spec.rb +85 -0
  42. data/spec/binding/cli_spec.rb +55 -0
  43. data/spec/binding/config_result_spec.rb +377 -0
  44. data/spec/binding/config_spec.rb +419 -0
  45. data/spec/binding/config_validation_spec.rb +377 -0
  46. data/spec/binding/embeddings_spec.rb +816 -0
  47. data/spec/binding/error_handling_spec.rb +399 -0
  48. data/spec/binding/error_recovery_spec.rb +488 -0
  49. data/spec/binding/errors_spec.rb +66 -0
  50. data/spec/binding/font_config_spec.rb +220 -0
  51. data/spec/binding/images_spec.rb +732 -0
  52. data/spec/binding/keywords_extraction_spec.rb +600 -0
  53. data/spec/binding/metadata_types_spec.rb +1253 -0
  54. data/spec/binding/pages_extraction_spec.rb +550 -0
  55. data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
  56. data/spec/binding/plugins/postprocessor_spec.rb +269 -0
  57. data/spec/binding/plugins/validator_spec.rb +273 -0
  58. data/spec/binding/tables_spec.rb +650 -0
  59. data/spec/fixtures/config.toml +38 -0
  60. data/spec/fixtures/config.yaml +41 -0
  61. data/spec/fixtures/invalid_config.toml +3 -0
  62. data/spec/serialization_spec.rb +134 -0
  63. data/spec/smoke/package_spec.rb +177 -0
  64. data/spec/spec_helper.rb +40 -0
  65. data/spec/unit/config/chunking_config_spec.rb +213 -0
  66. data/spec/unit/config/embedding_config_spec.rb +343 -0
  67. data/spec/unit/config/extraction_config_spec.rb +434 -0
  68. data/spec/unit/config/font_config_spec.rb +285 -0
  69. data/spec/unit/config/hierarchy_config_spec.rb +314 -0
  70. data/spec/unit/config/image_extraction_config_spec.rb +209 -0
  71. data/spec/unit/config/image_preprocessing_config_spec.rb +230 -0
  72. data/spec/unit/config/keyword_config_spec.rb +229 -0
  73. data/spec/unit/config/language_detection_config_spec.rb +258 -0
  74. data/spec/unit/config/ocr_config_spec.rb +171 -0
  75. data/spec/unit/config/output_format_spec.rb +380 -0
  76. data/spec/unit/config/page_config_spec.rb +221 -0
  77. data/spec/unit/config/pdf_config_spec.rb +267 -0
  78. data/spec/unit/config/postprocessor_config_spec.rb +290 -0
  79. data/spec/unit/config/tesseract_config_spec.rb +181 -0
  80. data/spec/unit/config/token_reduction_config_spec.rb +251 -0
  81. data/test/metadata_types_test.rb +959 -0
  82. metadata +292 -0
@@ -0,0 +1,251 @@
1
+ # frozen_string_literal: true
2
+
3
+ RSpec.describe Kreuzberg::Config::TokenReduction do
4
+ describe '#initialize' do
5
+ it 'creates config with default values' do
6
+ config = described_class.new
7
+
8
+ expect(config.mode).to eq 'off'
9
+ expect(config.preserve_important_words).to be true
10
+ end
11
+
12
+ it 'creates config with custom mode' do
13
+ config = described_class.new(mode: 'light')
14
+
15
+ expect(config.mode).to eq 'light'
16
+ expect(config.preserve_important_words).to be true
17
+ end
18
+
19
+ it 'creates config with custom preserve setting' do
20
+ config = described_class.new(
21
+ mode: 'aggressive',
22
+ preserve_important_words: false
23
+ )
24
+
25
+ expect(config.mode).to eq 'aggressive'
26
+ expect(config.preserve_important_words).to be false
27
+ end
28
+
29
+ it 'converts mode symbol to string' do
30
+ config = described_class.new(mode: :moderate)
31
+
32
+ expect(config.mode).to eq 'moderate'
33
+ expect(config.mode).to be_a String
34
+ end
35
+
36
+ it 'converts preserve_important_words to boolean' do
37
+ config = described_class.new(preserve_important_words: 1)
38
+
39
+ expect(config.preserve_important_words).to be true
40
+ end
41
+ end
42
+
43
+ describe '#to_h' do
44
+ it 'serializes to hash with all values' do
45
+ config = described_class.new(
46
+ mode: 'light',
47
+ preserve_important_words: false
48
+ )
49
+ hash = config.to_h
50
+
51
+ expect(hash).to be_a Hash
52
+ expect(hash[:mode]).to eq 'light'
53
+ expect(hash[:preserve_important_words]).to be false
54
+ end
55
+
56
+ it 'always includes all keys in hash' do
57
+ config = described_class.new
58
+ hash = config.to_h
59
+
60
+ expect(hash.keys).to contain_exactly(:mode, :preserve_important_words)
61
+ end
62
+ end
63
+
64
+ describe 'validation' do
65
+ it 'accepts off mode' do
66
+ expect do
67
+ described_class.new(mode: 'off')
68
+ end.not_to raise_error
69
+ end
70
+
71
+ it 'accepts light mode' do
72
+ expect do
73
+ described_class.new(mode: 'light')
74
+ end.not_to raise_error
75
+ end
76
+
77
+ it 'accepts moderate mode' do
78
+ expect do
79
+ described_class.new(mode: 'moderate')
80
+ end.not_to raise_error
81
+ end
82
+
83
+ it 'accepts aggressive mode' do
84
+ expect do
85
+ described_class.new(mode: 'aggressive')
86
+ end.not_to raise_error
87
+ end
88
+
89
+ it 'accepts maximum mode' do
90
+ expect do
91
+ described_class.new(mode: 'maximum')
92
+ end.not_to raise_error
93
+ end
94
+
95
+ it 'rejects invalid mode' do
96
+ expect do
97
+ described_class.new(mode: 'invalid_mode')
98
+ end.to raise_error ArgumentError, /Invalid token reduction mode/
99
+ end
100
+
101
+ it 'lists valid modes in error message' do
102
+ expect do
103
+ described_class.new(mode: 'unknown')
104
+ end.to raise_error ArgumentError, /off.*light.*moderate.*aggressive.*maximum/
105
+ end
106
+ end
107
+
108
+ describe 'keyword arguments' do
109
+ it 'accepts all keyword arguments' do
110
+ config = described_class.new(
111
+ mode: 'maximum',
112
+ preserve_important_words: false
113
+ )
114
+
115
+ expect(config.mode).to eq 'maximum'
116
+ expect(config.preserve_important_words).to be false
117
+ end
118
+ end
119
+
120
+ describe 'equality' do
121
+ it 'compares configs by value' do
122
+ config1 = described_class.new(mode: 'light', preserve_important_words: true)
123
+ config2 = described_class.new(mode: 'light', preserve_important_words: true)
124
+
125
+ expect(config1.mode).to eq config2.mode
126
+ expect(config1.preserve_important_words).to eq config2.preserve_important_words
127
+ end
128
+
129
+ it 'detects differences in mode' do
130
+ config1 = described_class.new(mode: 'light')
131
+ config2 = described_class.new(mode: 'aggressive')
132
+
133
+ expect(config1.mode).not_to eq config2.mode
134
+ end
135
+
136
+ it 'detects differences in preserve_important_words' do
137
+ config1 = described_class.new(preserve_important_words: true)
138
+ config2 = described_class.new(preserve_important_words: false)
139
+
140
+ expect(config1.preserve_important_words).not_to eq config2.preserve_important_words
141
+ end
142
+ end
143
+
144
+ describe 'nested config integration' do
145
+ it 'can be nested in Extraction config' do
146
+ token_reduction = described_class.new(mode: 'light')
147
+ extraction = Kreuzberg::Config::Extraction.new(token_reduction: token_reduction)
148
+
149
+ expect(extraction.token_reduction).to be_a described_class
150
+ expect(extraction.token_reduction.mode).to eq 'light'
151
+ end
152
+
153
+ it 'accepts hash in Extraction config' do
154
+ extraction = Kreuzberg::Config::Extraction.new(
155
+ token_reduction: { mode: 'moderate', preserve_important_words: true }
156
+ )
157
+
158
+ expect(extraction.token_reduction).to be_a described_class
159
+ expect(extraction.token_reduction.mode).to eq 'moderate'
160
+ expect(extraction.token_reduction.preserve_important_words).to be true
161
+ end
162
+ end
163
+
164
+ describe 'symbol vs string key handling' do
165
+ it 'converts symbol mode to string' do
166
+ config = described_class.new(mode: :aggressive)
167
+
168
+ expect(config.mode).to eq 'aggressive'
169
+ expect(config.mode).to be_a String
170
+ end
171
+
172
+ it 'accepts string mode' do
173
+ config = described_class.new(mode: 'light')
174
+
175
+ expect(config.mode).to eq 'light'
176
+ expect(config.mode).to be_a String
177
+ end
178
+ end
179
+
180
+ describe 'boolean conversion' do
181
+ it 'converts truthy preserve_important_words to true' do
182
+ config = described_class.new(preserve_important_words: 1)
183
+
184
+ expect(config.preserve_important_words).to be true
185
+ end
186
+
187
+ it 'converts false preserve_important_words to false' do
188
+ config = described_class.new(preserve_important_words: false)
189
+
190
+ expect(config.preserve_important_words).to be false
191
+ end
192
+
193
+ it 'converts string yes to true' do
194
+ config = described_class.new(preserve_important_words: 'yes')
195
+
196
+ expect(config.preserve_important_words).to be true
197
+ end
198
+ end
199
+
200
+ describe 'reduction modes' do
201
+ it 'off mode disables reduction' do
202
+ config = described_class.new(mode: 'off')
203
+
204
+ expect(config.mode).to eq 'off'
205
+ end
206
+
207
+ it 'light mode provides light reduction' do
208
+ config = described_class.new(mode: 'light')
209
+
210
+ expect(config.mode).to eq 'light'
211
+ end
212
+
213
+ it 'moderate mode provides balanced reduction' do
214
+ config = described_class.new(mode: 'moderate')
215
+
216
+ expect(config.mode).to eq 'moderate'
217
+ end
218
+
219
+ it 'aggressive mode reduces more tokens' do
220
+ config = described_class.new(mode: 'aggressive')
221
+
222
+ expect(config.mode).to eq 'aggressive'
223
+ end
224
+
225
+ it 'maximum mode is most aggressive' do
226
+ config = described_class.new(mode: 'maximum')
227
+
228
+ expect(config.mode).to eq 'maximum'
229
+ end
230
+ end
231
+
232
+ describe 'default behavior' do
233
+ it 'defaults to off mode for safety' do
234
+ config = described_class.new
235
+
236
+ expect(config.mode).to eq 'off'
237
+ end
238
+
239
+ it 'defaults to preserving important words' do
240
+ config = described_class.new
241
+
242
+ expect(config.preserve_important_words).to be true
243
+ end
244
+
245
+ it 'can enable reduction with light mode' do
246
+ config = described_class.new(mode: 'light')
247
+
248
+ expect(config.mode).to eq 'light'
249
+ end
250
+ end
251
+ end