kreuzberg 4.3.5-aarch64-linux

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +14 -0
  3. data/.rspec +3 -0
  4. data/.rubocop.yaml +1 -0
  5. data/.rubocop.yml +543 -0
  6. data/Gemfile +8 -0
  7. data/Gemfile.lock +260 -0
  8. data/README.md +399 -0
  9. data/Rakefile +34 -0
  10. data/Steepfile +51 -0
  11. data/examples/async_patterns.rb +283 -0
  12. data/extconf.rb +60 -0
  13. data/kreuzberg.gemspec +253 -0
  14. data/lib/kreuzberg/api_proxy.rb +125 -0
  15. data/lib/kreuzberg/cache_api.rb +67 -0
  16. data/lib/kreuzberg/cli.rb +57 -0
  17. data/lib/kreuzberg/cli_proxy.rb +118 -0
  18. data/lib/kreuzberg/config.rb +1241 -0
  19. data/lib/kreuzberg/djot_content.rb +225 -0
  20. data/lib/kreuzberg/document_structure.rb +204 -0
  21. data/lib/kreuzberg/error_context.rb +136 -0
  22. data/lib/kreuzberg/errors.rb +116 -0
  23. data/lib/kreuzberg/extraction_api.rb +329 -0
  24. data/lib/kreuzberg/mcp_proxy.rb +176 -0
  25. data/lib/kreuzberg/ocr_backend_protocol.rb +40 -0
  26. data/lib/kreuzberg/post_processor_protocol.rb +15 -0
  27. data/lib/kreuzberg/result.rb +712 -0
  28. data/lib/kreuzberg/setup_lib_path.rb +99 -0
  29. data/lib/kreuzberg/types.rb +414 -0
  30. data/lib/kreuzberg/validator_protocol.rb +16 -0
  31. data/lib/kreuzberg/version.rb +5 -0
  32. data/lib/kreuzberg.rb +102 -0
  33. data/lib/kreuzberg_rb.so +0 -0
  34. data/lib/libpdfium.so +0 -0
  35. data/sig/kreuzberg/internal.rbs +184 -0
  36. data/sig/kreuzberg.rbs +1337 -0
  37. data/spec/binding/async_operations_spec.rb +473 -0
  38. data/spec/binding/batch_operations_spec.rb +677 -0
  39. data/spec/binding/batch_spec.rb +360 -0
  40. data/spec/binding/cache_spec.rb +227 -0
  41. data/spec/binding/cli_proxy_spec.rb +85 -0
  42. data/spec/binding/cli_spec.rb +55 -0
  43. data/spec/binding/config_result_spec.rb +377 -0
  44. data/spec/binding/config_spec.rb +419 -0
  45. data/spec/binding/config_validation_spec.rb +377 -0
  46. data/spec/binding/embeddings_spec.rb +816 -0
  47. data/spec/binding/error_handling_spec.rb +399 -0
  48. data/spec/binding/error_recovery_spec.rb +488 -0
  49. data/spec/binding/errors_spec.rb +66 -0
  50. data/spec/binding/font_config_spec.rb +220 -0
  51. data/spec/binding/images_spec.rb +732 -0
  52. data/spec/binding/keywords_extraction_spec.rb +600 -0
  53. data/spec/binding/metadata_types_spec.rb +1253 -0
  54. data/spec/binding/pages_extraction_spec.rb +550 -0
  55. data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
  56. data/spec/binding/plugins/postprocessor_spec.rb +269 -0
  57. data/spec/binding/plugins/validator_spec.rb +273 -0
  58. data/spec/binding/tables_spec.rb +650 -0
  59. data/spec/fixtures/config.toml +38 -0
  60. data/spec/fixtures/config.yaml +41 -0
  61. data/spec/fixtures/invalid_config.toml +3 -0
  62. data/spec/serialization_spec.rb +134 -0
  63. data/spec/smoke/package_spec.rb +177 -0
  64. data/spec/spec_helper.rb +40 -0
  65. data/spec/unit/config/chunking_config_spec.rb +213 -0
  66. data/spec/unit/config/embedding_config_spec.rb +343 -0
  67. data/spec/unit/config/extraction_config_spec.rb +434 -0
  68. data/spec/unit/config/font_config_spec.rb +285 -0
  69. data/spec/unit/config/hierarchy_config_spec.rb +314 -0
  70. data/spec/unit/config/image_extraction_config_spec.rb +209 -0
  71. data/spec/unit/config/image_preprocessing_config_spec.rb +230 -0
  72. data/spec/unit/config/keyword_config_spec.rb +229 -0
  73. data/spec/unit/config/language_detection_config_spec.rb +258 -0
  74. data/spec/unit/config/ocr_config_spec.rb +171 -0
  75. data/spec/unit/config/output_format_spec.rb +380 -0
  76. data/spec/unit/config/page_config_spec.rb +221 -0
  77. data/spec/unit/config/pdf_config_spec.rb +267 -0
  78. data/spec/unit/config/postprocessor_config_spec.rb +290 -0
  79. data/spec/unit/config/tesseract_config_spec.rb +181 -0
  80. data/spec/unit/config/token_reduction_config_spec.rb +251 -0
  81. data/test/metadata_types_test.rb +959 -0
  82. metadata +292 -0
@@ -0,0 +1,290 @@
1
+ # frozen_string_literal: true
2
+
3
+ RSpec.describe Kreuzberg::Config::PostProcessor do
4
+ describe '#initialize' do
5
+ it 'creates config with default values' do
6
+ config = described_class.new
7
+
8
+ expect(config.enabled).to be true
9
+ expect(config.enabled_processors).to be_nil
10
+ expect(config.disabled_processors).to be_nil
11
+ end
12
+
13
+ it 'creates config with enabled true' do
14
+ config = described_class.new(enabled: true)
15
+
16
+ expect(config.enabled).to be true
17
+ end
18
+
19
+ it 'creates config with enabled false' do
20
+ config = described_class.new(enabled: false)
21
+
22
+ expect(config.enabled).to be false
23
+ end
24
+
25
+ it 'creates config with enabled_processors list' do
26
+ config = described_class.new(
27
+ enabled: true,
28
+ enabled_processors: %w[quality formatting]
29
+ )
30
+
31
+ expect(config.enabled_processors).to eq %w[quality formatting]
32
+ end
33
+
34
+ it 'creates config with disabled_processors list' do
35
+ config = described_class.new(
36
+ enabled: true,
37
+ disabled_processors: %w[token_reduction]
38
+ )
39
+
40
+ expect(config.disabled_processors).to eq %w[token_reduction]
41
+ end
42
+
43
+ it 'converts enabled_processors to strings' do
44
+ config = described_class.new(enabled_processors: %i[quality formatting])
45
+
46
+ expect(config.enabled_processors).to eq %w[quality formatting]
47
+ expect(config.enabled_processors.all?(String)).to be true
48
+ end
49
+
50
+ it 'converts disabled_processors to strings' do
51
+ config = described_class.new(disabled_processors: [:quality])
52
+
53
+ expect(config.disabled_processors).to eq %w[quality]
54
+ expect(config.disabled_processors.all?(String)).to be true
55
+ end
56
+
57
+ it 'converts enabled to boolean' do
58
+ config = described_class.new(enabled: 1)
59
+
60
+ expect(config.enabled).to be true
61
+ end
62
+ end
63
+
64
+ describe '#to_h' do
65
+ it 'serializes to hash with default values' do
66
+ config = described_class.new
67
+ hash = config.to_h
68
+
69
+ expect(hash).to be_a Hash
70
+ expect(hash[:enabled]).to be true
71
+ end
72
+
73
+ it 'includes enabled_processors in hash when present' do
74
+ config = described_class.new(enabled_processors: %w[quality])
75
+ hash = config.to_h
76
+
77
+ expect(hash[:enabled_processors]).to eq %w[quality]
78
+ end
79
+
80
+ it 'includes disabled_processors in hash when present' do
81
+ config = described_class.new(disabled_processors: %w[token_reduction])
82
+ hash = config.to_h
83
+
84
+ expect(hash[:disabled_processors]).to eq %w[token_reduction]
85
+ end
86
+
87
+ it 'compacts nil values from hash' do
88
+ config = described_class.new(enabled: true)
89
+ hash = config.to_h
90
+
91
+ expect(hash.key?(:enabled_processors)).to be false
92
+ expect(hash.key?(:disabled_processors)).to be false
93
+ end
94
+ end
95
+
96
+ describe 'validation' do
97
+ it 'accepts enabled true' do
98
+ expect do
99
+ described_class.new(enabled: true)
100
+ end.not_to raise_error
101
+ end
102
+
103
+ it 'accepts enabled false' do
104
+ expect do
105
+ described_class.new(enabled: false)
106
+ end.not_to raise_error
107
+ end
108
+
109
+ it 'accepts enabled_processors list' do
110
+ expect do
111
+ described_class.new(enabled_processors: %w[quality formatting])
112
+ end.not_to raise_error
113
+ end
114
+
115
+ it 'accepts disabled_processors list' do
116
+ expect do
117
+ described_class.new(disabled_processors: %w[token_reduction])
118
+ end.not_to raise_error
119
+ end
120
+
121
+ it 'accepts both enabled and disabled processors' do
122
+ expect do
123
+ described_class.new(
124
+ enabled_processors: %w[quality],
125
+ disabled_processors: %w[formatting]
126
+ )
127
+ end.not_to raise_error
128
+ end
129
+ end
130
+
131
+ describe 'keyword arguments' do
132
+ it 'accepts all keyword arguments' do
133
+ config = described_class.new(
134
+ enabled: true,
135
+ enabled_processors: %w[quality],
136
+ disabled_processors: %w[token_reduction]
137
+ )
138
+
139
+ expect(config.enabled).to be true
140
+ expect(config.enabled_processors).to eq %w[quality]
141
+ expect(config.disabled_processors).to eq %w[token_reduction]
142
+ end
143
+ end
144
+
145
+ describe 'equality' do
146
+ it 'compares configs by value' do # NOTE(review): only attribute-by-attribute equality is asserted below; config1 == config2 itself is never checked
147
+ config1 = described_class.new(
148
+ enabled: true,
149
+ enabled_processors: %w[quality]
150
+ )
151
+ config2 = described_class.new(
152
+ enabled: true,
153
+ enabled_processors: %w[quality]
154
+ )
155
+
156
+ expect(config1.enabled).to eq config2.enabled
157
+ expect(config1.enabled_processors).to eq config2.enabled_processors
158
+ end
159
+
160
+ it 'detects differences in enabled' do
161
+ config1 = described_class.new(enabled: true)
162
+ config2 = described_class.new(enabled: false)
163
+
164
+ expect(config1.enabled).not_to eq config2.enabled
165
+ end
166
+
167
+ it 'detects differences in enabled_processors' do
168
+ config1 = described_class.new(enabled_processors: %w[quality])
169
+ config2 = described_class.new(enabled_processors: %w[formatting])
170
+
171
+ expect(config1.enabled_processors).not_to eq config2.enabled_processors
172
+ end
173
+ end
174
+
175
+ describe 'nested config integration' do
176
+ it 'can be nested in Extraction config' do
177
+ postprocessor = described_class.new(
178
+ enabled: true,
179
+ enabled_processors: %w[quality]
180
+ )
181
+ extraction = Kreuzberg::Config::Extraction.new(postprocessor: postprocessor)
182
+
183
+ expect(extraction.postprocessor).to be_a described_class
184
+ expect(extraction.postprocessor.enabled).to be true
185
+ expect(extraction.postprocessor.enabled_processors).to eq %w[quality]
186
+ end
187
+
188
+ it 'accepts hash in Extraction config' do
189
+ extraction = Kreuzberg::Config::Extraction.new(
190
+ postprocessor: {
191
+ enabled: true,
192
+ enabled_processors: %w[quality formatting]
193
+ }
194
+ )
195
+
196
+ expect(extraction.postprocessor).to be_a described_class
197
+ expect(extraction.postprocessor.enabled).to be true
198
+ expect(extraction.postprocessor.enabled_processors).to eq %w[quality formatting]
199
+ end
200
+ end
201
+
202
+ describe 'symbol vs string key handling' do
203
+ it 'converts symbol enabled_processors to strings' do
204
+ config = described_class.new(enabled_processors: %i[quality formatting])
205
+
206
+ expect(config.enabled_processors).to eq %w[quality formatting]
207
+ expect(config.enabled_processors.all?(String)).to be true
208
+ end
209
+
210
+ it 'converts symbol disabled_processors to strings' do
211
+ config = described_class.new(disabled_processors: [:token_reduction])
212
+
213
+ expect(config.disabled_processors).to eq %w[token_reduction]
214
+ expect(config.disabled_processors.all?(String)).to be true
215
+ end
216
+ end
217
+
218
+ describe 'processor lists' do
219
+ it 'stores empty enabled_processors list' do
220
+ config = described_class.new(enabled_processors: [])
221
+
222
+ expect(config.enabled_processors).to eq []
223
+ end
224
+
225
+ it 'stores single enabled_processor' do
226
+ config = described_class.new(enabled_processors: %w[quality])
227
+
228
+ expect(config.enabled_processors).to eq %w[quality]
229
+ end
230
+
231
+ it 'stores multiple enabled_processors' do
232
+ processors = %w[quality formatting cleanup]
233
+ config = described_class.new(enabled_processors: processors)
234
+
235
+ expect(config.enabled_processors).to eq processors
236
+ end
237
+
238
+ it 'stores multiple disabled_processors' do
239
+ processors = %w[token_reduction duplicate_removal]
240
+ config = described_class.new(disabled_processors: processors)
241
+
242
+ expect(config.disabled_processors).to eq processors
243
+ end
244
+ end
245
+
246
+ describe 'boolean conversion' do
247
+ it 'converts truthy enabled to true' do
248
+ config = described_class.new(enabled: 1)
249
+
250
+ expect(config.enabled).to be true
251
+ end
252
+
253
+ it 'converts false enabled to false' do
254
+ config = described_class.new(enabled: false)
255
+
256
+ expect(config.enabled).to be false
257
+ end
258
+
259
+ it 'converts string true to true' do
260
+ config = described_class.new(enabled: 'yes') # NOTE(review): the input is 'yes', not 'true'; strings such as 'false' or 'no' are not exercised here — confirm intended coercion
261
+
262
+ expect(config.enabled).to be true
263
+ end
264
+ end
265
+
266
+ describe 'default behavior' do
267
+ it 'defaults to enabled' do
268
+ config = described_class.new
269
+
270
+ expect(config.enabled).to be true
271
+ end
272
+
273
+ it 'defaults to no specific processors' do
274
+ config = described_class.new
275
+
276
+ expect(config.enabled_processors).to be_nil
277
+ expect(config.disabled_processors).to be_nil
278
+ end
279
+
280
+ it 'allows disabling while specifying processors' do
281
+ config = described_class.new(
282
+ enabled: false,
283
+ enabled_processors: %w[quality]
284
+ )
285
+
286
+ expect(config.enabled).to be false
287
+ expect(config.enabled_processors).to eq %w[quality]
288
+ end
289
+ end
290
+ end
@@ -0,0 +1,181 @@
1
+ # frozen_string_literal: true
2
+
3
+ RSpec.describe Kreuzberg::Config::Tesseract do
4
+ describe '#initialize' do
5
+ it 'creates config with no arguments' do
6
+ config = described_class.new
7
+
8
+ expect(config).to be_a described_class
9
+ expect(config.options).to eq({})
10
+ end
11
+
12
+ it 'creates config with custom options' do
13
+ config = described_class.new(dpi: 300, psm: 3)
14
+
15
+ expect(config.options[:dpi]).to eq 300
16
+ expect(config.options[:psm]).to eq 3
17
+ end
18
+
19
+ it 'converts string keys to symbols' do
20
+ config = described_class.new('oem' => 1, 'lang' => 'eng')
21
+
22
+ expect(config.options[:oem]).to eq 1
23
+ expect(config.options[:lang]).to eq 'eng'
24
+ end
25
+
26
+ it 'accepts preprocessing as hash' do
27
+ config = described_class.new(preprocessing: { target_dpi: 300 })
28
+
29
+ expect(config.options[:preprocessing]).to be_a Kreuzberg::Config::ImagePreprocessing
30
+ end
31
+
32
+ it 'accepts preprocessing as instance' do
33
+ preprocessing = Kreuzberg::Config::ImagePreprocessing.new(target_dpi: 600)
34
+ config = described_class.new(preprocessing: preprocessing)
35
+
36
+ expect(config.options[:preprocessing]).to be_a Kreuzberg::Config::ImagePreprocessing
37
+ expect(config.options[:preprocessing].target_dpi).to eq 600
38
+ end
39
+ end
40
+
41
+ describe '#to_h' do
42
+ it 'returns options as hash' do
43
+ config = described_class.new(dpi: 300, psm: 3)
44
+ hash = config.to_h
45
+
46
+ expect(hash).to be_a Hash
47
+ expect(hash[:dpi]).to eq 300
48
+ expect(hash[:psm]).to eq 3
49
+ end
50
+
51
+ it 'includes nested preprocessing in hash' do
52
+ config = described_class.new(
53
+ preprocessing: { target_dpi: 300, denoise: true }
54
+ )
55
+ hash = config.to_h
56
+
57
+ expect(hash[:preprocessing]).to be_a Kreuzberg::Config::ImagePreprocessing
58
+ # The nested value is expected to remain an ImagePreprocessing instance (not a Hash), so its attributes are read via accessors
59
+ expect(hash[:preprocessing].target_dpi).to eq 300
60
+ expect(hash[:preprocessing].denoise).to be true
61
+ end
62
+
63
+ it 'returns duplicate hash not original' do
64
+ config = described_class.new(value: 'test')
65
+ hash1 = config.to_h
66
+ hash2 = config.to_h
67
+
68
+ expect(hash1).to eq hash2 # both calls yield the same contents
69
+ expect(hash1).not_to be hash2 # but not the same object: `be` with an argument compares identity
70
+ end
71
+ end
72
+
73
+ describe 'validation' do
74
+ it 'rejects invalid preprocessing type' do
75
+ expect do
76
+ described_class.new(preprocessing: 'invalid')
77
+ end.to raise_error ArgumentError, /preprocessing must be.*ImagePreprocessing.*Hash/
78
+ end
79
+
80
+ it 'accepts valid preprocessing hash' do
81
+ expect do
82
+ described_class.new(preprocessing: { target_dpi: 300 })
83
+ end.not_to raise_error
84
+ end
85
+
86
+ it 'accepts valid preprocessing instance' do
87
+ preprocessing = Kreuzberg::Config::ImagePreprocessing.new
88
+ expect do
89
+ described_class.new(preprocessing: preprocessing)
90
+ end.not_to raise_error
91
+ end
92
+ end
93
+
94
+ describe 'keyword arguments' do
95
+ it 'accepts arbitrary keyword arguments' do
96
+ config = described_class.new(
97
+ dpi: 300,
98
+ psm: 3,
99
+ oem: 1,
100
+ custom_option: 'value'
101
+ )
102
+
103
+ expect(config.options[:dpi]).to eq 300
104
+ expect(config.options[:psm]).to eq 3
105
+ expect(config.options[:oem]).to eq 1
106
+ expect(config.options[:custom_option]).to eq 'value'
107
+ end
108
+
109
+ it 'stores all options with symbol keys' do
110
+ config = described_class.new(foo: 'bar', baz: 42)
111
+
112
+ expect(config.options.keys).to all be_a Symbol
113
+ expect(config.options[:foo]).to eq 'bar'
114
+ expect(config.options[:baz]).to eq 42
115
+ end
116
+ end
117
+
118
+ describe 'equality' do
119
+ it 'compares configs by options value' do
120
+ config1 = described_class.new(dpi: 300, psm: 3)
121
+ config2 = described_class.new(dpi: 300, psm: 3)
122
+
123
+ expect(config1.options).to eq config2.options
124
+ end
125
+
126
+ it 'detects differences in options' do
127
+ config1 = described_class.new(dpi: 300)
128
+ config2 = described_class.new(dpi: 600)
129
+
130
+ expect(config1.options).not_to eq config2.options
131
+ end
132
+ end
133
+
134
+ describe 'nested config integration' do
135
+ it 'can be nested in OCR config' do
136
+ tesseract = described_class.new(dpi: 300, psm: 3)
137
+ ocr = Kreuzberg::Config::OCR.new(tesseract_config: tesseract)
138
+
139
+ expect(ocr.tesseract_config).to be_a described_class
140
+ expect(ocr.tesseract_config.options[:dpi]).to eq 300
141
+ end
142
+
143
+ it 'accepts preprocessing nested in tesseract' do
144
+ preprocessing_data = { target_dpi: 600, denoise: true }
145
+ tesseract = described_class.new(preprocessing: preprocessing_data)
146
+
147
+ expect(tesseract.options[:preprocessing]).to be_a Kreuzberg::Config::ImagePreprocessing
148
+ expect(tesseract.options[:preprocessing].denoise).to be true
149
+ end
150
+ end
151
+
152
+ describe 'symbol vs string key handling' do
153
+ it 'normalizes all keys to symbols' do
154
+ config = described_class.new(
155
+ 'string_key' => 'value1',
156
+ symbol_key: 'value2'
157
+ )
158
+
159
+ expect(config.options.keys).to all be_a Symbol
160
+ expect(config.options[:string_key]).to eq 'value1'
161
+ expect(config.options[:symbol_key]).to eq 'value2'
162
+ end
163
+
164
+ it 'preserves string values while converting keys to symbols' do
165
+ config = described_class.new('test_key' => 'test_value')
166
+
167
+ expect(config.options[:test_key]).to eq 'test_value'
168
+ expect(config.options[:test_key]).to be_a String
169
+ end
170
+ end
171
+
172
+ describe 'immutability concerns' do
173
+ it 'stores options but does not freeze them by default' do
174
+ config = described_class.new(value: 'test')
175
+
176
+ # NOTE(review): the expectation below only checks that options responds to #merge;
177
+ # it does not assert frozen-ness either way, despite the example name — confirm intent
178
+ expect(config.options).to respond_to(:merge)
179
+ end
180
+ end
181
+ end