kreuzberg 4.3.5-aarch64-linux

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82)
  1. checksums.yaml +7 -0
  2. data/.gitignore +14 -0
  3. data/.rspec +3 -0
  4. data/.rubocop.yaml +1 -0
  5. data/.rubocop.yml +543 -0
  6. data/Gemfile +8 -0
  7. data/Gemfile.lock +260 -0
  8. data/README.md +399 -0
  9. data/Rakefile +34 -0
  10. data/Steepfile +51 -0
  11. data/examples/async_patterns.rb +283 -0
  12. data/extconf.rb +60 -0
  13. data/kreuzberg.gemspec +253 -0
  14. data/lib/kreuzberg/api_proxy.rb +125 -0
  15. data/lib/kreuzberg/cache_api.rb +67 -0
  16. data/lib/kreuzberg/cli.rb +57 -0
  17. data/lib/kreuzberg/cli_proxy.rb +118 -0
  18. data/lib/kreuzberg/config.rb +1241 -0
  19. data/lib/kreuzberg/djot_content.rb +225 -0
  20. data/lib/kreuzberg/document_structure.rb +204 -0
  21. data/lib/kreuzberg/error_context.rb +136 -0
  22. data/lib/kreuzberg/errors.rb +116 -0
  23. data/lib/kreuzberg/extraction_api.rb +329 -0
  24. data/lib/kreuzberg/mcp_proxy.rb +176 -0
  25. data/lib/kreuzberg/ocr_backend_protocol.rb +40 -0
  26. data/lib/kreuzberg/post_processor_protocol.rb +15 -0
  27. data/lib/kreuzberg/result.rb +712 -0
  28. data/lib/kreuzberg/setup_lib_path.rb +99 -0
  29. data/lib/kreuzberg/types.rb +414 -0
  30. data/lib/kreuzberg/validator_protocol.rb +16 -0
  31. data/lib/kreuzberg/version.rb +5 -0
  32. data/lib/kreuzberg.rb +102 -0
  33. data/lib/kreuzberg_rb.so +0 -0
  34. data/lib/libpdfium.so +0 -0
  35. data/sig/kreuzberg/internal.rbs +184 -0
  36. data/sig/kreuzberg.rbs +1337 -0
  37. data/spec/binding/async_operations_spec.rb +473 -0
  38. data/spec/binding/batch_operations_spec.rb +677 -0
  39. data/spec/binding/batch_spec.rb +360 -0
  40. data/spec/binding/cache_spec.rb +227 -0
  41. data/spec/binding/cli_proxy_spec.rb +85 -0
  42. data/spec/binding/cli_spec.rb +55 -0
  43. data/spec/binding/config_result_spec.rb +377 -0
  44. data/spec/binding/config_spec.rb +419 -0
  45. data/spec/binding/config_validation_spec.rb +377 -0
  46. data/spec/binding/embeddings_spec.rb +816 -0
  47. data/spec/binding/error_handling_spec.rb +399 -0
  48. data/spec/binding/error_recovery_spec.rb +488 -0
  49. data/spec/binding/errors_spec.rb +66 -0
  50. data/spec/binding/font_config_spec.rb +220 -0
  51. data/spec/binding/images_spec.rb +732 -0
  52. data/spec/binding/keywords_extraction_spec.rb +600 -0
  53. data/spec/binding/metadata_types_spec.rb +1253 -0
  54. data/spec/binding/pages_extraction_spec.rb +550 -0
  55. data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
  56. data/spec/binding/plugins/postprocessor_spec.rb +269 -0
  57. data/spec/binding/plugins/validator_spec.rb +273 -0
  58. data/spec/binding/tables_spec.rb +650 -0
  59. data/spec/fixtures/config.toml +38 -0
  60. data/spec/fixtures/config.yaml +41 -0
  61. data/spec/fixtures/invalid_config.toml +3 -0
  62. data/spec/serialization_spec.rb +134 -0
  63. data/spec/smoke/package_spec.rb +177 -0
  64. data/spec/spec_helper.rb +40 -0
  65. data/spec/unit/config/chunking_config_spec.rb +213 -0
  66. data/spec/unit/config/embedding_config_spec.rb +343 -0
  67. data/spec/unit/config/extraction_config_spec.rb +434 -0
  68. data/spec/unit/config/font_config_spec.rb +285 -0
  69. data/spec/unit/config/hierarchy_config_spec.rb +314 -0
  70. data/spec/unit/config/image_extraction_config_spec.rb +209 -0
  71. data/spec/unit/config/image_preprocessing_config_spec.rb +230 -0
  72. data/spec/unit/config/keyword_config_spec.rb +229 -0
  73. data/spec/unit/config/language_detection_config_spec.rb +258 -0
  74. data/spec/unit/config/ocr_config_spec.rb +171 -0
  75. data/spec/unit/config/output_format_spec.rb +380 -0
  76. data/spec/unit/config/page_config_spec.rb +221 -0
  77. data/spec/unit/config/pdf_config_spec.rb +267 -0
  78. data/spec/unit/config/postprocessor_config_spec.rb +290 -0
  79. data/spec/unit/config/tesseract_config_spec.rb +181 -0
  80. data/spec/unit/config/token_reduction_config_spec.rb +251 -0
  81. data/test/metadata_types_test.rb +959 -0
  82. metadata +292 -0
@@ -0,0 +1,314 @@
1
+ # frozen_string_literal: true
2
+
3
+ RSpec.describe Kreuzberg::Config::Hierarchy do
4
+ describe '#initialize' do
5
+ it 'creates config with default values' do
6
+ config = described_class.new
7
+
8
+ expect(config.enabled).to be true
9
+ expect(config.k_clusters).to eq 6
10
+ expect(config.include_bbox).to be true
11
+ expect(config.ocr_coverage_threshold).to be_nil
12
+ end
13
+
14
+ it 'creates config with custom values' do
15
+ config = described_class.new(
16
+ enabled: false,
17
+ k_clusters: 10,
18
+ include_bbox: false,
19
+ ocr_coverage_threshold: 0.95
20
+ )
21
+
22
+ expect(config.enabled).to be false
23
+ expect(config.k_clusters).to eq 10
24
+ expect(config.include_bbox).to be false
25
+ expect(config.ocr_coverage_threshold).to eq 0.95
26
+ end
27
+
28
+ it 'converts k_clusters to integer' do
29
+ config = described_class.new(k_clusters: '8')
30
+
31
+ expect(config.k_clusters).to eq 8
32
+ expect(config.k_clusters).to be_a Integer
33
+ end
34
+
35
+ it 'converts enabled to boolean' do
36
+ config = described_class.new(enabled: 1)
37
+
38
+ expect(config.enabled).to be true
39
+ end
40
+
41
+ it 'converts include_bbox to boolean' do
42
+ config = described_class.new(include_bbox: false)
43
+
44
+ expect(config.include_bbox).to be false
45
+ end
46
+
47
+ it 'converts ocr_coverage_threshold to float' do
48
+ config = described_class.new(ocr_coverage_threshold: '0.85')
49
+
50
+ expect(config.ocr_coverage_threshold).to eq 0.85
51
+ expect(config.ocr_coverage_threshold).to be_a Float
52
+ end
53
+ end
54
+
55
+ describe '#to_h' do
56
+ it 'serializes to hash with all values' do
57
+ config = described_class.new(
58
+ enabled: true,
59
+ k_clusters: 8,
60
+ include_bbox: true
61
+ )
62
+ hash = config.to_h
63
+
64
+ expect(hash).to be_a Hash
65
+ expect(hash[:enabled]).to be true
66
+ expect(hash[:k_clusters]).to eq 8
67
+ expect(hash[:include_bbox]).to be true
68
+ end
69
+
70
+ it 'includes ocr_coverage_threshold when present' do
71
+ config = described_class.new(ocr_coverage_threshold: 0.9)
72
+ hash = config.to_h
73
+
74
+ expect(hash[:ocr_coverage_threshold]).to eq 0.9
75
+ end
76
+
77
+ it 'compacts nil values from hash' do
78
+ config = described_class.new(enabled: true)
79
+ hash = config.to_h
80
+
81
+ expect(hash.key?(:ocr_coverage_threshold)).to be false
82
+ end
83
+ end
84
+
85
+ describe '.from_h' do
86
+ it 'creates from hash' do
87
+ hash = { enabled: true, k_clusters: 8 }
88
+ config = described_class.from_h(hash)
89
+
90
+ expect(config).to be_a described_class
91
+ expect(config.enabled).to be true
92
+ expect(config.k_clusters).to eq 8
93
+ end
94
+
95
+ it 'returns nil for nil input' do
96
+ config = described_class.from_h(nil)
97
+
98
+ expect(config).to be_nil
99
+ end
100
+
101
+ it 'returns instance as-is' do
102
+ original = described_class.new(k_clusters: 10)
103
+ config = described_class.from_h(original)
104
+
105
+ expect(config).to be original
106
+ end
107
+
108
+ it 'converts symbol keys in hash' do
109
+ hash = { 'enabled' => true, 'k_clusters' => 8 }
110
+ config = described_class.from_h(hash)
111
+
112
+ expect(config.enabled).to be true
113
+ expect(config.k_clusters).to eq 8
114
+ end
115
+ end
116
+
117
+ describe 'validation' do
118
+ it 'accepts valid k_clusters' do
119
+ expect do
120
+ described_class.new(k_clusters: 5)
121
+ end.not_to raise_error
122
+ end
123
+
124
+ it 'accepts valid ocr_coverage_threshold' do
125
+ expect do
126
+ described_class.new(ocr_coverage_threshold: 0.8)
127
+ end.not_to raise_error
128
+ end
129
+
130
+ it 'accepts enabled true' do
131
+ expect do
132
+ described_class.new(enabled: true)
133
+ end.not_to raise_error
134
+ end
135
+ end
136
+
137
+ describe 'keyword arguments' do
138
+ it 'accepts all keyword arguments' do
139
+ config = described_class.new(
140
+ enabled: false,
141
+ k_clusters: 12,
142
+ include_bbox: false,
143
+ ocr_coverage_threshold: 0.75
144
+ )
145
+
146
+ expect(config.enabled).to be false
147
+ expect(config.k_clusters).to eq 12
148
+ expect(config.include_bbox).to be false
149
+ expect(config.ocr_coverage_threshold).to eq 0.75
150
+ end
151
+ end
152
+
153
+ describe 'equality' do
154
+ it 'compares configs by value' do
155
+ config1 = described_class.new(
156
+ enabled: true,
157
+ k_clusters: 8
158
+ )
159
+ config2 = described_class.new(
160
+ enabled: true,
161
+ k_clusters: 8
162
+ )
163
+
164
+ expect(config1.enabled).to eq config2.enabled
165
+ expect(config1.k_clusters).to eq config2.k_clusters
166
+ end
167
+
168
+ it 'detects differences in enabled' do
169
+ config1 = described_class.new(enabled: true)
170
+ config2 = described_class.new(enabled: false)
171
+
172
+ expect(config1.enabled).not_to eq config2.enabled
173
+ end
174
+
175
+ it 'detects differences in k_clusters' do
176
+ config1 = described_class.new(k_clusters: 6)
177
+ config2 = described_class.new(k_clusters: 10)
178
+
179
+ expect(config1.k_clusters).not_to eq config2.k_clusters
180
+ end
181
+
182
+ it 'detects differences in ocr_coverage_threshold' do
183
+ config1 = described_class.new(ocr_coverage_threshold: 0.8)
184
+ config2 = described_class.new(ocr_coverage_threshold: 0.9)
185
+
186
+ expect(config1.ocr_coverage_threshold).not_to eq config2.ocr_coverage_threshold
187
+ end
188
+ end
189
+
190
+ describe 'nested config integration' do
191
+ it 'can be nested in PDF config' do
192
+ hierarchy = described_class.new(k_clusters: 8, enabled: true)
193
+ pdf = Kreuzberg::Config::PDF.new(hierarchy: hierarchy)
194
+
195
+ expect(pdf.hierarchy).to be_a described_class
196
+ expect(pdf.hierarchy.k_clusters).to eq 8
197
+ expect(pdf.hierarchy.enabled).to be true
198
+ end
199
+
200
+ it 'accepts hash in PDF config' do
201
+ pdf = Kreuzberg::Config::PDF.new(
202
+ hierarchy: { enabled: true, k_clusters: 10 }
203
+ )
204
+
205
+ expect(pdf.hierarchy).to be_a described_class
206
+ expect(pdf.hierarchy.enabled).to be true
207
+ expect(pdf.hierarchy.k_clusters).to eq 10
208
+ end
209
+
210
+ it 'can be nested in Extraction config via PDF' do
211
+ extraction = Kreuzberg::Config::Extraction.new(
212
+ pdf_options: { hierarchy: { k_clusters: 8 } }
213
+ )
214
+
215
+ expect(extraction.pdf_options.hierarchy).to be_a described_class
216
+ expect(extraction.pdf_options.hierarchy.k_clusters).to eq 8
217
+ end
218
+ end
219
+
220
+ describe 'symbol vs string key handling' do
221
+ it 'converts symbol enabled to boolean' do
222
+ config = described_class.new(enabled: true)
223
+
224
+ expect(config.enabled).to be true
225
+ end
226
+
227
+ it 'converts k_clusters string to integer' do
228
+ config = described_class.new(k_clusters: '12')
229
+
230
+ expect(config.k_clusters).to eq 12
231
+ expect(config.k_clusters).to be_a Integer
232
+ end
233
+ end
234
+
235
+ describe 'boolean conversion' do
236
+ it 'converts truthy enabled to true' do
237
+ config = described_class.new(enabled: 1)
238
+
239
+ expect(config.enabled).to be true
240
+ end
241
+
242
+ it 'converts false enabled to false' do
243
+ config = described_class.new(enabled: false)
244
+
245
+ expect(config.enabled).to be false
246
+ end
247
+
248
+ it 'converts truthy include_bbox to true' do
249
+ config = described_class.new(include_bbox: 'yes')
250
+
251
+ expect(config.include_bbox).to be true
252
+ end
253
+
254
+ it 'converts false include_bbox to false' do
255
+ config = described_class.new(include_bbox: false)
256
+
257
+ expect(config.include_bbox).to be false
258
+ end
259
+ end
260
+
261
+ describe 'k_clusters parameter' do
262
+ it 'accepts small k_clusters' do
263
+ config = described_class.new(k_clusters: 3)
264
+
265
+ expect(config.k_clusters).to eq 3
266
+ end
267
+
268
+ it 'accepts large k_clusters' do
269
+ config = described_class.new(k_clusters: 20)
270
+
271
+ expect(config.k_clusters).to eq 20
272
+ end
273
+
274
+ it 'defaults to 6 clusters' do
275
+ config = described_class.new
276
+
277
+ expect(config.k_clusters).to eq 6
278
+ end
279
+
280
+ it 'converts string k_clusters to integer' do
281
+ config = described_class.new(k_clusters: '15')
282
+
283
+ expect(config.k_clusters).to eq 15
284
+ expect(config.k_clusters).to be_a Integer
285
+ end
286
+ end
287
+
288
+ describe 'ocr_coverage_threshold' do
289
+ it 'accepts high threshold values' do
290
+ config = described_class.new(ocr_coverage_threshold: 0.95)
291
+
292
+ expect(config.ocr_coverage_threshold).to eq 0.95
293
+ end
294
+
295
+ it 'accepts low threshold values' do
296
+ config = described_class.new(ocr_coverage_threshold: 0.1)
297
+
298
+ expect(config.ocr_coverage_threshold).to eq 0.1
299
+ end
300
+
301
+ it 'accepts nil for threshold' do
302
+ config = described_class.new(ocr_coverage_threshold: nil)
303
+
304
+ expect(config.ocr_coverage_threshold).to be_nil
305
+ end
306
+
307
+ it 'converts string threshold to float' do
308
+ config = described_class.new(ocr_coverage_threshold: '0.85')
309
+
310
+ expect(config.ocr_coverage_threshold).to eq 0.85
311
+ expect(config.ocr_coverage_threshold).to be_a Float
312
+ end
313
+ end
314
+ end
@@ -0,0 +1,209 @@
1
+ # frozen_string_literal: true
2
+
3
+ RSpec.describe Kreuzberg::Config::ImageExtraction do
4
+ describe '#initialize' do
5
+ it 'creates config with default values' do
6
+ config = described_class.new
7
+
8
+ expect(config.extract_images).to be true
9
+ expect(config.target_dpi).to eq 300
10
+ expect(config.max_image_dimension).to eq 2000
11
+ expect(config.auto_adjust_dpi).to be true
12
+ expect(config.min_dpi).to eq 150
13
+ expect(config.max_dpi).to eq 600
14
+ end
15
+
16
+ it 'creates config with custom values' do
17
+ config = described_class.new(
18
+ extract_images: false,
19
+ target_dpi: 600,
20
+ max_image_dimension: 4000,
21
+ auto_adjust_dpi: false,
22
+ min_dpi: 100,
23
+ max_dpi: 1200
24
+ )
25
+
26
+ expect(config.extract_images).to be false
27
+ expect(config.target_dpi).to eq 600
28
+ expect(config.max_image_dimension).to eq 4000
29
+ expect(config.auto_adjust_dpi).to be false
30
+ expect(config.min_dpi).to eq 100
31
+ expect(config.max_dpi).to eq 1200
32
+ end
33
+
34
+ it 'converts values to integers' do
35
+ config = described_class.new(
36
+ target_dpi: '300',
37
+ max_image_dimension: '2000',
38
+ min_dpi: '150',
39
+ max_dpi: '600'
40
+ )
41
+
42
+ expect(config.target_dpi).to eq 300
43
+ expect(config.max_image_dimension).to eq 2000
44
+ expect(config.min_dpi).to eq 150
45
+ expect(config.max_dpi).to eq 600
46
+ expect(config.target_dpi).to be_a Integer
47
+ end
48
+
49
+ it 'converts boolean values correctly' do
50
+ config = described_class.new(
51
+ extract_images: true,
52
+ auto_adjust_dpi: false
53
+ )
54
+
55
+ expect(config.extract_images).to be true
56
+ expect(config.auto_adjust_dpi).to be false
57
+ end
58
+ end
59
+
60
+ describe '#to_h' do
61
+ it 'serializes to hash with all values' do
62
+ config = described_class.new(
63
+ target_dpi: 300,
64
+ max_image_dimension: 2000
65
+ )
66
+ hash = config.to_h
67
+
68
+ expect(hash).to be_a Hash
69
+ expect(hash[:extract_images]).to be true
70
+ expect(hash[:target_dpi]).to eq 300
71
+ expect(hash[:max_image_dimension]).to eq 2000
72
+ expect(hash[:auto_adjust_dpi]).to be true
73
+ expect(hash[:min_dpi]).to eq 150
74
+ expect(hash[:max_dpi]).to eq 600
75
+ end
76
+
77
+ it 'always includes all keys in hash' do
78
+ config = described_class.new
79
+ hash = config.to_h
80
+
81
+ expect(hash.keys).to contain_exactly(
82
+ :extract_images,
83
+ :target_dpi,
84
+ :max_image_dimension,
85
+ :auto_adjust_dpi,
86
+ :min_dpi,
87
+ :max_dpi
88
+ )
89
+ end
90
+ end
91
+
92
+ describe 'validation' do
93
+ it 'accepts valid DPI values' do
94
+ expect do
95
+ described_class.new(target_dpi: 300, min_dpi: 150, max_dpi: 600)
96
+ end.not_to raise_error
97
+ end
98
+
99
+ it 'accepts valid image dimensions' do
100
+ expect do
101
+ described_class.new(max_image_dimension: 4000)
102
+ end.not_to raise_error
103
+ end
104
+
105
+ it 'converts float DPI to integer' do
106
+ config = described_class.new(target_dpi: 300.5)
107
+
108
+ expect(config.target_dpi).to eq 300
109
+ expect(config.target_dpi).to be_a Integer
110
+ end
111
+ end
112
+
113
+ describe 'keyword arguments' do
114
+ it 'accepts all keyword arguments' do
115
+ config = described_class.new(
116
+ extract_images: true,
117
+ target_dpi: 600,
118
+ max_image_dimension: 3000,
119
+ auto_adjust_dpi: true,
120
+ min_dpi: 200,
121
+ max_dpi: 800
122
+ )
123
+
124
+ expect(config.extract_images).to be true
125
+ expect(config.target_dpi).to eq 600
126
+ expect(config.max_image_dimension).to eq 3000
127
+ expect(config.auto_adjust_dpi).to be true
128
+ expect(config.min_dpi).to eq 200
129
+ expect(config.max_dpi).to eq 800
130
+ end
131
+ end
132
+
133
+ describe 'equality' do
134
+ it 'compares configs by value' do
135
+ config1 = described_class.new(target_dpi: 300, max_image_dimension: 2000)
136
+ config2 = described_class.new(target_dpi: 300, max_image_dimension: 2000)
137
+
138
+ expect(config1.target_dpi).to eq config2.target_dpi
139
+ expect(config1.max_image_dimension).to eq config2.max_image_dimension
140
+ end
141
+
142
+ it 'detects differences in DPI' do
143
+ config1 = described_class.new(target_dpi: 300)
144
+ config2 = described_class.new(target_dpi: 600)
145
+
146
+ expect(config1.target_dpi).not_to eq config2.target_dpi
147
+ end
148
+
149
+ it 'detects differences in extract_images' do
150
+ config1 = described_class.new(extract_images: true)
151
+ config2 = described_class.new(extract_images: false)
152
+
153
+ expect(config1.extract_images).not_to eq config2.extract_images
154
+ end
155
+ end
156
+
157
+ describe 'nested config integration' do
158
+ it 'can be nested in Extraction config' do
159
+ image_config = described_class.new(target_dpi: 600)
160
+ extraction = Kreuzberg::Config::Extraction.new(image_extraction: image_config)
161
+
162
+ expect(extraction.image_extraction).to be_a described_class
163
+ expect(extraction.image_extraction.target_dpi).to eq 600
164
+ end
165
+
166
+ it 'accepts hash in Extraction config' do
167
+ extraction = Kreuzberg::Config::Extraction.new(
168
+ image_extraction: { target_dpi: 600, extract_images: true }
169
+ )
170
+
171
+ expect(extraction.image_extraction).to be_a described_class
172
+ expect(extraction.image_extraction.target_dpi).to eq 600
173
+ end
174
+ end
175
+
176
+ describe 'DPI range' do
177
+ it 'allows realistic DPI values' do
178
+ config = described_class.new(min_dpi: 150, max_dpi: 1200)
179
+
180
+ expect(config.min_dpi).to eq 150
181
+ expect(config.max_dpi).to eq 1200
182
+ end
183
+
184
+ it 'maintains DPI relationships' do
185
+ config = described_class.new(
186
+ target_dpi: 300,
187
+ min_dpi: 100,
188
+ max_dpi: 600
189
+ )
190
+
191
+ expect(config.min_dpi).to be <= config.target_dpi
192
+ expect(config.target_dpi).to be <= config.max_dpi
193
+ end
194
+ end
195
+
196
+ describe 'image dimension constraints' do
197
+ it 'accepts large image dimensions' do
198
+ config = described_class.new(max_image_dimension: 10_000)
199
+
200
+ expect(config.max_image_dimension).to eq 10_000
201
+ end
202
+
203
+ it 'accepts small image dimensions' do
204
+ config = described_class.new(max_image_dimension: 100)
205
+
206
+ expect(config.max_image_dimension).to eq 100
207
+ end
208
+ end
209
+ end