kreuzberg 4.3.5-aarch64-linux

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +14 -0
  3. data/.rspec +3 -0
  4. data/.rubocop.yaml +1 -0
  5. data/.rubocop.yml +543 -0
  6. data/Gemfile +8 -0
  7. data/Gemfile.lock +260 -0
  8. data/README.md +399 -0
  9. data/Rakefile +34 -0
  10. data/Steepfile +51 -0
  11. data/examples/async_patterns.rb +283 -0
  12. data/extconf.rb +60 -0
  13. data/kreuzberg.gemspec +253 -0
  14. data/lib/kreuzberg/api_proxy.rb +125 -0
  15. data/lib/kreuzberg/cache_api.rb +67 -0
  16. data/lib/kreuzberg/cli.rb +57 -0
  17. data/lib/kreuzberg/cli_proxy.rb +118 -0
  18. data/lib/kreuzberg/config.rb +1241 -0
  19. data/lib/kreuzberg/djot_content.rb +225 -0
  20. data/lib/kreuzberg/document_structure.rb +204 -0
  21. data/lib/kreuzberg/error_context.rb +136 -0
  22. data/lib/kreuzberg/errors.rb +116 -0
  23. data/lib/kreuzberg/extraction_api.rb +329 -0
  24. data/lib/kreuzberg/mcp_proxy.rb +176 -0
  25. data/lib/kreuzberg/ocr_backend_protocol.rb +40 -0
  26. data/lib/kreuzberg/post_processor_protocol.rb +15 -0
  27. data/lib/kreuzberg/result.rb +712 -0
  28. data/lib/kreuzberg/setup_lib_path.rb +99 -0
  29. data/lib/kreuzberg/types.rb +414 -0
  30. data/lib/kreuzberg/validator_protocol.rb +16 -0
  31. data/lib/kreuzberg/version.rb +5 -0
  32. data/lib/kreuzberg.rb +102 -0
  33. data/lib/kreuzberg_rb.so +0 -0
  34. data/lib/libpdfium.so +0 -0
  35. data/sig/kreuzberg/internal.rbs +184 -0
  36. data/sig/kreuzberg.rbs +1337 -0
  37. data/spec/binding/async_operations_spec.rb +473 -0
  38. data/spec/binding/batch_operations_spec.rb +677 -0
  39. data/spec/binding/batch_spec.rb +360 -0
  40. data/spec/binding/cache_spec.rb +227 -0
  41. data/spec/binding/cli_proxy_spec.rb +85 -0
  42. data/spec/binding/cli_spec.rb +55 -0
  43. data/spec/binding/config_result_spec.rb +377 -0
  44. data/spec/binding/config_spec.rb +419 -0
  45. data/spec/binding/config_validation_spec.rb +377 -0
  46. data/spec/binding/embeddings_spec.rb +816 -0
  47. data/spec/binding/error_handling_spec.rb +399 -0
  48. data/spec/binding/error_recovery_spec.rb +488 -0
  49. data/spec/binding/errors_spec.rb +66 -0
  50. data/spec/binding/font_config_spec.rb +220 -0
  51. data/spec/binding/images_spec.rb +732 -0
  52. data/spec/binding/keywords_extraction_spec.rb +600 -0
  53. data/spec/binding/metadata_types_spec.rb +1253 -0
  54. data/spec/binding/pages_extraction_spec.rb +550 -0
  55. data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
  56. data/spec/binding/plugins/postprocessor_spec.rb +269 -0
  57. data/spec/binding/plugins/validator_spec.rb +273 -0
  58. data/spec/binding/tables_spec.rb +650 -0
  59. data/spec/fixtures/config.toml +38 -0
  60. data/spec/fixtures/config.yaml +41 -0
  61. data/spec/fixtures/invalid_config.toml +3 -0
  62. data/spec/serialization_spec.rb +134 -0
  63. data/spec/smoke/package_spec.rb +177 -0
  64. data/spec/spec_helper.rb +40 -0
  65. data/spec/unit/config/chunking_config_spec.rb +213 -0
  66. data/spec/unit/config/embedding_config_spec.rb +343 -0
  67. data/spec/unit/config/extraction_config_spec.rb +434 -0
  68. data/spec/unit/config/font_config_spec.rb +285 -0
  69. data/spec/unit/config/hierarchy_config_spec.rb +314 -0
  70. data/spec/unit/config/image_extraction_config_spec.rb +209 -0
  71. data/spec/unit/config/image_preprocessing_config_spec.rb +230 -0
  72. data/spec/unit/config/keyword_config_spec.rb +229 -0
  73. data/spec/unit/config/language_detection_config_spec.rb +258 -0
  74. data/spec/unit/config/ocr_config_spec.rb +171 -0
  75. data/spec/unit/config/output_format_spec.rb +380 -0
  76. data/spec/unit/config/page_config_spec.rb +221 -0
  77. data/spec/unit/config/pdf_config_spec.rb +267 -0
  78. data/spec/unit/config/postprocessor_config_spec.rb +290 -0
  79. data/spec/unit/config/tesseract_config_spec.rb +181 -0
  80. data/spec/unit/config/token_reduction_config_spec.rb +251 -0
  81. data/test/metadata_types_test.rb +959 -0
  82. metadata +292 -0
@@ -0,0 +1,221 @@
1
+ # frozen_string_literal: true
2
+
3
# Unit specs for Kreuzberg::Config::PageConfig. They exercise constructor
# defaults and keyword arguments, the #to_h output, coercion of boolean and
# marker_format inputs, and use of the config as the `pages:` option of
# Kreuzberg::Config::Extraction.
RSpec.describe Kreuzberg::Config::PageConfig do
  # Marker template these specs expect when no marker_format is supplied.
  let(:default_marker) { "\n\n<!-- PAGE {page_num} -->\n\n" }

  describe '#initialize' do
    it 'creates config with default values' do
      expect(described_class.new).to have_attributes(
        extract_pages: false,
        insert_page_markers: false,
        marker_format: default_marker
      )
    end

    it 'creates config with custom values' do
      page_config = described_class.new(
        extract_pages: true,
        insert_page_markers: true,
        marker_format: '--- PAGE {page_num} ---'
      )

      expect(page_config).to have_attributes(
        extract_pages: true,
        insert_page_markers: true,
        marker_format: '--- PAGE {page_num} ---'
      )
    end

    it 'converts boolean values' do
      page_config = described_class.new(extract_pages: true, insert_page_markers: false)

      expect(page_config).to have_attributes(extract_pages: true, insert_page_markers: false)
    end

    it 'converts marker_format to string' do
      marker = described_class.new(marker_format: :default).marker_format

      expect(marker).to be_a String
    end
  end

  describe '#to_h' do
    it 'serializes to hash with all values' do
      serialized = described_class.new(extract_pages: true).to_h

      expect(serialized).to be_a Hash
      expect(serialized).to include(
        extract_pages: true,
        insert_page_markers: false,
        marker_format: default_marker
      )
    end

    it 'always includes all keys in hash' do
      key_names = described_class.new.to_h.keys

      expect(key_names).to match_array %i[extract_pages insert_page_markers marker_format]
    end
  end

  # Each example only checks that construction does not raise.
  describe 'validation' do
    it 'accepts boolean extract_pages' do
      expect { described_class.new(extract_pages: true) }.not_to raise_error
    end

    it 'accepts boolean insert_page_markers' do
      expect { described_class.new(insert_page_markers: true) }.not_to raise_error
    end

    it 'accepts custom marker formats' do
      expect { described_class.new(marker_format: '===== PAGE {page_num} =====') }.not_to raise_error
    end
  end

  describe 'keyword arguments' do
    it 'accepts all keyword arguments' do
      page_config = described_class.new(
        extract_pages: true,
        insert_page_markers: true,
        marker_format: 'Page: {page_num}'
      )

      expect(page_config).to have_attributes(
        extract_pages: true,
        insert_page_markers: true,
        marker_format: 'Page: {page_num}'
      )
    end
  end

  # Equality is checked attribute by attribute; the config objects themselves
  # are never compared with ==.
  describe 'equality' do
    it 'compares configs by value' do
      build = lambda do
        described_class.new(
          extract_pages: true,
          insert_page_markers: true,
          marker_format: '--- PAGE {page_num} ---'
        )
      end
      left = build.call
      right = build.call

      %i[extract_pages insert_page_markers marker_format].each do |reader|
        expect(left.public_send(reader)).to eq right.public_send(reader)
      end
    end

    it 'detects differences in extract_pages' do
      enabled, disabled = [true, false].map { |flag| described_class.new(extract_pages: flag) }

      expect(enabled.extract_pages).not_to eq disabled.extract_pages
    end

    it 'detects differences in marker_format' do
      first, second = ['Format A', 'Format B'].map { |text| described_class.new(marker_format: text) }

      expect(first.marker_format).not_to eq second.marker_format
    end
  end

  describe 'nested config integration' do
    it 'can be nested in Extraction config' do
      page_config = described_class.new(extract_pages: true)
      nested = Kreuzberg::Config::Extraction.new(pages: page_config).pages

      expect(nested).to be_a described_class
      expect(nested.extract_pages).to be true
    end

    it 'accepts hash in Extraction config' do
      nested = Kreuzberg::Config::Extraction.new(
        pages: { extract_pages: true, insert_page_markers: true }
      ).pages

      expect(nested).to be_a described_class
      expect(nested).to have_attributes(extract_pages: true, insert_page_markers: true)
    end
  end

  describe 'marker format' do
    it 'preserves custom marker format' do
      template = '=== PAGE {page_num} ==='

      expect(described_class.new(marker_format: template).marker_format).to eq template
    end

    it 'preserves default marker format' do
      expect(described_class.new.marker_format).to include '{page_num}'
    end

    it 'allows empty marker format' do
      expect(described_class.new(marker_format: '').marker_format).to eq ''
    end

    it 'handles multiline marker formats' do
      template = "\n--- PAGE {page_num} ---\n"

      expect(described_class.new(marker_format: template).marker_format).to eq template
    end
  end

  describe 'symbol vs string key handling' do
    it 'converts symbol values to strings' do
      marker = described_class.new(marker_format: :default_format).marker_format

      expect(marker).to be_a String
    end

    it 'preserves string marker format' do
      template = 'Custom Format'
      marker = described_class.new(marker_format: template).marker_format

      expect(marker).to eq(template).and be_a(String)
    end
  end

  # Non-boolean truthy inputs (1, 'yes') are expected to come back as true.
  describe 'boolean conversion' do
    it 'converts truthy extract_pages to true' do
      expect(described_class.new(extract_pages: 1).extract_pages).to be true
    end

    it 'converts false extract_pages to false' do
      expect(described_class.new(extract_pages: false).extract_pages).to be false
    end

    it 'converts truthy insert_page_markers to true' do
      expect(described_class.new(insert_page_markers: 'yes').insert_page_markers).to be true
    end

    it 'converts false insert_page_markers to false' do
      expect(described_class.new(insert_page_markers: false).insert_page_markers).to be false
    end
  end
end
@@ -0,0 +1,267 @@
1
+ # frozen_string_literal: true
2
+
3
# Unit specs for Kreuzberg::Config::PDF. They exercise constructor defaults,
# password normalisation, conversion of font_config / hierarchy hashes into
# their config classes (at construction and on assignment), the #to_h output,
# type validation, and use of the config as the `pdf_options:` option of
# Kreuzberg::Config::Extraction.
RSpec.describe Kreuzberg::Config::PDF do
  describe '#initialize' do
    it 'creates config with default values' do
      expect(described_class.new).to have_attributes(
        extract_images: false,
        passwords: be_nil,
        extract_metadata: true,
        font_config: be_nil,
        hierarchy: be_nil
      )
    end

    it 'creates config with custom values' do
      pdf = described_class.new(
        extract_images: true,
        passwords: %w[secret backup],
        extract_metadata: false
      )

      expect(pdf).to have_attributes(extract_images: true, extract_metadata: false)
      expect(pdf.passwords).to eq %w[secret backup]
    end

    it 'accepts passwords as single string' do
      stored = described_class.new(passwords: 'secret').passwords

      expect(stored).to eq(['secret']).and be_a(Array)
    end

    it 'accepts passwords as array of strings' do
      stored = described_class.new(passwords: %w[pwd1 pwd2 pwd3]).passwords

      expect(stored).to eq %w[pwd1 pwd2 pwd3]
    end

    it 'converts passwords to strings' do
      stored = described_class.new(passwords: [123, :symbol]).passwords

      expect(stored).to eq %w[123 symbol]
      expect(stored).to all(be_a(String))
    end

    it 'accepts font_config as instance' do
      fonts = Kreuzberg::Config::FontConfig.new(enabled: true)

      expect(described_class.new(font_config: fonts).font_config).to be_a Kreuzberg::Config::FontConfig
    end

    it 'converts font_config hash to instance' do
      fonts = described_class.new(font_config: { enabled: false }).font_config

      expect(fonts).to be_a Kreuzberg::Config::FontConfig
      expect(fonts.enabled).to be false
    end

    it 'accepts hierarchy as instance' do
      tree = Kreuzberg::Config::Hierarchy.new(enabled: true)

      expect(described_class.new(hierarchy: tree).hierarchy).to be_a Kreuzberg::Config::Hierarchy
    end

    it 'converts hierarchy hash to instance' do
      tree = described_class.new(hierarchy: { enabled: true, k_clusters: 8 }).hierarchy

      expect(tree).to be_a Kreuzberg::Config::Hierarchy
      expect(tree.enabled).to be true
    end
  end

  describe '#to_h' do
    it 'serializes to hash with default values' do
      serialized = described_class.new.to_h

      expect(serialized).to be_a Hash
      expect(serialized).to include(extract_images: false, extract_metadata: true)
    end

    it 'includes passwords in hash when present' do
      serialized = described_class.new(passwords: %w[secret backup]).to_h

      expect(serialized[:passwords]).to eq %w[secret backup]
    end

    it 'includes font_config in hash when present' do
      serialized = described_class.new(font_config: { enabled: true }).to_h

      expect(serialized[:font_config]).to be_a Hash
    end

    it 'includes hierarchy in hash when present' do
      serialized = described_class.new(hierarchy: { enabled: true }).to_h

      expect(serialized[:hierarchy]).to be_a Hash
    end

    it 'compacts nil values from hash' do
      serialized = described_class.new(extract_images: true).to_h

      # Unset optional fields must be absent, not present with a nil value.
      expect(serialized).not_to include(:passwords, :font_config)
    end
  end

  describe 'validation' do
    it 'rejects invalid font_config type' do
      expect { described_class.new(font_config: 'invalid') }
        .to raise_error(ArgumentError, /Expected.*FontConfig.*Hash.*nil/)
    end

    it 'rejects invalid hierarchy type' do
      expect { described_class.new(hierarchy: 'invalid') }
        .to raise_error(ArgumentError, /Expected.*Hierarchy.*Hash.*nil/)
    end

    it 'accepts valid boolean extract_images' do
      expect { described_class.new(extract_images: true) }.not_to raise_error
    end
  end

  describe 'keyword arguments' do
    it 'accepts all keyword arguments' do
      pdf = described_class.new(
        extract_images: true,
        passwords: %w[pwd1 pwd2],
        extract_metadata: false,
        font_config: { enabled: true },
        hierarchy: { k_clusters: 10 }
      )

      expect(pdf).to have_attributes(extract_images: true, extract_metadata: false)
      expect(pdf.passwords).to eq %w[pwd1 pwd2]
      expect(pdf.font_config).to be_a Kreuzberg::Config::FontConfig
      expect(pdf.hierarchy).to be_a Kreuzberg::Config::Hierarchy
    end
  end

  # Equality is checked attribute by attribute; the config objects themselves
  # are never compared with ==.
  describe 'equality' do
    it 'compares configs by value' do
      build = -> { described_class.new(extract_images: true, extract_metadata: false) }
      left = build.call
      right = build.call

      %i[extract_images extract_metadata].each do |reader|
        expect(left.public_send(reader)).to eq right.public_send(reader)
      end
    end

    it 'detects differences in extract_images' do
      enabled, disabled = [true, false].map { |flag| described_class.new(extract_images: flag) }

      expect(enabled.extract_images).not_to eq disabled.extract_images
    end

    it 'detects differences in passwords' do
      first, second = [%w[pwd1], %w[pwd2]].map { |list| described_class.new(passwords: list) }

      expect(first.passwords).not_to eq second.passwords
    end
  end

  describe 'nested config integration' do
    it 'can be nested in Extraction config' do
      pdf = described_class.new(extract_images: true)
      nested = Kreuzberg::Config::Extraction.new(pdf_options: pdf).pdf_options

      expect(nested).to be_a described_class
      expect(nested.extract_images).to be true
    end

    it 'accepts hash in Extraction config' do
      nested = Kreuzberg::Config::Extraction.new(
        pdf_options: { extract_images: true, passwords: ['secret'] }
      ).pdf_options

      expect(nested).to be_a described_class
      expect(nested.extract_images).to be true
      expect(nested.passwords).to eq ['secret']
    end
  end

  describe 'font_config assignment' do
    it 'allows setting font_config after initialization' do
      pdf = described_class.new
      pdf.font_config = Kreuzberg::Config::FontConfig.new(enabled: true)

      expect(pdf.font_config).to be_a Kreuzberg::Config::FontConfig
      expect(pdf.font_config.enabled).to be true
    end

    it 'converts hash to font_config instance on assignment' do
      pdf = described_class.new
      pdf.font_config = { enabled: false }

      expect(pdf.font_config).to be_a Kreuzberg::Config::FontConfig
      expect(pdf.font_config.enabled).to be false
    end
  end

  describe 'hierarchy assignment' do
    it 'allows setting hierarchy after initialization' do
      pdf = described_class.new
      pdf.hierarchy = Kreuzberg::Config::Hierarchy.new(enabled: true)

      expect(pdf.hierarchy).to be_a Kreuzberg::Config::Hierarchy
      expect(pdf.hierarchy.enabled).to be true
    end

    it 'converts hash to hierarchy instance on assignment' do
      pdf = described_class.new
      pdf.hierarchy = { enabled: true, k_clusters: 6 }

      expect(pdf.hierarchy).to be_a Kreuzberg::Config::Hierarchy
      expect(pdf.hierarchy.enabled).to be true
    end
  end

  # Non-boolean truthy inputs (1, 'yes') are expected to come back as true.
  describe 'boolean conversion' do
    it 'converts truthy extract_images to true' do
      expect(described_class.new(extract_images: 1).extract_images).to be true
    end

    it 'converts false extract_images to false' do
      expect(described_class.new(extract_images: false).extract_images).to be false
    end

    it 'converts truthy extract_metadata to true' do
      expect(described_class.new(extract_metadata: 'yes').extract_metadata).to be true
    end

    it 'converts false extract_metadata to false' do
      expect(described_class.new(extract_metadata: false).extract_metadata).to be false
    end
  end
end