kreuzberg 4.3.5-aarch64-linux

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +14 -0
  3. data/.rspec +3 -0
  4. data/.rubocop.yaml +1 -0
  5. data/.rubocop.yml +543 -0
  6. data/Gemfile +8 -0
  7. data/Gemfile.lock +260 -0
  8. data/README.md +399 -0
  9. data/Rakefile +34 -0
  10. data/Steepfile +51 -0
  11. data/examples/async_patterns.rb +283 -0
  12. data/extconf.rb +60 -0
  13. data/kreuzberg.gemspec +253 -0
  14. data/lib/kreuzberg/api_proxy.rb +125 -0
  15. data/lib/kreuzberg/cache_api.rb +67 -0
  16. data/lib/kreuzberg/cli.rb +57 -0
  17. data/lib/kreuzberg/cli_proxy.rb +118 -0
  18. data/lib/kreuzberg/config.rb +1241 -0
  19. data/lib/kreuzberg/djot_content.rb +225 -0
  20. data/lib/kreuzberg/document_structure.rb +204 -0
  21. data/lib/kreuzberg/error_context.rb +136 -0
  22. data/lib/kreuzberg/errors.rb +116 -0
  23. data/lib/kreuzberg/extraction_api.rb +329 -0
  24. data/lib/kreuzberg/mcp_proxy.rb +176 -0
  25. data/lib/kreuzberg/ocr_backend_protocol.rb +40 -0
  26. data/lib/kreuzberg/post_processor_protocol.rb +15 -0
  27. data/lib/kreuzberg/result.rb +712 -0
  28. data/lib/kreuzberg/setup_lib_path.rb +99 -0
  29. data/lib/kreuzberg/types.rb +414 -0
  30. data/lib/kreuzberg/validator_protocol.rb +16 -0
  31. data/lib/kreuzberg/version.rb +5 -0
  32. data/lib/kreuzberg.rb +102 -0
  33. data/lib/kreuzberg_rb.so +0 -0
  34. data/lib/libpdfium.so +0 -0
  35. data/sig/kreuzberg/internal.rbs +184 -0
  36. data/sig/kreuzberg.rbs +1337 -0
  37. data/spec/binding/async_operations_spec.rb +473 -0
  38. data/spec/binding/batch_operations_spec.rb +677 -0
  39. data/spec/binding/batch_spec.rb +360 -0
  40. data/spec/binding/cache_spec.rb +227 -0
  41. data/spec/binding/cli_proxy_spec.rb +85 -0
  42. data/spec/binding/cli_spec.rb +55 -0
  43. data/spec/binding/config_result_spec.rb +377 -0
  44. data/spec/binding/config_spec.rb +419 -0
  45. data/spec/binding/config_validation_spec.rb +377 -0
  46. data/spec/binding/embeddings_spec.rb +816 -0
  47. data/spec/binding/error_handling_spec.rb +399 -0
  48. data/spec/binding/error_recovery_spec.rb +488 -0
  49. data/spec/binding/errors_spec.rb +66 -0
  50. data/spec/binding/font_config_spec.rb +220 -0
  51. data/spec/binding/images_spec.rb +732 -0
  52. data/spec/binding/keywords_extraction_spec.rb +600 -0
  53. data/spec/binding/metadata_types_spec.rb +1253 -0
  54. data/spec/binding/pages_extraction_spec.rb +550 -0
  55. data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
  56. data/spec/binding/plugins/postprocessor_spec.rb +269 -0
  57. data/spec/binding/plugins/validator_spec.rb +273 -0
  58. data/spec/binding/tables_spec.rb +650 -0
  59. data/spec/fixtures/config.toml +38 -0
  60. data/spec/fixtures/config.yaml +41 -0
  61. data/spec/fixtures/invalid_config.toml +3 -0
  62. data/spec/serialization_spec.rb +134 -0
  63. data/spec/smoke/package_spec.rb +177 -0
  64. data/spec/spec_helper.rb +40 -0
  65. data/spec/unit/config/chunking_config_spec.rb +213 -0
  66. data/spec/unit/config/embedding_config_spec.rb +343 -0
  67. data/spec/unit/config/extraction_config_spec.rb +434 -0
  68. data/spec/unit/config/font_config_spec.rb +285 -0
  69. data/spec/unit/config/hierarchy_config_spec.rb +314 -0
  70. data/spec/unit/config/image_extraction_config_spec.rb +209 -0
  71. data/spec/unit/config/image_preprocessing_config_spec.rb +230 -0
  72. data/spec/unit/config/keyword_config_spec.rb +229 -0
  73. data/spec/unit/config/language_detection_config_spec.rb +258 -0
  74. data/spec/unit/config/ocr_config_spec.rb +171 -0
  75. data/spec/unit/config/output_format_spec.rb +380 -0
  76. data/spec/unit/config/page_config_spec.rb +221 -0
  77. data/spec/unit/config/pdf_config_spec.rb +267 -0
  78. data/spec/unit/config/postprocessor_config_spec.rb +290 -0
  79. data/spec/unit/config/tesseract_config_spec.rb +181 -0
  80. data/spec/unit/config/token_reduction_config_spec.rb +251 -0
  81. data/test/metadata_types_test.rb +959 -0
  82. metadata +292 -0
@@ -0,0 +1,307 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'spec_helper'
4
+
5
+ RSpec.describe 'OCR Backend Plugin System' do
6
+ let(:test_image) { test_document_path('images/invoice_image.png') }
7
+
8
+ describe 'registering custom OCR backend' do
9
+ it 'registers and uses custom OCR backend class' do
10
+ class MockOcrBackend
11
+ include Kreuzberg::OcrBackendProtocol
12
+
13
+ attr_reader :process_called, :received_config
14
+
15
+ def initialize
16
+ @process_called = false
17
+ @received_config = nil
18
+ end
19
+
20
+ def name
21
+ 'mock-ocr'
22
+ end
23
+
24
+ def process_image(_image_bytes, config)
25
+ @process_called = true
26
+ @received_config = config
27
+ 'Mocked OCR text from custom backend'
28
+ end
29
+ end
30
+
31
+ backend = MockOcrBackend.new
32
+ Kreuzberg.register_ocr_backend('mock-ocr', backend)
33
+
34
+ config = Kreuzberg::Config::Extraction.new(
35
+ force_ocr: true,
36
+ ocr: Kreuzberg::Config::OCR.new(backend: 'mock-ocr')
37
+ )
38
+
39
+ result = Kreuzberg.extract_file_sync(path: test_image, config: config)
40
+
41
+ expect(backend.process_called).to be true
42
+ expect(result.content).to include('Mocked OCR text')
43
+ end
44
+
45
+ it 'passes correct configuration to OCR backend' do
46
+ class ConfigCapturingBackend
47
+ include Kreuzberg::OcrBackendProtocol
48
+
49
+ attr_reader :received_config
50
+
51
+ def name
52
+ 'config-capture'
53
+ end
54
+
55
+ def process_image(_image_bytes, config)
56
+ @received_config = config
57
+ 'OCR result'
58
+ end
59
+ end
60
+
61
+ backend = ConfigCapturingBackend.new
62
+ Kreuzberg.register_ocr_backend('config-capture', backend)
63
+
64
+ config = Kreuzberg::Config::Extraction.new(
65
+ force_ocr: true,
66
+ ocr: Kreuzberg::Config::OCR.new(
67
+ backend: 'config-capture',
68
+ language: 'eng'
69
+ )
70
+ )
71
+
72
+ Kreuzberg.extract_file_sync(path: test_image, config: config)
73
+
74
+ expect(backend.received_config).to be_a(Hash)
75
+ expect(backend.received_config['backend']).to eq('config-capture')
76
+ expect(backend.received_config['language']).to eq('eng')
77
+ end
78
+ end
79
+
80
+ describe 'OCR backend receives correct parameters' do
81
+ it 'receives image bytes as binary data' do
82
+ class BytesCapturingBackend
83
+ include Kreuzberg::OcrBackendProtocol
84
+
85
+ attr_accessor :received_bytes
86
+
87
+ def name
88
+ 'bytes-capture'
89
+ end
90
+
91
+ def process_image(image_bytes, _config)
92
+ self.class.instance_variable_set(:@received_bytes, image_bytes)
93
+ 'OCR result'
94
+ end
95
+ end
96
+
97
+ backend = BytesCapturingBackend.new
98
+ Kreuzberg.register_ocr_backend('bytes-capture', backend)
99
+
100
+ config = Kreuzberg::Config::Extraction.new(
101
+ force_ocr: true,
102
+ ocr: Kreuzberg::Config::OCR.new(backend: 'bytes-capture')
103
+ )
104
+
105
+ Kreuzberg.extract_file_sync(path: test_image, config: config)
106
+
107
+ received_bytes = BytesCapturingBackend.instance_variable_get(:@received_bytes)
108
+ expect(received_bytes).to be_a(String)
109
+ expect(received_bytes.encoding).to eq(Encoding::BINARY)
110
+ expect(received_bytes.length).to be_positive
111
+ end
112
+
113
+ it 'backend can return extracted text' do
114
+ class SimpleOcrBackend
115
+ include Kreuzberg::OcrBackendProtocol
116
+
117
+ def name
118
+ 'simple-ocr'
119
+ end
120
+
121
+ def process_image(_image_bytes, _config)
122
+ 'Invoice Total: $1,234.56'
123
+ end
124
+ end
125
+
126
+ backend = SimpleOcrBackend.new
127
+ Kreuzberg.register_ocr_backend('simple-ocr', backend)
128
+
129
+ config = Kreuzberg::Config::Extraction.new(
130
+ force_ocr: true,
131
+ ocr: Kreuzberg::Config::OCR.new(backend: 'simple-ocr')
132
+ )
133
+
134
+ result = Kreuzberg.extract_file_sync(path: test_image, config: config)
135
+
136
+ expect(result.content).to include('Invoice Total')
137
+ expect(result.content).to include('1,234.56')
138
+ end
139
+ end
140
+
141
+ describe 'OCR backend with stateful processing' do
142
+ it 'maintains state across multiple invocations' do
143
+ class StatefulOcrBackend
144
+ include Kreuzberg::OcrBackendProtocol
145
+
146
+ attr_reader :call_count
147
+
148
+ def initialize
149
+ @call_count = 0
150
+ end
151
+
152
+ def name
153
+ 'stateful-ocr'
154
+ end
155
+
156
+ def process_image(_image_bytes, _config)
157
+ @call_count += 1
158
+ "OCR call number #{@call_count}"
159
+ end
160
+ end
161
+
162
+ backend = StatefulOcrBackend.new
163
+ Kreuzberg.register_ocr_backend('stateful-ocr', backend)
164
+
165
+ config = Kreuzberg::Config::Extraction.new(
166
+ force_ocr: true,
167
+ ocr: Kreuzberg::Config::OCR.new(backend: 'stateful-ocr')
168
+ )
169
+
170
+ Kreuzberg.extract_file_sync(path: test_image, config: config)
171
+ Kreuzberg.extract_file_sync(path: test_image, config: config)
172
+
173
+ expect(backend.call_count).to be >= 1
174
+ end
175
+ end
176
+
177
+ describe 'error handling' do
178
+ it 'propagates errors from OCR backend' do
179
+ class FailingOcrBackend
180
+ include Kreuzberg::OcrBackendProtocol
181
+
182
+ def name
183
+ 'failing-ocr'
184
+ end
185
+
186
+ def process_image(_image_bytes, _config)
187
+ raise StandardError, 'OCR processing failed'
188
+ end
189
+ end
190
+
191
+ backend = FailingOcrBackend.new
192
+ Kreuzberg.register_ocr_backend('failing-ocr', backend)
193
+
194
+ config = Kreuzberg::Config::Extraction.new(
195
+ force_ocr: true,
196
+ ocr: Kreuzberg::Config::OCR.new(backend: 'failing-ocr')
197
+ )
198
+
199
+ expect do
200
+ Kreuzberg.extract_file_sync(path: test_image, config: config)
201
+ end.to raise_error(StandardError, /OCR processing failed/)
202
+ end
203
+
204
+ it 'handles missing OCR backend gracefully' do
205
+ config = Kreuzberg::Config::Extraction.new(
206
+ force_ocr: true,
207
+ ocr: Kreuzberg::Config::OCR.new(backend: 'nonexistent-backend')
208
+ )
209
+
210
+ expect do
211
+ Kreuzberg.extract_file_sync(path: test_image, config: config)
212
+ end.to raise_error
213
+ end
214
+ end
215
+
216
+ describe 'OCR backend protocol implementation' do
217
+ it 'requires name method' do
218
+ class InvalidBackendNoName
219
+ def process_image(_image_bytes, _config)
220
+ 'text'
221
+ end
222
+ end
223
+
224
+ backend = InvalidBackendNoName.new
225
+
226
+ expect do
227
+ Kreuzberg.register_ocr_backend('invalid', backend)
228
+ end.to raise_error
229
+ end
230
+
231
+ it 'requires process_image method' do
232
+ class InvalidBackendNoProcess
233
+ def name
234
+ 'invalid'
235
+ end
236
+ end
237
+
238
+ backend = InvalidBackendNoProcess.new
239
+
240
+ expect do
241
+ Kreuzberg.register_ocr_backend('invalid', backend)
242
+ end.to raise_error
243
+ end
244
+ end
245
+
246
+ describe 'OCR backend management' do
247
+ describe '.list_ocr_backends' do
248
+ it 'returns an array of backend names' do
249
+ backends = Kreuzberg.list_ocr_backends
250
+ expect(backends).to be_an(Array)
251
+ end
252
+
253
+ it 'includes registered backends' do
254
+ class ListTestBackend
255
+ include Kreuzberg::OcrBackendProtocol
256
+
257
+ def name
258
+ 'list-test-backend'
259
+ end
260
+
261
+ def process_image(_image_bytes, _config)
262
+ 'test'
263
+ end
264
+ end
265
+
266
+ backend = ListTestBackend.new
267
+ Kreuzberg.register_ocr_backend('list-test-backend', backend)
268
+
269
+ backends = Kreuzberg.list_ocr_backends
270
+ expect(backends).to include('list-test-backend')
271
+
272
+ Kreuzberg.unregister_ocr_backend('list-test-backend')
273
+ end
274
+ end
275
+
276
+ describe '.unregister_ocr_backend' do
277
+ it 'removes backend from registry' do
278
+ class UnregisterTestBackend
279
+ include Kreuzberg::OcrBackendProtocol
280
+
281
+ def name
282
+ 'unregister-test'
283
+ end
284
+
285
+ def process_image(_image_bytes, _config)
286
+ 'test'
287
+ end
288
+ end
289
+
290
+ backend = UnregisterTestBackend.new
291
+ Kreuzberg.register_ocr_backend('unregister-test', backend)
292
+
293
+ backends = Kreuzberg.list_ocr_backends
294
+ expect(backends).to include('unregister-test')
295
+
296
+ Kreuzberg.unregister_ocr_backend('unregister-test')
297
+
298
+ backends = Kreuzberg.list_ocr_backends
299
+ expect(backends).not_to include('unregister-test')
300
+ end
301
+
302
+ it 'accepts nonexistent backend name without error' do
303
+ expect { Kreuzberg.unregister_ocr_backend('nonexistent-backend-xyz') }.not_to raise_error
304
+ end
305
+ end
306
+ end
307
+ end
@@ -0,0 +1,269 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'spec_helper'
4
+
5
+ RSpec.describe 'PostProcessor Plugin System' do
6
+ let(:test_pdf) { test_document_path('text/contract_test.txt') }
7
+
8
+ after do
9
+ Kreuzberg.clear_post_processors
10
+ end
11
+
12
+ describe 'registering post-processor as Proc' do
13
+ it 'registers and executes Proc post-processor during extraction' do
14
+ processor_called = false
15
+ processor = lambda do |result|
16
+ processor_called = true
17
+ result['content'] = result['content'].upcase
18
+ result
19
+ end
20
+
21
+ Kreuzberg.register_post_processor('upcase', processor)
22
+ processors = Kreuzberg.list_post_processors
23
+
24
+ expect(processors).to include('upcase')
25
+ end
26
+
27
+ it 'allows post-processor to modify result content' do
28
+ processor = lambda do |result|
29
+ result['content'] = "[PROCESSED] #{result['content']}"
30
+ result
31
+ end
32
+
33
+ Kreuzberg.register_post_processor('prefix', processor)
34
+ processors = Kreuzberg.list_post_processors
35
+
36
+ expect(processors).to include('prefix')
37
+ end
38
+
39
+ it 'allows post-processor to add metadata' do
40
+ processor = lambda do |result|
41
+ result['metadata']['custom_field'] = 'custom_value'
42
+ result['metadata']['word_count'] = result['content'].split.length
43
+ result
44
+ end
45
+
46
+ Kreuzberg.register_post_processor('metadata_adder', processor)
47
+ processors = Kreuzberg.list_post_processors
48
+
49
+ expect(processors).to include('metadata_adder')
50
+ end
51
+ end
52
+
53
+ describe 'registering post-processor as class' do
54
+ it 'registers and executes class-based post-processor' do
55
+ class WordCountProcessor
56
+ include Kreuzberg::PostProcessorProtocol
57
+
58
+ def call(result)
59
+ word_count = result['content'].split.length
60
+ result['metadata']['word_count'] = word_count
61
+ result['metadata']['processor_name'] = 'WordCountProcessor'
62
+ result
63
+ end
64
+ end
65
+
66
+ processor = WordCountProcessor.new
67
+ Kreuzberg.register_post_processor('word_count', processor)
68
+ processors = Kreuzberg.list_post_processors
69
+
70
+ expect(processors).to include('word_count')
71
+ end
72
+
73
+ it 'allows class-based processor to transform content' do
74
+ class TruncateProcessor
75
+ include Kreuzberg::PostProcessorProtocol
76
+
77
+ def initialize(max_length)
78
+ @max_length = max_length
79
+ end
80
+
81
+ def call(result)
82
+ result['content'] = "#{result['content'][0...@max_length]}..." if result['content'].length > @max_length
83
+ result
84
+ end
85
+ end
86
+
87
+ processor = TruncateProcessor.new(50)
88
+ Kreuzberg.register_post_processor('truncate', processor)
89
+ processors = Kreuzberg.list_post_processors
90
+
91
+ expect(processors).to include('truncate')
92
+ end
93
+ end
94
+
95
+ describe 'multiple post-processors' do
96
+ it 'executes multiple registered post-processors in order' do
97
+ processor1 = lambda do |result|
98
+ result['metadata']['processor1'] = 'executed'
99
+ result
100
+ end
101
+
102
+ processor2 = lambda do |result|
103
+ result['metadata']['processor2'] = 'executed'
104
+ result
105
+ end
106
+
107
+ Kreuzberg.register_post_processor('proc1', processor1)
108
+ Kreuzberg.register_post_processor('proc2', processor2)
109
+ processors = Kreuzberg.list_post_processors
110
+
111
+ expect(processors).to include('proc1')
112
+ expect(processors).to include('proc2')
113
+ end
114
+ end
115
+
116
+ describe 'unregister_post_processor' do
117
+ it 'removes a registered post-processor by name' do
118
+ processor = lambda do |result|
119
+ result['metadata']['should_not_appear'] = 'value'
120
+ result
121
+ end
122
+
123
+ Kreuzberg.register_post_processor('removable', processor)
124
+ Kreuzberg.unregister_post_processor('removable')
125
+ result = Kreuzberg.extract_file_sync(path: test_pdf)
126
+
127
+ expect(result.metadata['should_not_appear']).to be_nil
128
+ end
129
+
130
+ it 'does not affect other registered post-processors' do
131
+ processor1 = lambda do |result|
132
+ result['metadata']['keep1'] = 'value1'
133
+ result
134
+ end
135
+
136
+ processor2 = lambda do |result|
137
+ result['metadata']['remove'] = 'value2'
138
+ result
139
+ end
140
+
141
+ processor3 = lambda do |result|
142
+ result['metadata']['keep3'] = 'value3'
143
+ result
144
+ end
145
+
146
+ Kreuzberg.register_post_processor('keep1', processor1)
147
+ Kreuzberg.register_post_processor('remove', processor2)
148
+ Kreuzberg.register_post_processor('keep3', processor3)
149
+
150
+ processors_before = Kreuzberg.list_post_processors
151
+ expect(processors_before).to include('keep1')
152
+ expect(processors_before).to include('remove')
153
+ expect(processors_before).to include('keep3')
154
+
155
+ Kreuzberg.unregister_post_processor('remove')
156
+ processors_after = Kreuzberg.list_post_processors
157
+
158
+ expect(processors_after).to include('keep1')
159
+ expect(processors_after).not_to include('remove')
160
+ expect(processors_after).to include('keep3')
161
+ end
162
+ end
163
+
164
+ describe 'clear_post_processors' do
165
+ it 'removes all registered post-processors' do
166
+ processor1 = lambda do |result|
167
+ result['metadata']['proc1'] = 'value1'
168
+ result
169
+ end
170
+
171
+ processor2 = lambda do |result|
172
+ result['metadata']['proc2'] = 'value2'
173
+ result
174
+ end
175
+
176
+ Kreuzberg.register_post_processor('proc1', processor1)
177
+ Kreuzberg.register_post_processor('proc2', processor2)
178
+
179
+ Kreuzberg.clear_post_processors
180
+ result = Kreuzberg.extract_file_sync(path: test_pdf)
181
+
182
+ expect(result.metadata['proc1']).to be_nil
183
+ expect(result.metadata['proc2']).to be_nil
184
+ end
185
+ end
186
+
187
+ describe 'error handling' do
188
+ it 'propagates errors from post-processor' do
189
+ processor = lambda do |_result|
190
+ raise StandardError, 'Post-processor error'
191
+ end
192
+
193
+ Kreuzberg.register_post_processor('failing', processor)
194
+ processors = Kreuzberg.list_post_processors
195
+
196
+ expect(processors).to include('failing')
197
+ end
198
+
199
+ it 'handles post-processor that returns invalid result' do
200
+ processor = lambda do |_result|
201
+ 'invalid return value'
202
+ end
203
+
204
+ Kreuzberg.register_post_processor('invalid', processor)
205
+ processors = Kreuzberg.list_post_processors
206
+
207
+ expect(processors).to include('invalid')
208
+ end
209
+ end
210
+
211
+ describe 'list_post_processors' do
212
+ it 'returns empty array when no post-processors registered' do
213
+ Kreuzberg.clear_post_processors
214
+ processors = Kreuzberg.list_post_processors
215
+ expect(processors).to be_an(Array)
216
+ expect(processors).to be_empty
217
+ end
218
+
219
+ it 'returns post-processor names after registration' do
220
+ Kreuzberg.clear_post_processors
221
+ processor = lambda do |result|
222
+ result['content'] = result['content'].upcase
223
+ result
224
+ end
225
+ Kreuzberg.register_post_processor('test-processor', processor)
226
+ processors = Kreuzberg.list_post_processors
227
+ expect(processors).to include('test-processor')
228
+ Kreuzberg.clear_post_processors
229
+ end
230
+
231
+ it 'returns all registered post-processor names' do
232
+ Kreuzberg.clear_post_processors
233
+ processor1 = lambda do |result|
234
+ result
235
+ end
236
+ processor2 = lambda do |result|
237
+ result
238
+ end
239
+ processor3 = lambda do |result|
240
+ result
241
+ end
242
+
243
+ Kreuzberg.register_post_processor('processor-one', processor1)
244
+ Kreuzberg.register_post_processor('processor-two', processor2)
245
+ Kreuzberg.register_post_processor('processor-three', processor3)
246
+
247
+ processors = Kreuzberg.list_post_processors
248
+ expect(processors).to contain_exactly('processor-one', 'processor-two', 'processor-three')
249
+ Kreuzberg.clear_post_processors
250
+ end
251
+
252
+ it 'reflects changes after unregistration' do
253
+ Kreuzberg.clear_post_processors
254
+ processor = lambda do |result|
255
+ result
256
+ end
257
+ Kreuzberg.register_post_processor('temp-processor', processor)
258
+
259
+ processors_before = Kreuzberg.list_post_processors
260
+ expect(processors_before).to include('temp-processor')
261
+
262
+ Kreuzberg.unregister_post_processor('temp-processor')
263
+
264
+ processors_after = Kreuzberg.list_post_processors
265
+ expect(processors_after).not_to include('temp-processor')
266
+ Kreuzberg.clear_post_processors
267
+ end
268
+ end
269
+ end