kreuzberg 4.3.5-aarch64-linux
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +14 -0
- data/.rspec +3 -0
- data/.rubocop.yaml +1 -0
- data/.rubocop.yml +543 -0
- data/Gemfile +8 -0
- data/Gemfile.lock +260 -0
- data/README.md +399 -0
- data/Rakefile +34 -0
- data/Steepfile +51 -0
- data/examples/async_patterns.rb +283 -0
- data/extconf.rb +60 -0
- data/kreuzberg.gemspec +253 -0
- data/lib/kreuzberg/api_proxy.rb +125 -0
- data/lib/kreuzberg/cache_api.rb +67 -0
- data/lib/kreuzberg/cli.rb +57 -0
- data/lib/kreuzberg/cli_proxy.rb +118 -0
- data/lib/kreuzberg/config.rb +1241 -0
- data/lib/kreuzberg/djot_content.rb +225 -0
- data/lib/kreuzberg/document_structure.rb +204 -0
- data/lib/kreuzberg/error_context.rb +136 -0
- data/lib/kreuzberg/errors.rb +116 -0
- data/lib/kreuzberg/extraction_api.rb +329 -0
- data/lib/kreuzberg/mcp_proxy.rb +176 -0
- data/lib/kreuzberg/ocr_backend_protocol.rb +40 -0
- data/lib/kreuzberg/post_processor_protocol.rb +15 -0
- data/lib/kreuzberg/result.rb +712 -0
- data/lib/kreuzberg/setup_lib_path.rb +99 -0
- data/lib/kreuzberg/types.rb +414 -0
- data/lib/kreuzberg/validator_protocol.rb +16 -0
- data/lib/kreuzberg/version.rb +5 -0
- data/lib/kreuzberg.rb +102 -0
- data/lib/kreuzberg_rb.so +0 -0
- data/lib/libpdfium.so +0 -0
- data/sig/kreuzberg/internal.rbs +184 -0
- data/sig/kreuzberg.rbs +1337 -0
- data/spec/binding/async_operations_spec.rb +473 -0
- data/spec/binding/batch_operations_spec.rb +677 -0
- data/spec/binding/batch_spec.rb +360 -0
- data/spec/binding/cache_spec.rb +227 -0
- data/spec/binding/cli_proxy_spec.rb +85 -0
- data/spec/binding/cli_spec.rb +55 -0
- data/spec/binding/config_result_spec.rb +377 -0
- data/spec/binding/config_spec.rb +419 -0
- data/spec/binding/config_validation_spec.rb +377 -0
- data/spec/binding/embeddings_spec.rb +816 -0
- data/spec/binding/error_handling_spec.rb +399 -0
- data/spec/binding/error_recovery_spec.rb +488 -0
- data/spec/binding/errors_spec.rb +66 -0
- data/spec/binding/font_config_spec.rb +220 -0
- data/spec/binding/images_spec.rb +732 -0
- data/spec/binding/keywords_extraction_spec.rb +600 -0
- data/spec/binding/metadata_types_spec.rb +1253 -0
- data/spec/binding/pages_extraction_spec.rb +550 -0
- data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
- data/spec/binding/plugins/postprocessor_spec.rb +269 -0
- data/spec/binding/plugins/validator_spec.rb +273 -0
- data/spec/binding/tables_spec.rb +650 -0
- data/spec/fixtures/config.toml +38 -0
- data/spec/fixtures/config.yaml +41 -0
- data/spec/fixtures/invalid_config.toml +3 -0
- data/spec/serialization_spec.rb +134 -0
- data/spec/smoke/package_spec.rb +177 -0
- data/spec/spec_helper.rb +40 -0
- data/spec/unit/config/chunking_config_spec.rb +213 -0
- data/spec/unit/config/embedding_config_spec.rb +343 -0
- data/spec/unit/config/extraction_config_spec.rb +434 -0
- data/spec/unit/config/font_config_spec.rb +285 -0
- data/spec/unit/config/hierarchy_config_spec.rb +314 -0
- data/spec/unit/config/image_extraction_config_spec.rb +209 -0
- data/spec/unit/config/image_preprocessing_config_spec.rb +230 -0
- data/spec/unit/config/keyword_config_spec.rb +229 -0
- data/spec/unit/config/language_detection_config_spec.rb +258 -0
- data/spec/unit/config/ocr_config_spec.rb +171 -0
- data/spec/unit/config/output_format_spec.rb +380 -0
- data/spec/unit/config/page_config_spec.rb +221 -0
- data/spec/unit/config/pdf_config_spec.rb +267 -0
- data/spec/unit/config/postprocessor_config_spec.rb +290 -0
- data/spec/unit/config/tesseract_config_spec.rb +181 -0
- data/spec/unit/config/token_reduction_config_spec.rb +251 -0
- data/test/metadata_types_test.rb +959 -0
- metadata +292 -0
|
@@ -0,0 +1,307 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'spec_helper'
|
|
4
|
+
|
|
5
|
+
# Specs for registering, invoking, listing and unregistering custom OCR backends
# through the Kreuzberg module-level plugin API.
#
# Backend classes are built with Class.new instead of `class Foo` inside the examples:
# a `class` statement in an `it` block defines a global constant that outlives the
# example and can silently reopen a same-named class from another spec file.
RSpec.describe 'OCR Backend Plugin System' do
  let(:test_image) { test_document_path('images/invoice_image.png') }

  # Names handed to #register_backend during the current example.
  let(:registered_backend_names) { [] }

  # Remove whatever the example registered so backends do not leak into later examples.
  # Relies on unregister_ocr_backend tolerating unknown names, which the
  # '.unregister_ocr_backend' group below asserts.
  after do
    registered_backend_names.each { |backend_name| Kreuzberg.unregister_ocr_backend(backend_name) }
  end

  # Registers +backend+ under +backend_name+ and remembers the name for cleanup.
  # The name is recorded before registering so cleanup also runs when registration raises.
  #
  # @param backend_name [String] registry key
  # @param backend [Object] object handed to Kreuzberg.register_ocr_backend
  # @return [Object] the backend that was passed in
  def register_backend(backend_name, backend)
    registered_backend_names << backend_name
    Kreuzberg.register_ocr_backend(backend_name, backend)
    backend
  end

  # Builds an extraction config that forces OCR through the named backend.
  #
  # @param backend_name [String] value for the OCR config's +backend+ option
  # @param ocr_options [Hash] extra keyword options forwarded to Kreuzberg::Config::OCR.new
  # @return [Kreuzberg::Config::Extraction]
  def ocr_config_for(backend_name, **ocr_options)
    Kreuzberg::Config::Extraction.new(
      force_ocr: true,
      ocr: Kreuzberg::Config::OCR.new(backend: backend_name, **ocr_options)
    )
  end

  describe 'registering custom OCR backend' do
    it 'registers and uses custom OCR backend class' do
      backend_class = Class.new do
        include Kreuzberg::OcrBackendProtocol

        attr_reader :process_called, :received_config

        def initialize
          @process_called = false
          @received_config = nil
        end

        def name
          'mock-ocr'
        end

        def process_image(_image_bytes, config)
          @process_called = true
          @received_config = config
          'Mocked OCR text from custom backend'
        end
      end

      backend = register_backend('mock-ocr', backend_class.new)

      result = Kreuzberg.extract_file_sync(path: test_image, config: ocr_config_for('mock-ocr'))

      expect(backend.process_called).to be true
      expect(result.content).to include('Mocked OCR text')
    end

    it 'passes correct configuration to OCR backend' do
      backend_class = Class.new do
        include Kreuzberg::OcrBackendProtocol

        attr_reader :received_config

        def name
          'config-capture'
        end

        def process_image(_image_bytes, config)
          @received_config = config
          'OCR result'
        end
      end

      backend = register_backend('config-capture', backend_class.new)

      Kreuzberg.extract_file_sync(path: test_image, config: ocr_config_for('config-capture', language: 'eng'))

      expect(backend.received_config).to be_a(Hash)
      expect(backend.received_config['backend']).to eq('config-capture')
      expect(backend.received_config['language']).to eq('eng')
    end
  end

  describe 'OCR backend receives correct parameters' do
    it 'receives image bytes as binary data' do
      backend_class = Class.new do
        include Kreuzberg::OcrBackendProtocol

        # Captured on the instance (not on the class) so nothing survives the example.
        attr_reader :received_bytes

        def name
          'bytes-capture'
        end

        def process_image(image_bytes, _config)
          @received_bytes = image_bytes
          'OCR result'
        end
      end

      backend = register_backend('bytes-capture', backend_class.new)

      Kreuzberg.extract_file_sync(path: test_image, config: ocr_config_for('bytes-capture'))

      expect(backend.received_bytes).to be_a(String)
      expect(backend.received_bytes.encoding).to eq(Encoding::BINARY)
      expect(backend.received_bytes.length).to be_positive
    end

    it 'backend can return extracted text' do
      backend_class = Class.new do
        include Kreuzberg::OcrBackendProtocol

        def name
          'simple-ocr'
        end

        def process_image(_image_bytes, _config)
          'Invoice Total: $1,234.56'
        end
      end

      register_backend('simple-ocr', backend_class.new)

      result = Kreuzberg.extract_file_sync(path: test_image, config: ocr_config_for('simple-ocr'))

      expect(result.content).to include('Invoice Total')
      expect(result.content).to include('1,234.56')
    end
  end

  describe 'OCR backend with stateful processing' do
    it 'maintains state across multiple invocations' do
      backend_class = Class.new do
        include Kreuzberg::OcrBackendProtocol

        attr_reader :call_count

        def initialize
          @call_count = 0
        end

        def name
          'stateful-ocr'
        end

        def process_image(_image_bytes, _config)
          @call_count += 1
          "OCR call number #{@call_count}"
        end
      end

      backend = register_backend('stateful-ocr', backend_class.new)
      config = ocr_config_for('stateful-ocr')

      2.times { Kreuzberg.extract_file_sync(path: test_image, config: config) }

      # NOTE(review): only ">= 1" is asserted even though extraction runs twice;
      # presumably the second run may be served without calling the backend - confirm.
      expect(backend.call_count).to be >= 1
    end
  end

  describe 'error handling' do
    it 'propagates errors from OCR backend' do
      backend_class = Class.new do
        include Kreuzberg::OcrBackendProtocol

        def name
          'failing-ocr'
        end

        def process_image(_image_bytes, _config)
          raise StandardError, 'OCR processing failed'
        end
      end

      register_backend('failing-ocr', backend_class.new)
      config = ocr_config_for('failing-ocr')

      expect do
        Kreuzberg.extract_file_sync(path: test_image, config: config)
      end.to raise_error(StandardError, /OCR processing failed/)
    end

    it 'handles missing OCR backend gracefully' do
      config = ocr_config_for('nonexistent-backend')

      # An explicit class avoids RSpec's bare-`raise_error` false-positive warning.
      expect do
        Kreuzberg.extract_file_sync(path: test_image, config: config)
      end.to raise_error(StandardError)
    end
  end

  describe 'OCR backend protocol implementation' do
    it 'requires name method' do
      backend_class = Class.new do
        def process_image(_image_bytes, _config)
          'text'
        end
      end

      expect { register_backend('invalid', backend_class.new) }.to raise_error(StandardError)
    end

    it 'requires process_image method' do
      backend_class = Class.new do
        def name
          'invalid'
        end
      end

      expect { register_backend('invalid', backend_class.new) }.to raise_error(StandardError)
    end
  end

  describe 'OCR backend management' do
    # Builds a minimal backend whose #name returns +backend_name+ and whose
    # #process_image always returns 'test'.
    #
    # @param backend_name [String]
    # @return [Object] a fresh backend instance
    def minimal_backend(backend_name)
      Class.new do
        include Kreuzberg::OcrBackendProtocol

        define_method(:name) { backend_name }

        def process_image(_image_bytes, _config)
          'test'
        end
      end.new
    end

    describe '.list_ocr_backends' do
      it 'returns an array of backend names' do
        expect(Kreuzberg.list_ocr_backends).to be_an(Array)
      end

      it 'includes registered backends' do
        register_backend('list-test-backend', minimal_backend('list-test-backend'))

        # Unregistration is left to the `after` hook so it also happens when this fails.
        expect(Kreuzberg.list_ocr_backends).to include('list-test-backend')
      end
    end

    describe '.unregister_ocr_backend' do
      it 'removes backend from registry' do
        register_backend('unregister-test', minimal_backend('unregister-test'))
        expect(Kreuzberg.list_ocr_backends).to include('unregister-test')

        Kreuzberg.unregister_ocr_backend('unregister-test')

        expect(Kreuzberg.list_ocr_backends).not_to include('unregister-test')
      end

      it 'accepts nonexistent backend name without error' do
        expect { Kreuzberg.unregister_ocr_backend('nonexistent-backend-xyz') }.not_to raise_error
      end
    end
  end
end
|
|
@@ -0,0 +1,269 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'spec_helper'
|
|
4
|
+
|
|
5
|
+
# Specs for registering, listing, unregistering and clearing post-processors through the
# Kreuzberg module-level plugin API.
#
# Processor classes are built with Class.new instead of `class Foo` inside the examples:
# a `class` statement in an `it` block defines a global constant that outlives the
# example and can silently reopen a same-named class from another spec file.
RSpec.describe 'PostProcessor Plugin System' do
  # Plain-text fixture used by the examples that run an extraction.
  let(:test_document) { test_document_path('text/contract_test.txt') }

  # Every example leaves the registry empty, whether it passed or failed.
  after do
    Kreuzberg.clear_post_processors
  end

  # Builds a processor that writes +value+ under +key+ in the result's metadata.
  #
  # @param key [String] metadata key to set
  # @param value [String] value stored under +key+
  # @return [Proc] one-argument lambda returning the result it was given
  def metadata_stamper(key, value)
    lambda do |result|
      result['metadata'][key] = value
      result
    end
  end

  # Builds a processor that hands the result back untouched.
  #
  # @return [Proc] one-argument lambda
  def passthrough_processor
    ->(result) { result }
  end

  describe 'registering post-processor as Proc' do
    # NOTE(review): the examples in this group assert registration only; none of them
    # runs an extraction, so the processor bodies are never executed here.
    it 'registers and executes Proc post-processor during extraction' do
      processor = lambda do |result|
        result['content'] = result['content'].upcase
        result
      end

      Kreuzberg.register_post_processor('upcase', processor)

      expect(Kreuzberg.list_post_processors).to include('upcase')
    end

    it 'allows post-processor to modify result content' do
      processor = lambda do |result|
        result['content'] = "[PROCESSED] #{result['content']}"
        result
      end

      Kreuzberg.register_post_processor('prefix', processor)

      expect(Kreuzberg.list_post_processors).to include('prefix')
    end

    it 'allows post-processor to add metadata' do
      processor = lambda do |result|
        result['metadata']['custom_field'] = 'custom_value'
        result['metadata']['word_count'] = result['content'].split.length
        result
      end

      Kreuzberg.register_post_processor('metadata_adder', processor)

      expect(Kreuzberg.list_post_processors).to include('metadata_adder')
    end
  end

  describe 'registering post-processor as class' do
    # NOTE(review): as above, only registration is asserted in this group.
    it 'registers and executes class-based post-processor' do
      processor_class = Class.new do
        include Kreuzberg::PostProcessorProtocol

        def call(result)
          result['metadata']['word_count'] = result['content'].split.length
          result['metadata']['processor_name'] = 'WordCountProcessor'
          result
        end
      end

      Kreuzberg.register_post_processor('word_count', processor_class.new)

      expect(Kreuzberg.list_post_processors).to include('word_count')
    end

    it 'allows class-based processor to transform content' do
      processor_class = Class.new do
        include Kreuzberg::PostProcessorProtocol

        def initialize(max_length)
          @max_length = max_length
        end

        def call(result)
          result['content'] = "#{result['content'][0...@max_length]}..." if result['content'].length > @max_length
          result
        end
      end

      Kreuzberg.register_post_processor('truncate', processor_class.new(50))

      expect(Kreuzberg.list_post_processors).to include('truncate')
    end
  end

  describe 'multiple post-processors' do
    # NOTE(review): execution order is not observed here; only presence in the list is.
    it 'executes multiple registered post-processors in order' do
      Kreuzberg.register_post_processor('proc1', metadata_stamper('processor1', 'executed'))
      Kreuzberg.register_post_processor('proc2', metadata_stamper('processor2', 'executed'))

      expect(Kreuzberg.list_post_processors).to include('proc1', 'proc2')
    end
  end

  describe 'unregister_post_processor' do
    it 'removes a registered post-processor by name' do
      Kreuzberg.register_post_processor('removable', metadata_stamper('should_not_appear', 'value'))
      Kreuzberg.unregister_post_processor('removable')

      result = Kreuzberg.extract_file_sync(path: test_document)

      expect(result.metadata['should_not_appear']).to be_nil
    end

    it 'does not affect other registered post-processors' do
      Kreuzberg.register_post_processor('keep1', metadata_stamper('keep1', 'value1'))
      Kreuzberg.register_post_processor('remove', metadata_stamper('remove', 'value2'))
      Kreuzberg.register_post_processor('keep3', metadata_stamper('keep3', 'value3'))

      expect(Kreuzberg.list_post_processors).to include('keep1', 'remove', 'keep3')

      Kreuzberg.unregister_post_processor('remove')
      remaining = Kreuzberg.list_post_processors

      expect(remaining).to include('keep1', 'keep3')
      expect(remaining).not_to include('remove')
    end
  end

  describe 'clear_post_processors' do
    it 'removes all registered post-processors' do
      Kreuzberg.register_post_processor('proc1', metadata_stamper('proc1', 'value1'))
      Kreuzberg.register_post_processor('proc2', metadata_stamper('proc2', 'value2'))

      Kreuzberg.clear_post_processors
      result = Kreuzberg.extract_file_sync(path: test_document)

      expect(result.metadata['proc1']).to be_nil
      expect(result.metadata['proc2']).to be_nil
    end
  end

  describe 'error handling' do
    # NOTE(review): neither example runs an extraction, so error propagation and
    # invalid-return handling are not actually exercised; only registration is checked.
    it 'propagates errors from post-processor' do
      processor = lambda do |_result|
        raise StandardError, 'Post-processor error'
      end

      Kreuzberg.register_post_processor('failing', processor)

      expect(Kreuzberg.list_post_processors).to include('failing')
    end

    it 'handles post-processor that returns invalid result' do
      processor = lambda do |_result|
        'invalid return value'
      end

      Kreuzberg.register_post_processor('invalid', processor)

      expect(Kreuzberg.list_post_processors).to include('invalid')
    end
  end

  describe 'list_post_processors' do
    # These examples make exact-content assertions, so start each one from an empty
    # registry. Cleanup afterwards is handled by the top-level `after` hook.
    before do
      Kreuzberg.clear_post_processors
    end

    it 'returns empty array when no post-processors registered' do
      processors = Kreuzberg.list_post_processors

      expect(processors).to be_an(Array)
      expect(processors).to be_empty
    end

    it 'returns post-processor names after registration' do
      processor = lambda do |result|
        result['content'] = result['content'].upcase
        result
      end

      Kreuzberg.register_post_processor('test-processor', processor)

      expect(Kreuzberg.list_post_processors).to include('test-processor')
    end

    it 'returns all registered post-processor names' do
      %w[processor-one processor-two processor-three].each do |processor_name|
        Kreuzberg.register_post_processor(processor_name, passthrough_processor)
      end

      expect(Kreuzberg.list_post_processors)
        .to contain_exactly('processor-one', 'processor-two', 'processor-three')
    end

    it 'reflects changes after unregistration' do
      Kreuzberg.register_post_processor('temp-processor', passthrough_processor)
      expect(Kreuzberg.list_post_processors).to include('temp-processor')

      Kreuzberg.unregister_post_processor('temp-processor')

      expect(Kreuzberg.list_post_processors).not_to include('temp-processor')
    end
  end
end
|