kreuzberg 4.3.5-aarch64-linux
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +14 -0
- data/.rspec +3 -0
- data/.rubocop.yaml +1 -0
- data/.rubocop.yml +543 -0
- data/Gemfile +8 -0
- data/Gemfile.lock +260 -0
- data/README.md +399 -0
- data/Rakefile +34 -0
- data/Steepfile +51 -0
- data/examples/async_patterns.rb +283 -0
- data/extconf.rb +60 -0
- data/kreuzberg.gemspec +253 -0
- data/lib/kreuzberg/api_proxy.rb +125 -0
- data/lib/kreuzberg/cache_api.rb +67 -0
- data/lib/kreuzberg/cli.rb +57 -0
- data/lib/kreuzberg/cli_proxy.rb +118 -0
- data/lib/kreuzberg/config.rb +1241 -0
- data/lib/kreuzberg/djot_content.rb +225 -0
- data/lib/kreuzberg/document_structure.rb +204 -0
- data/lib/kreuzberg/error_context.rb +136 -0
- data/lib/kreuzberg/errors.rb +116 -0
- data/lib/kreuzberg/extraction_api.rb +329 -0
- data/lib/kreuzberg/mcp_proxy.rb +176 -0
- data/lib/kreuzberg/ocr_backend_protocol.rb +40 -0
- data/lib/kreuzberg/post_processor_protocol.rb +15 -0
- data/lib/kreuzberg/result.rb +712 -0
- data/lib/kreuzberg/setup_lib_path.rb +99 -0
- data/lib/kreuzberg/types.rb +414 -0
- data/lib/kreuzberg/validator_protocol.rb +16 -0
- data/lib/kreuzberg/version.rb +5 -0
- data/lib/kreuzberg.rb +102 -0
- data/lib/kreuzberg_rb.so +0 -0
- data/lib/libpdfium.so +0 -0
- data/sig/kreuzberg/internal.rbs +184 -0
- data/sig/kreuzberg.rbs +1337 -0
- data/spec/binding/async_operations_spec.rb +473 -0
- data/spec/binding/batch_operations_spec.rb +677 -0
- data/spec/binding/batch_spec.rb +360 -0
- data/spec/binding/cache_spec.rb +227 -0
- data/spec/binding/cli_proxy_spec.rb +85 -0
- data/spec/binding/cli_spec.rb +55 -0
- data/spec/binding/config_result_spec.rb +377 -0
- data/spec/binding/config_spec.rb +419 -0
- data/spec/binding/config_validation_spec.rb +377 -0
- data/spec/binding/embeddings_spec.rb +816 -0
- data/spec/binding/error_handling_spec.rb +399 -0
- data/spec/binding/error_recovery_spec.rb +488 -0
- data/spec/binding/errors_spec.rb +66 -0
- data/spec/binding/font_config_spec.rb +220 -0
- data/spec/binding/images_spec.rb +732 -0
- data/spec/binding/keywords_extraction_spec.rb +600 -0
- data/spec/binding/metadata_types_spec.rb +1253 -0
- data/spec/binding/pages_extraction_spec.rb +550 -0
- data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
- data/spec/binding/plugins/postprocessor_spec.rb +269 -0
- data/spec/binding/plugins/validator_spec.rb +273 -0
- data/spec/binding/tables_spec.rb +650 -0
- data/spec/fixtures/config.toml +38 -0
- data/spec/fixtures/config.yaml +41 -0
- data/spec/fixtures/invalid_config.toml +3 -0
- data/spec/serialization_spec.rb +134 -0
- data/spec/smoke/package_spec.rb +177 -0
- data/spec/spec_helper.rb +40 -0
- data/spec/unit/config/chunking_config_spec.rb +213 -0
- data/spec/unit/config/embedding_config_spec.rb +343 -0
- data/spec/unit/config/extraction_config_spec.rb +434 -0
- data/spec/unit/config/font_config_spec.rb +285 -0
- data/spec/unit/config/hierarchy_config_spec.rb +314 -0
- data/spec/unit/config/image_extraction_config_spec.rb +209 -0
- data/spec/unit/config/image_preprocessing_config_spec.rb +230 -0
- data/spec/unit/config/keyword_config_spec.rb +229 -0
- data/spec/unit/config/language_detection_config_spec.rb +258 -0
- data/spec/unit/config/ocr_config_spec.rb +171 -0
- data/spec/unit/config/output_format_spec.rb +380 -0
- data/spec/unit/config/page_config_spec.rb +221 -0
- data/spec/unit/config/pdf_config_spec.rb +267 -0
- data/spec/unit/config/postprocessor_config_spec.rb +290 -0
- data/spec/unit/config/tesseract_config_spec.rb +181 -0
- data/spec/unit/config/token_reduction_config_spec.rb +251 -0
- data/test/metadata_types_test.rb +959 -0
- metadata +292 -0
|
@@ -0,0 +1,273 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'spec_helper'
|
|
4
|
+
|
|
5
|
+
RSpec.describe 'Validator Plugin System' do
|
|
6
|
+
# Path of the fixture document fed to the extraction examples below.
# NOTE(review): despite the `test_pdf` name, the fixture is a plain-text file
# (`text/contract_test.txt`), not a PDF. `test_document_path` is not defined in
# this file — presumably a helper loaded through `spec_helper`; confirm there.
let(:test_pdf) { test_document_path('text/contract_test.txt') }
|
|
7
|
+
|
|
8
|
+
# Empty the validator registry once each example has finished, so validators
# registered by one example are gone before the next one starts.
after { Kreuzberg.clear_validators }
|
|
11
|
+
|
|
12
|
+
# Validators supplied as lambdas: each example registers one callable under a
# name and then runs a synchronous extraction against the shared fixture.
describe 'registering validator as Proc' do
  it 'registers and executes Proc validator during extraction' do
    validator_called = false
    validator = ->(_result) { validator_called = true }

    Kreuzberg.register_validator('check_called', validator)
    Kreuzberg.extract_file_sync(path: test_pdf)

    expect(validator_called).to be true
  end

  it 'allows extraction to proceed when validator passes' do
    # A validator that never raises. The argument is intentionally ignored,
    # hence the underscore prefix (matches the other no-op validators in this file).
    validator = ->(_result) {}

    Kreuzberg.register_validator('pass_validator', validator)
    result = Kreuzberg.extract_file_sync(path: test_pdf)

    expect(result).to be_a(Kreuzberg::Result)
    expect(result.content).not_to be_empty
  end

  it 'prevents extraction when validator raises ValidationError' do
    # NOTE(review): this assumes the fixture's extracted content is shorter than
    # 10_000_000 characters, so the validator always rejects it — confirm if the
    # fixture ever changes.
    validator = lambda do |result|
      if result['content'].length < 10_000_000
        raise Kreuzberg::Errors::ValidationError, 'Content too short for this test'
      end
    end

    Kreuzberg.register_validator('min_length', validator)

    expect do
      Kreuzberg.extract_file_sync(path: test_pdf)
    end.to raise_error(Kreuzberg::Errors::ValidationError, /Content too short/)
  end
end
|
|
50
|
+
|
|
51
|
+
# Validators supplied as objects that include Kreuzberg::ValidatorProtocol and
# respond to #call(result).
#
# The validator classes are anonymous (Class.new) and held in local variables.
# Using the `class` keyword inside an example block would define a top-level
# constant that stays defined for every other spec in the same process.
describe 'registering validator as class' do
  it 'registers and executes class-based validator' do
    minimum_length_validator = Class.new do
      include Kreuzberg::ValidatorProtocol

      # @param min_length [Integer] smallest acceptable content length
      def initialize(min_length)
        @min_length = min_length
      end

      # @param result [Hash] extraction result; reads the 'content' entry
      # @raise [Kreuzberg::Errors::ValidationError] when content is shorter than min_length
      def call(result)
        length = result['content'].length
        return unless length < @min_length

        raise Kreuzberg::Errors::ValidationError, "Content too short: #{length} < #{@min_length}"
      end
    end

    validator = minimum_length_validator.new(10)
    Kreuzberg.register_validator('min_length', validator)
    result = Kreuzberg.extract_file_sync(path: test_pdf)

    expect(result).to be_a(Kreuzberg::Result)
    expect(result.content.length).to be >= 10
  end

  it 'validates based on content characteristics' do
    non_empty_validator = Class.new do
      include Kreuzberg::ValidatorProtocol

      # @param result [Hash] extraction result; reads the 'content' entry
      # @raise [Kreuzberg::Errors::ValidationError] when content is blank after stripping
      def call(result)
        return unless result['content'].strip.empty?

        raise Kreuzberg::Errors::ValidationError, 'Content cannot be empty'
      end
    end

    validator = non_empty_validator.new
    Kreuzberg.register_validator('non_empty', validator)
    result = Kreuzberg.extract_file_sync(path: test_pdf)

    expect(result.content.strip).not_to be_empty
  end
end
|
|
93
|
+
|
|
94
|
+
# Checks what a validator is handed: a Hash carrying the expected keys, whose
# 'content' entry equals the content of the returned result object.
describe 'validator receives correct parameters' do
  it 'receives result hash with all required fields' do
    captured = nil
    Kreuzberg.register_validator('capture', ->(result) { captured = result })
    Kreuzberg.extract_file_sync(path: test_pdf)

    expect(captured).to be_a(Hash)
    %w[content mime_type metadata tables].each do |field|
      expect(captured).to have_key(field)
    end
  end

  it 'receives correct content in result hash' do
    seen_content = nil
    Kreuzberg.register_validator('capture_content', ->(result) { seen_content = result['content'] })
    extraction = Kreuzberg.extract_file_sync(path: test_pdf)

    expect(seen_content).to eq(extraction.content)
  end
end
|
|
123
|
+
|
|
124
|
+
# Behaviour when more than one validator is registered at the same time.
describe 'multiple validators' do
  it 'executes all registered validators' do
    first_ran = false
    second_ran = false

    first = ->(_result) { first_ran = true }
    second = ->(_result) { second_ran = true }

    Kreuzberg.register_validator('val1', first)
    Kreuzberg.register_validator('val2', second)
    Kreuzberg.extract_file_sync(path: test_pdf)

    expect(first_ran).to be true
    expect(second_ran).to be true
  end

  it 'stops execution if any validator fails' do
    failing = ->(_result) { raise Kreuzberg::Errors::ValidationError, 'First validator failed' }
    # Raises a different error class, so the expectation below would fail if
    # this validator's error were the one that surfaced.
    unreachable = ->(_result) { raise StandardError, 'This should not be reached' }

    Kreuzberg.register_validator('fail_first', failing)
    Kreuzberg.register_validator('never_reached', unreachable)

    expect { Kreuzberg.extract_file_sync(path: test_pdf) }
      .to raise_error(Kreuzberg::Errors::ValidationError, /First validator failed/)
  end
end
|
|
162
|
+
|
|
163
|
+
# Removing a single validator by name must disable it and leave the others in place.
describe 'unregister_validator' do
  it 'removes a registered validator by name' do
    rejecting = ->(_result) { raise Kreuzberg::Errors::ValidationError, 'Should not be called' }

    Kreuzberg.register_validator('removable', rejecting)
    Kreuzberg.unregister_validator('removable')

    # If the validator were still registered, its error would surface here.
    expect { Kreuzberg.extract_file_sync(path: test_pdf) }.not_to raise_error
  end

  it 'does not affect other registered validators' do
    first_kept_ran = false
    last_kept_ran = false

    kept_first = ->(_result) { first_kept_ran = true }
    removed = ->(_result) { raise Kreuzberg::Errors::ValidationError, 'Should not be called' }
    kept_last = ->(_result) { last_kept_ran = true }

    Kreuzberg.register_validator('keep1', kept_first)
    Kreuzberg.register_validator('remove', removed)
    Kreuzberg.register_validator('keep3', kept_last)

    Kreuzberg.unregister_validator('remove')
    Kreuzberg.extract_file_sync(path: test_pdf)

    expect(first_kept_ran).to be true
    expect(last_kept_ran).to be true
  end
end
|
|
204
|
+
|
|
205
|
+
# Clearing the registry must disable every validator registered so far.
describe 'clear_validators' do
  it 'removes all registered validators' do
    rejecting_one = ->(_result) { raise Kreuzberg::Errors::ValidationError, 'Should not be called 1' }
    rejecting_two = ->(_result) { raise Kreuzberg::Errors::ValidationError, 'Should not be called 2' }

    Kreuzberg.register_validator('val1', rejecting_one)
    Kreuzberg.register_validator('val2', rejecting_two)

    Kreuzberg.clear_validators

    # Either validator would raise if it were still registered.
    expect { Kreuzberg.extract_file_sync(path: test_pdf) }.not_to raise_error
  end
end
|
|
225
|
+
|
|
226
|
+
# Kreuzberg.list_validators must report exactly the names currently registered.
describe 'list_validators' do
  # Start every example from an empty registry. Teardown is not repeated inside
  # the examples: an inline cleanup call is skipped whenever an expectation
  # fails, and the enclosing group's `after` hook already clears the registry.
  before { Kreuzberg.clear_validators }

  it 'returns empty array when no validators registered' do
    validators = Kreuzberg.list_validators

    expect(validators).to be_an(Array)
    expect(validators).to be_empty
  end

  it 'returns validator names after registration' do
    Kreuzberg.register_validator('test-validator', ->(_result) {})

    expect(Kreuzberg.list_validators).to include('test-validator')
  end

  it 'returns all registered validator names' do
    names = %w[validator-one validator-two validator-three]
    # A fresh no-op lambda per name, so each registration gets its own callable.
    names.each { |name| Kreuzberg.register_validator(name, ->(_result) {}) }

    expect(Kreuzberg.list_validators).to contain_exactly(*names)
  end

  it 'reflects changes after unregistration' do
    Kreuzberg.register_validator('temp-validator', ->(_result) {})
    expect(Kreuzberg.list_validators).to include('temp-validator')

    Kreuzberg.unregister_validator('temp-validator')

    expect(Kreuzberg.list_validators).not_to include('temp-validator')
  end
end
|
|
273
|
+
end
|