kreuzberg 4.3.5-aarch64-linux

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82)
  1. checksums.yaml +7 -0
  2. data/.gitignore +14 -0
  3. data/.rspec +3 -0
  4. data/.rubocop.yaml +1 -0
  5. data/.rubocop.yml +543 -0
  6. data/Gemfile +8 -0
  7. data/Gemfile.lock +260 -0
  8. data/README.md +399 -0
  9. data/Rakefile +34 -0
  10. data/Steepfile +51 -0
  11. data/examples/async_patterns.rb +283 -0
  12. data/extconf.rb +60 -0
  13. data/kreuzberg.gemspec +253 -0
  14. data/lib/kreuzberg/api_proxy.rb +125 -0
  15. data/lib/kreuzberg/cache_api.rb +67 -0
  16. data/lib/kreuzberg/cli.rb +57 -0
  17. data/lib/kreuzberg/cli_proxy.rb +118 -0
  18. data/lib/kreuzberg/config.rb +1241 -0
  19. data/lib/kreuzberg/djot_content.rb +225 -0
  20. data/lib/kreuzberg/document_structure.rb +204 -0
  21. data/lib/kreuzberg/error_context.rb +136 -0
  22. data/lib/kreuzberg/errors.rb +116 -0
  23. data/lib/kreuzberg/extraction_api.rb +329 -0
  24. data/lib/kreuzberg/mcp_proxy.rb +176 -0
  25. data/lib/kreuzberg/ocr_backend_protocol.rb +40 -0
  26. data/lib/kreuzberg/post_processor_protocol.rb +15 -0
  27. data/lib/kreuzberg/result.rb +712 -0
  28. data/lib/kreuzberg/setup_lib_path.rb +99 -0
  29. data/lib/kreuzberg/types.rb +414 -0
  30. data/lib/kreuzberg/validator_protocol.rb +16 -0
  31. data/lib/kreuzberg/version.rb +5 -0
  32. data/lib/kreuzberg.rb +102 -0
  33. data/lib/kreuzberg_rb.so +0 -0
  34. data/lib/libpdfium.so +0 -0
  35. data/sig/kreuzberg/internal.rbs +184 -0
  36. data/sig/kreuzberg.rbs +1337 -0
  37. data/spec/binding/async_operations_spec.rb +473 -0
  38. data/spec/binding/batch_operations_spec.rb +677 -0
  39. data/spec/binding/batch_spec.rb +360 -0
  40. data/spec/binding/cache_spec.rb +227 -0
  41. data/spec/binding/cli_proxy_spec.rb +85 -0
  42. data/spec/binding/cli_spec.rb +55 -0
  43. data/spec/binding/config_result_spec.rb +377 -0
  44. data/spec/binding/config_spec.rb +419 -0
  45. data/spec/binding/config_validation_spec.rb +377 -0
  46. data/spec/binding/embeddings_spec.rb +816 -0
  47. data/spec/binding/error_handling_spec.rb +399 -0
  48. data/spec/binding/error_recovery_spec.rb +488 -0
  49. data/spec/binding/errors_spec.rb +66 -0
  50. data/spec/binding/font_config_spec.rb +220 -0
  51. data/spec/binding/images_spec.rb +732 -0
  52. data/spec/binding/keywords_extraction_spec.rb +600 -0
  53. data/spec/binding/metadata_types_spec.rb +1253 -0
  54. data/spec/binding/pages_extraction_spec.rb +550 -0
  55. data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
  56. data/spec/binding/plugins/postprocessor_spec.rb +269 -0
  57. data/spec/binding/plugins/validator_spec.rb +273 -0
  58. data/spec/binding/tables_spec.rb +650 -0
  59. data/spec/fixtures/config.toml +38 -0
  60. data/spec/fixtures/config.yaml +41 -0
  61. data/spec/fixtures/invalid_config.toml +3 -0
  62. data/spec/serialization_spec.rb +134 -0
  63. data/spec/smoke/package_spec.rb +177 -0
  64. data/spec/spec_helper.rb +40 -0
  65. data/spec/unit/config/chunking_config_spec.rb +213 -0
  66. data/spec/unit/config/embedding_config_spec.rb +343 -0
  67. data/spec/unit/config/extraction_config_spec.rb +434 -0
  68. data/spec/unit/config/font_config_spec.rb +285 -0
  69. data/spec/unit/config/hierarchy_config_spec.rb +314 -0
  70. data/spec/unit/config/image_extraction_config_spec.rb +209 -0
  71. data/spec/unit/config/image_preprocessing_config_spec.rb +230 -0
  72. data/spec/unit/config/keyword_config_spec.rb +229 -0
  73. data/spec/unit/config/language_detection_config_spec.rb +258 -0
  74. data/spec/unit/config/ocr_config_spec.rb +171 -0
  75. data/spec/unit/config/output_format_spec.rb +380 -0
  76. data/spec/unit/config/page_config_spec.rb +221 -0
  77. data/spec/unit/config/pdf_config_spec.rb +267 -0
  78. data/spec/unit/config/postprocessor_config_spec.rb +290 -0
  79. data/spec/unit/config/tesseract_config_spec.rb +181 -0
  80. data/spec/unit/config/token_reduction_config_spec.rb +251 -0
  81. data/test/metadata_types_test.rb +959 -0
  82. metadata +292 -0
@@ -0,0 +1,273 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'spec_helper'
4
+
5
+ RSpec.describe 'Validator Plugin System' do
6
+ let(:test_pdf) { test_document_path('text/contract_test.txt') } # NOTE(review): despite the name, this fixture path points at a .txt file, not a PDF
7
+
8
+ after do # teardown hook run after each example in this describe group
9
+ Kreuzberg.clear_validators # empty the validator registry so registrations do not carry over into the next example
10
+ end
11
+
12
+ describe 'registering validator as Proc' do
13
+ it 'registers and executes Proc validator during extraction' do
14
+ validator_called = false
15
+ validator = lambda do |_result|
16
+ validator_called = true
17
+ end
18
+
19
+ Kreuzberg.register_validator('check_called', validator)
20
+ Kreuzberg.extract_file_sync(path: test_pdf)
21
+
22
+ expect(validator_called).to be true
23
+ end
24
+
25
+ it 'allows extraction to proceed when validator passes' do
26
+ validator = lambda do |result|
27
+ end
28
+
29
+ Kreuzberg.register_validator('pass_validator', validator)
30
+ result = Kreuzberg.extract_file_sync(path: test_pdf)
31
+
32
+ expect(result).to be_a(Kreuzberg::Result)
33
+ expect(result.content).not_to be_empty
34
+ end
35
+
36
+ it 'prevents extraction when validator raises ValidationError' do
37
+ validator = lambda do |result|
38
+ if result['content'].length < 10_000_000
39
+ raise Kreuzberg::Errors::ValidationError, 'Content too short for this test'
40
+ end
41
+ end
42
+
43
+ Kreuzberg.register_validator('min_length', validator)
44
+
45
+ expect do
46
+ Kreuzberg.extract_file_sync(path: test_pdf)
47
+ end.to raise_error(Kreuzberg::Errors::ValidationError, /Content too short/)
48
+ end
49
+ end
50
+
51
+ describe 'registering validator as class' do
52
+ it 'registers and executes class-based validator' do
53
+ class MinimumLengthValidator
54
+ include Kreuzberg::ValidatorProtocol
55
+
56
+ def initialize(min_length)
57
+ @min_length = min_length
58
+ end
59
+
60
+ def call(result)
61
+ return unless result['content'].length < @min_length
62
+
63
+ raise Kreuzberg::Errors::ValidationError, "Content too short: #{result['content'].length} < #{@min_length}"
64
+ end
65
+ end
66
+
67
+ validator = MinimumLengthValidator.new(10)
68
+ Kreuzberg.register_validator('min_length', validator)
69
+ result = Kreuzberg.extract_file_sync(path: test_pdf)
70
+
71
+ expect(result).to be_a(Kreuzberg::Result)
72
+ expect(result.content.length).to be >= 10
73
+ end
74
+
75
+ it 'validates based on content characteristics' do
76
+ class NonEmptyValidator
77
+ include Kreuzberg::ValidatorProtocol
78
+
79
+ def call(result)
80
+ return unless result['content'].strip.empty?
81
+
82
+ raise Kreuzberg::Errors::ValidationError, 'Content cannot be empty'
83
+ end
84
+ end
85
+
86
+ validator = NonEmptyValidator.new
87
+ Kreuzberg.register_validator('non_empty', validator)
88
+ result = Kreuzberg.extract_file_sync(path: test_pdf)
89
+
90
+ expect(result.content.strip).not_to be_empty
91
+ end
92
+ end
93
+
94
+ describe 'validator receives correct parameters' do # inspects the argument handed to a validator
95
+ it 'receives result hash with all required fields' do
96
+ received_result = nil # captured by the lambda so it can be inspected after extraction
97
+ validator = lambda do |result|
98
+ received_result = result
99
+ end
100
+
101
+ Kreuzberg.register_validator('capture', validator)
102
+ Kreuzberg.extract_file_sync(path: test_pdf)
103
+
104
+ expect(received_result).to be_a(Hash) # the validator is expected to see a Hash, not a Kreuzberg::Result
105
+ expect(received_result).to have_key('content') # keys are checked as strings
106
+ expect(received_result).to have_key('mime_type')
107
+ expect(received_result).to have_key('metadata')
108
+ expect(received_result).to have_key('tables')
109
+ end
110
+
111
+ it 'receives correct content in result hash' do
112
+ received_content = nil # captured by the lambda for comparison below
113
+ validator = lambda do |result|
114
+ received_content = result['content']
115
+ end
116
+
117
+ Kreuzberg.register_validator('capture_content', validator)
118
+ result = Kreuzberg.extract_file_sync(path: test_pdf)
119
+
120
+ expect(received_content).to eq(result.content) # hash content seen by the validator must equal the returned result's content
121
+ end
122
+ end
123
+
124
+ describe 'multiple validators' do # behavior when more than one validator is registered
125
+ it 'executes all registered validators' do
126
+ validator1_called = false # each flag is flipped by its own lambda
127
+ validator2_called = false
128
+
129
+ validator1 = lambda do |_result|
130
+ validator1_called = true
131
+ end
132
+
133
+ validator2 = lambda do |_result|
134
+ validator2_called = true
135
+ end
136
+
137
+ Kreuzberg.register_validator('val1', validator1)
138
+ Kreuzberg.register_validator('val2', validator2)
139
+ Kreuzberg.extract_file_sync(path: test_pdf)
140
+
141
+ expect(validator1_called).to be true
142
+ expect(validator2_called).to be true
143
+ end
144
+
145
+ it 'stops execution if any validator fails' do
146
+ validator1 = lambda do |_result|
147
+ raise Kreuzberg::Errors::ValidationError, 'First validator failed'
148
+ end
149
+
150
+ validator2 = lambda do |_result|
151
+ raise StandardError, 'This should not be reached' # different error class so a call to this validator would not match the expectation below
152
+ end
153
+
154
+ Kreuzberg.register_validator('fail_first', validator1) # NOTE(review): presumably validators run in registration order; this example depends on it - confirm
155
+ Kreuzberg.register_validator('never_reached', validator2)
156
+
157
+ expect do
158
+ Kreuzberg.extract_file_sync(path: test_pdf)
159
+ end.to raise_error(Kreuzberg::Errors::ValidationError, /First validator failed/)
160
+ end
161
+ end
162
+
163
+ describe 'unregister_validator' do # removal of a single validator by name
164
+ it 'removes a registered validator by name' do
165
+ validator = lambda do |_result|
166
+ raise Kreuzberg::Errors::ValidationError, 'Should not be called' # would make extraction raise if the validator were still registered
167
+ end
168
+
169
+ Kreuzberg.register_validator('removable', validator)
170
+ Kreuzberg.unregister_validator('removable')
171
+
172
+ expect do
173
+ Kreuzberg.extract_file_sync(path: test_pdf)
174
+ end.not_to raise_error
175
+ end
176
+
177
+ it 'does not affect other registered validators' do
178
+ validator1_called = false # flags for the two validators that stay registered
179
+ validator3_called = false
180
+
181
+ validator1 = lambda do |_result|
182
+ validator1_called = true
183
+ end
184
+
185
+ validator2 = lambda do |_result|
186
+ raise Kreuzberg::Errors::ValidationError, 'Should not be called' # this one is unregistered before extraction
187
+ end
188
+
189
+ validator3 = lambda do |_result|
190
+ validator3_called = true
191
+ end
192
+
193
+ Kreuzberg.register_validator('keep1', validator1)
194
+ Kreuzberg.register_validator('remove', validator2)
195
+ Kreuzberg.register_validator('keep3', validator3)
196
+
197
+ Kreuzberg.unregister_validator('remove') # only the middle registration is removed
198
+ Kreuzberg.extract_file_sync(path: test_pdf)
199
+
200
+ expect(validator1_called).to be true
201
+ expect(validator3_called).to be true
202
+ end
203
+ end
204
+
205
+ describe 'clear_validators' do # removal of every registered validator at once
206
+ it 'removes all registered validators' do
207
+ validator1 = lambda do |_result|
208
+ raise Kreuzberg::Errors::ValidationError, 'Should not be called 1' # either validator would make extraction raise if it survived the clear
209
+ end
210
+
211
+ validator2 = lambda do |_result|
212
+ raise Kreuzberg::Errors::ValidationError, 'Should not be called 2'
213
+ end
214
+
215
+ Kreuzberg.register_validator('val1', validator1)
216
+ Kreuzberg.register_validator('val2', validator2)
217
+
218
+ Kreuzberg.clear_validators
219
+
220
+ expect do
221
+ Kreuzberg.extract_file_sync(path: test_pdf)
222
+ end.not_to raise_error
223
+ end
224
+ end
225
+
226
+ describe 'list_validators' do
227
+ it 'returns empty array when no validators registered' do
228
+ Kreuzberg.clear_validators
229
+ validators = Kreuzberg.list_validators
230
+ expect(validators).to be_an(Array)
231
+ expect(validators).to be_empty
232
+ end
233
+
234
+ it 'returns validator names after registration' do
235
+ Kreuzberg.clear_validators
236
+ validator = ->(result) {}
237
+ Kreuzberg.register_validator('test-validator', validator)
238
+ validators = Kreuzberg.list_validators
239
+ expect(validators).to include('test-validator')
240
+ Kreuzberg.clear_validators
241
+ end
242
+
243
+ it 'returns all registered validator names' do
244
+ Kreuzberg.clear_validators
245
+ validator1 = ->(result) {}
246
+ validator2 = ->(result) {}
247
+ validator3 = ->(result) {}
248
+
249
+ Kreuzberg.register_validator('validator-one', validator1)
250
+ Kreuzberg.register_validator('validator-two', validator2)
251
+ Kreuzberg.register_validator('validator-three', validator3)
252
+
253
+ validators = Kreuzberg.list_validators
254
+ expect(validators).to contain_exactly('validator-one', 'validator-two', 'validator-three')
255
+ Kreuzberg.clear_validators
256
+ end
257
+
258
+ it 'reflects changes after unregistration' do
259
+ Kreuzberg.clear_validators
260
+ validator = ->(result) {}
261
+ Kreuzberg.register_validator('temp-validator', validator)
262
+
263
+ validators_before = Kreuzberg.list_validators
264
+ expect(validators_before).to include('temp-validator')
265
+
266
+ Kreuzberg.unregister_validator('temp-validator')
267
+
268
+ validators_after = Kreuzberg.list_validators
269
+ expect(validators_after).not_to include('temp-validator')
270
+ Kreuzberg.clear_validators
271
+ end
272
+ end
273
+ end