kreuzberg 4.3.5-aarch64-linux
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +14 -0
- data/.rspec +3 -0
- data/.rubocop.yaml +1 -0
- data/.rubocop.yml +543 -0
- data/Gemfile +8 -0
- data/Gemfile.lock +260 -0
- data/README.md +399 -0
- data/Rakefile +34 -0
- data/Steepfile +51 -0
- data/examples/async_patterns.rb +283 -0
- data/extconf.rb +60 -0
- data/kreuzberg.gemspec +253 -0
- data/lib/kreuzberg/api_proxy.rb +125 -0
- data/lib/kreuzberg/cache_api.rb +67 -0
- data/lib/kreuzberg/cli.rb +57 -0
- data/lib/kreuzberg/cli_proxy.rb +118 -0
- data/lib/kreuzberg/config.rb +1241 -0
- data/lib/kreuzberg/djot_content.rb +225 -0
- data/lib/kreuzberg/document_structure.rb +204 -0
- data/lib/kreuzberg/error_context.rb +136 -0
- data/lib/kreuzberg/errors.rb +116 -0
- data/lib/kreuzberg/extraction_api.rb +329 -0
- data/lib/kreuzberg/mcp_proxy.rb +176 -0
- data/lib/kreuzberg/ocr_backend_protocol.rb +40 -0
- data/lib/kreuzberg/post_processor_protocol.rb +15 -0
- data/lib/kreuzberg/result.rb +712 -0
- data/lib/kreuzberg/setup_lib_path.rb +99 -0
- data/lib/kreuzberg/types.rb +414 -0
- data/lib/kreuzberg/validator_protocol.rb +16 -0
- data/lib/kreuzberg/version.rb +5 -0
- data/lib/kreuzberg.rb +102 -0
- data/lib/kreuzberg_rb.so +0 -0
- data/lib/libpdfium.so +0 -0
- data/sig/kreuzberg/internal.rbs +184 -0
- data/sig/kreuzberg.rbs +1337 -0
- data/spec/binding/async_operations_spec.rb +473 -0
- data/spec/binding/batch_operations_spec.rb +677 -0
- data/spec/binding/batch_spec.rb +360 -0
- data/spec/binding/cache_spec.rb +227 -0
- data/spec/binding/cli_proxy_spec.rb +85 -0
- data/spec/binding/cli_spec.rb +55 -0
- data/spec/binding/config_result_spec.rb +377 -0
- data/spec/binding/config_spec.rb +419 -0
- data/spec/binding/config_validation_spec.rb +377 -0
- data/spec/binding/embeddings_spec.rb +816 -0
- data/spec/binding/error_handling_spec.rb +399 -0
- data/spec/binding/error_recovery_spec.rb +488 -0
- data/spec/binding/errors_spec.rb +66 -0
- data/spec/binding/font_config_spec.rb +220 -0
- data/spec/binding/images_spec.rb +732 -0
- data/spec/binding/keywords_extraction_spec.rb +600 -0
- data/spec/binding/metadata_types_spec.rb +1253 -0
- data/spec/binding/pages_extraction_spec.rb +550 -0
- data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
- data/spec/binding/plugins/postprocessor_spec.rb +269 -0
- data/spec/binding/plugins/validator_spec.rb +273 -0
- data/spec/binding/tables_spec.rb +650 -0
- data/spec/fixtures/config.toml +38 -0
- data/spec/fixtures/config.yaml +41 -0
- data/spec/fixtures/invalid_config.toml +3 -0
- data/spec/serialization_spec.rb +134 -0
- data/spec/smoke/package_spec.rb +177 -0
- data/spec/spec_helper.rb +40 -0
- data/spec/unit/config/chunking_config_spec.rb +213 -0
- data/spec/unit/config/embedding_config_spec.rb +343 -0
- data/spec/unit/config/extraction_config_spec.rb +434 -0
- data/spec/unit/config/font_config_spec.rb +285 -0
- data/spec/unit/config/hierarchy_config_spec.rb +314 -0
- data/spec/unit/config/image_extraction_config_spec.rb +209 -0
- data/spec/unit/config/image_preprocessing_config_spec.rb +230 -0
- data/spec/unit/config/keyword_config_spec.rb +229 -0
- data/spec/unit/config/language_detection_config_spec.rb +258 -0
- data/spec/unit/config/ocr_config_spec.rb +171 -0
- data/spec/unit/config/output_format_spec.rb +380 -0
- data/spec/unit/config/page_config_spec.rb +221 -0
- data/spec/unit/config/pdf_config_spec.rb +267 -0
- data/spec/unit/config/postprocessor_config_spec.rb +290 -0
- data/spec/unit/config/tesseract_config_spec.rb +181 -0
- data/spec/unit/config/token_reduction_config_spec.rb +251 -0
- data/test/metadata_types_test.rb +959 -0
- metadata +292 -0
|
@@ -0,0 +1,550 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
RSpec.describe 'Pages Extraction' do
|
|
4
|
+
# Exercises the per-page results array that is toggled by
# PageConfig#extract_pages when extracting the sample contract PDF.
describe 'Extract Pages' do
  # Fixture path; examples are skipped (not failed) when the PDF is absent.
  let(:pdf_file) { test_document_path('pdf/sample_contract.pdf') }

  # Extraction result with page extraction enabled; built lazily per example.
  let(:result) do
    config = Kreuzberg::Config::Extraction.new(
      pages: Kreuzberg::Config::PageConfig.new(extract_pages: true)
    )
    Kreuzberg.extract_file_sync(path: pdf_file, config: config)
  end

  before { skip "Test PDF not available at #{pdf_file}" unless File.exist?(pdf_file) }

  it 'returns pages array when extractPages is true' do
    expect(result).not_to be_nil
    expect(result.pages).not_to be_nil
    expect(result.pages).to be_a(Array)
  end

  it 'returns page numbers for each page' do
    expect(result.pages).not_to be_nil
    result.pages.each do |page|
      expect(page.page_number).to be > 0
    end
  end

  it 'returns page content for each page' do
    expect(result.pages).not_to be_nil
    result.pages.each do |page|
      expect(page.content).not_to be_nil
    end
  end

  it 'returns nil for pages when extractPages is false' do
    # This example needs its own config, so it does not use the shared `result`.
    config = Kreuzberg::Config::Extraction.new(
      pages: Kreuzberg::Config::PageConfig.new(extract_pages: false)
    )

    disabled_result = Kreuzberg.extract_file_sync(path: pdf_file, config: config)

    expect(disabled_result).not_to be_nil
    expect(disabled_result.pages).to be_nil
  end

  it 'preserves page order' do
    # Assert presence first: the previous `if result.pages && ...` guard let
    # this example pass without checking anything when pages was nil.
    expect(result.pages).not_to be_nil

    # each_cons(2) yields nothing for a single-page result, so a one-page
    # document still passes; otherwise every adjacent pair must be increasing.
    result.pages.each_cons(2) do |earlier, later|
      expect(earlier.page_number).to be < later.page_number
    end
  end
end
|
|
83
|
+
|
|
84
|
+
# Exercises PageConfig#insert_page_markers: whether page markers are written
# into the extracted content of the sample contract PDF.
describe 'Insert Page Markers' do
  # Fixture path; examples are skipped (not failed) when the PDF is absent.
  let(:pdf_file) { test_document_path('pdf/sample_contract.pdf') }

  before { skip "Test PDF not available at #{pdf_file}" unless File.exist?(pdf_file) }

  # Runs a synchronous extraction of the fixture with the given marker switch.
  #
  # @param insert_page_markers [Boolean] value forwarded to PageConfig
  # @return the object returned by Kreuzberg.extract_file_sync
  def extract_with_markers(insert_page_markers:)
    config = Kreuzberg::Config::Extraction.new(
      pages: Kreuzberg::Config::PageConfig.new(insert_page_markers: insert_page_markers)
    )
    Kreuzberg.extract_file_sync(path: pdf_file, config: config)
  end

  it 'inserts page markers when insertPageMarkers is true' do
    result = extract_with_markers(insert_page_markers: true)

    expect(result).not_to be_nil
    expect(result.content).not_to be_nil
    expect(result.content).to include('<!-- PAGE')
  end

  it 'does not insert markers when insertPageMarkers is false' do
    result = extract_with_markers(insert_page_markers: false)

    expect(result).not_to be_nil
    # Default marker format should not appear when not enabled
    expect(result.content).not_to include('<!-- PAGE')
  end

  it 'contains page numbers in markers' do
    result = extract_with_markers(insert_page_markers: true)

    expect(result.content).not_to be_nil
    # Should contain a marker for page 1. The previous check, include('1'),
    # passed for any document text containing the digit 1, even with no marker.
    # NOTE(review): assumes the default marker renders the page number after
    # '<!-- PAGE' (e.g. '<!-- PAGE 1 -->') -- confirm against the default format.
    expect(result.content).to match(/<!-- PAGE\D*1\b/)
  end

  it 'inserts multiple markers for multi-page documents' do
    result = extract_with_markers(insert_page_markers: true)

    expect(result.content).not_to be_nil
    marker_count = result.content.scan('<!-- PAGE').length
    # NOTE(review): only "at least one" is asserted; tighten to `be > 1` once
    # the fixture is confirmed to have more than one page.
    expect(marker_count).to be > 0
  end
end
|
|
145
|
+
|
|
146
|
+
# Checks that a caller-supplied PageConfig#marker_format shows up in the
# extracted content of the sample contract PDF.
describe 'Custom Marker Format' do
  # Fixture path; examples are skipped (not failed) when the PDF is absent.
  let(:pdf_file) { test_document_path('pdf/sample_contract.pdf') }

  before { skip "Test PDF not available at #{pdf_file}" unless File.exist?(pdf_file) }

  # Extracts the fixture with markers enabled and the given marker template.
  #
  # @param marker_format [String] template passed to PageConfig as marker_format
  # @return the object returned by Kreuzberg.extract_file_sync
  def extract_using(marker_format)
    page_settings = Kreuzberg::Config::PageConfig.new(
      insert_page_markers: true,
      marker_format: marker_format
    )
    extraction_settings = Kreuzberg::Config::Extraction.new(pages: page_settings)
    Kreuzberg.extract_file_sync(path: pdf_file, config: extraction_settings)
  end

  it 'uses custom marker format when specified' do
    extracted = extract_using('=== PAGE {page_num} ===')

    expect(extracted).not_to be_nil
    expect(extracted.content).not_to be_nil
    expect(extracted.content).to include('=== PAGE')
  end

  it 'replaces page_num placeholder in custom format' do
    text = extract_using('[Page Number: {page_num}]').content

    expect(text).not_to be_nil
    expect(text).to include('[Page Number:')
    # The raw placeholder must not survive into the output.
    expect(text).not_to include('{page_num}')
  end

  it 'handles simple custom format' do
    text = extract_using('PAGE_{page_num}').content

    expect(text).not_to be_nil
    expect(text).to include('PAGE_')
  end

  it 'handles custom format with line separators' do
    text = extract_using("\n---PAGE {page_num}---\n").content

    expect(text).not_to be_nil
    expect(text).to include('---PAGE')
  end

  it 'overrides default marker format' do
    text = extract_using('CUSTOM_PAGE_{page_num}').content

    expect(text).not_to be_nil
    expect(text).to include('CUSTOM_PAGE_')
  end
end
|
|
239
|
+
|
|
240
|
+
# Checks page counts, numbering and content of the pages array, plus marker
# presence, for the sample contract PDF.
describe 'Multi-Page PDF' do
  # Fixture path; examples are skipped (not failed) when the PDF is absent.
  let(:pdf_file) { test_document_path('pdf/sample_contract.pdf') }

  # Pages array produced with extract_pages enabled; built lazily per example.
  let(:extracted_pages) do
    settings = Kreuzberg::Config::Extraction.new(
      pages: Kreuzberg::Config::PageConfig.new(extract_pages: true)
    )
    Kreuzberg.extract_file_sync(path: pdf_file, config: settings).pages
  end

  before { skip "Test PDF not available at #{pdf_file}" unless File.exist?(pdf_file) }

  it 'produces multiple pages from multi-page PDF' do
    expect(extracted_pages).not_to be_nil
    expect(extracted_pages.length).to be > 0
  end

  it 'page numbers are sequential' do
    expect(extracted_pages).not_to be_nil

    # Page numbers are expected to be 1-based and to follow array order.
    extracted_pages.each.with_index(1) do |page, expected_number|
      expect(page.page_number).to eq(expected_number)
    end
  end

  it 'each page has content' do
    expect(extracted_pages).not_to be_nil

    extracted_pages.each do |page|
      body = page.content
      expect(body).not_to be_nil
      expect(body.strip).not_to be_empty
    end
  end

  it 'with markers contains all pages' do
    settings = Kreuzberg::Config::Extraction.new(
      pages: Kreuzberg::Config::PageConfig.new(insert_page_markers: true)
    )

    text = Kreuzberg.extract_file_sync(path: pdf_file, config: settings).content

    expect(text).not_to be_nil
    expect(text.scan('<!-- PAGE').size).to be >= 1
  end
end
|
|
303
|
+
|
|
304
|
+
# Checks the shape of each element in the pages array returned for the
# sample contract PDF when extract_pages is enabled.
describe 'Page Content Structure Validation' do
  # Fixture path; examples are skipped (not failed) when the PDF is absent.
  let(:pdf_file) { test_document_path('pdf/sample_contract.pdf') }

  # Pages array produced with extract_pages enabled; built lazily per example.
  let(:extracted_pages) do
    settings = Kreuzberg::Config::Extraction.new(
      pages: Kreuzberg::Config::PageConfig.new(extract_pages: true)
    )
    Kreuzberg.extract_file_sync(path: pdf_file, config: settings).pages
  end

  before { skip "Test PDF not available at #{pdf_file}" unless File.exist?(pdf_file) }

  it 'validates page structure' do
    expect(extracted_pages).not_to be_nil

    extracted_pages.each do |page|
      expect(page.content).not_to be_nil
      expect(page.page_number).to be > 0
    end
  end

  it 'page content has required fields' do
    expect(extracted_pages).not_to be_nil

    extracted_pages.each do |page|
      expect(page.page_number).to be > 0
      expect(page.content).not_to be_nil

      # is_blank should be nil or a boolean
      next unless page.respond_to?(:is_blank)

      expect(page.is_blank).to be_nil.or be(true).or be(false)
    end
  end

  it 'page content with tables preserves table data' do
    expect(extracted_pages).not_to be_nil

    extracted_pages.each do |page|
      # Tables in page content are optional
      next unless page.respond_to?(:tables) && page.tables

      expect(page.tables).to be_an(Array)
    end
  end

  it 'page content with images preserves image data' do
    expect(extracted_pages).not_to be_nil

    extracted_pages.each do |page|
      # Images in page content are optional
      next unless page.respond_to?(:images) && page.images

      expect(page.images).to be_an(Array)
    end
  end

  it 'page content is not empty' do
    expect(extracted_pages).not_to be_nil

    # At least one page must carry non-whitespace text.
    first_non_blank = extracted_pages.find do |candidate|
      candidate.content && !candidate.content.strip.empty?
    end
    expect(first_non_blank).not_to be_nil
  end
end
|
|
390
|
+
|
|
391
|
+
# Checks that extract_pages and insert_page_markers work when enabled in the
# same PageConfig for the sample contract PDF.
describe 'Combined Features' do
  # Fixture path; examples are skipped (not failed) when the PDF is absent.
  let(:pdf_file) { test_document_path('pdf/sample_contract.pdf') }

  before { skip "Test PDF not available at #{pdf_file}" unless File.exist?(pdf_file) }

  # Extracts the fixture with both page features switched on.
  #
  # @param extra [Hash] additional PageConfig keyword arguments (e.g. marker_format)
  # @return the object returned by Kreuzberg.extract_file_sync
  def extract_pages_and_markers(**extra)
    page_settings = Kreuzberg::Config::PageConfig.new(
      extract_pages: true,
      insert_page_markers: true,
      **extra
    )
    Kreuzberg.extract_file_sync(
      path: pdf_file,
      config: Kreuzberg::Config::Extraction.new(pages: page_settings)
    )
  end

  it 'extract pages and insert markers together' do
    extracted = extract_pages_and_markers

    expect(extracted).not_to be_nil
    expect(extracted.pages).not_to be_nil
    expect(extracted.pages.length).to be > 0
    expect(extracted.content).to include('<!-- PAGE')
  end

  it 'extract pages with custom marker format' do
    extracted = extract_pages_and_markers(marker_format: '[PAGE {page_num}]')

    expect(extracted.pages).not_to be_nil
    expect(extracted.pages.length).to be > 0
    expect(extracted.content).to include('[PAGE')
  end

  it 'page extraction consistency between array and markers' do
    extracted = extract_pages_and_markers

    expect(extracted.pages).not_to be_nil
    expect(extracted.content).not_to be_nil

    # One default-format marker is expected per entry in the pages array.
    markers_found = extracted.content.scan('<!-- PAGE').length
    expect(extracted.pages.length).to eq(markers_found)
  end
end
|
|
452
|
+
|
|
453
|
+
# Unit-level checks of Kreuzberg::Config::PageConfig: defaults, explicit
# values, hash conversion and boolean coercion. No fixture file is needed.
describe 'PageConfig' do
  it 'creates with default values' do
    defaults = Kreuzberg::Config::PageConfig.new

    expect(defaults).to have_attributes(
      extract_pages: be(false),
      insert_page_markers: be(false),
      marker_format: match(/<!-- PAGE/)
    )
  end

  it 'creates with custom values' do
    customised = Kreuzberg::Config::PageConfig.new(
      extract_pages: true,
      insert_page_markers: true,
      marker_format: 'CUSTOM_{page_num}'
    )

    expect(customised).to have_attributes(
      extract_pages: be(true),
      insert_page_markers: be(true),
      marker_format: eq('CUSTOM_{page_num}')
    )
  end

  it 'converts to hash' do
    as_hash = Kreuzberg::Config::PageConfig.new(
      extract_pages: true,
      insert_page_markers: false,
      marker_format: 'TEST_{page_num}'
    ).to_h

    expect(as_hash).to be_a(Hash)
    expect(as_hash[:extract_pages]).to be(true)
    expect(as_hash[:insert_page_markers]).to be(false)
    expect(as_hash[:marker_format]).to eq('TEST_{page_num}')
  end

  it 'handles boolean conversion' do
    # Integer inputs: the assertions below expect 1 to read back as true and
    # 0 as false.
    coerced = Kreuzberg::Config::PageConfig.new(extract_pages: 1, insert_page_markers: 0)

    expect(coerced.extract_pages).to be(true)
    expect(coerced.insert_page_markers).to be(false)
  end

  it 'preserves marker format default' do
    # Setting an unrelated option must leave marker_format at its default.
    format = Kreuzberg::Config::PageConfig.new(extract_pages: true).marker_format

    expect(format).not_to be_nil
    expect(format).to match(/<!-- PAGE/)
  end
end
|
|
506
|
+
|
|
507
|
+
# Checks how Kreuzberg::Config::Extraction carries its `pages` setting:
# as a PageConfig object, through to_h, and when given as a plain hash.
describe 'Integration Tests' do
  it 'extraction config includes pages config' do
    page_settings = Kreuzberg::Config::PageConfig.new(extract_pages: true)
    pages = Kreuzberg::Config::Extraction.new(pages: page_settings).pages

    expect(pages).not_to be_nil
    expect(pages).to be_a(Kreuzberg::Config::PageConfig)
    expect(pages.extract_pages).to be(true)
  end

  it 'extraction config to_h includes pages' do
    page_settings = Kreuzberg::Config::PageConfig.new(
      extract_pages: true,
      insert_page_markers: true,
      marker_format: 'CUSTOM_{page_num}'
    )

    serialized = Kreuzberg::Config::Extraction.new(pages: page_settings).to_h

    expect(serialized).to include(:pages)

    pages_hash = serialized[:pages]
    expect(pages_hash).to be_a(Hash)
    expect(pages_hash[:extract_pages]).to be(true)
    expect(pages_hash[:insert_page_markers]).to be(true)
    expect(pages_hash[:marker_format]).to eq('CUSTOM_{page_num}')
  end

  it 'accepts pages config as hash in extraction config' do
    raw_pages = {
      extract_pages: true,
      insert_page_markers: true,
      marker_format: 'HASH_{page_num}'
    }

    pages = Kreuzberg::Config::Extraction.new(pages: raw_pages).pages

    # A plain hash is expected to come back wrapped in a PageConfig.
    expect(pages).to be_a(Kreuzberg::Config::PageConfig)
    expect(pages.extract_pages).to be(true)
    expect(pages.insert_page_markers).to be(true)
    expect(pages.marker_format).to eq('HASH_{page_num}')
  end
end
|
|
550
|
+
end
|