kreuzberg 4.3.5-aarch64-linux

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +14 -0
  3. data/.rspec +3 -0
  4. data/.rubocop.yaml +1 -0
  5. data/.rubocop.yml +543 -0
  6. data/Gemfile +8 -0
  7. data/Gemfile.lock +260 -0
  8. data/README.md +399 -0
  9. data/Rakefile +34 -0
  10. data/Steepfile +51 -0
  11. data/examples/async_patterns.rb +283 -0
  12. data/extconf.rb +60 -0
  13. data/kreuzberg.gemspec +253 -0
  14. data/lib/kreuzberg/api_proxy.rb +125 -0
  15. data/lib/kreuzberg/cache_api.rb +67 -0
  16. data/lib/kreuzberg/cli.rb +57 -0
  17. data/lib/kreuzberg/cli_proxy.rb +118 -0
  18. data/lib/kreuzberg/config.rb +1241 -0
  19. data/lib/kreuzberg/djot_content.rb +225 -0
  20. data/lib/kreuzberg/document_structure.rb +204 -0
  21. data/lib/kreuzberg/error_context.rb +136 -0
  22. data/lib/kreuzberg/errors.rb +116 -0
  23. data/lib/kreuzberg/extraction_api.rb +329 -0
  24. data/lib/kreuzberg/mcp_proxy.rb +176 -0
  25. data/lib/kreuzberg/ocr_backend_protocol.rb +40 -0
  26. data/lib/kreuzberg/post_processor_protocol.rb +15 -0
  27. data/lib/kreuzberg/result.rb +712 -0
  28. data/lib/kreuzberg/setup_lib_path.rb +99 -0
  29. data/lib/kreuzberg/types.rb +414 -0
  30. data/lib/kreuzberg/validator_protocol.rb +16 -0
  31. data/lib/kreuzberg/version.rb +5 -0
  32. data/lib/kreuzberg.rb +102 -0
  33. data/lib/kreuzberg_rb.so +0 -0
  34. data/lib/libpdfium.so +0 -0
  35. data/sig/kreuzberg/internal.rbs +184 -0
  36. data/sig/kreuzberg.rbs +1337 -0
  37. data/spec/binding/async_operations_spec.rb +473 -0
  38. data/spec/binding/batch_operations_spec.rb +677 -0
  39. data/spec/binding/batch_spec.rb +360 -0
  40. data/spec/binding/cache_spec.rb +227 -0
  41. data/spec/binding/cli_proxy_spec.rb +85 -0
  42. data/spec/binding/cli_spec.rb +55 -0
  43. data/spec/binding/config_result_spec.rb +377 -0
  44. data/spec/binding/config_spec.rb +419 -0
  45. data/spec/binding/config_validation_spec.rb +377 -0
  46. data/spec/binding/embeddings_spec.rb +816 -0
  47. data/spec/binding/error_handling_spec.rb +399 -0
  48. data/spec/binding/error_recovery_spec.rb +488 -0
  49. data/spec/binding/errors_spec.rb +66 -0
  50. data/spec/binding/font_config_spec.rb +220 -0
  51. data/spec/binding/images_spec.rb +732 -0
  52. data/spec/binding/keywords_extraction_spec.rb +600 -0
  53. data/spec/binding/metadata_types_spec.rb +1253 -0
  54. data/spec/binding/pages_extraction_spec.rb +550 -0
  55. data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
  56. data/spec/binding/plugins/postprocessor_spec.rb +269 -0
  57. data/spec/binding/plugins/validator_spec.rb +273 -0
  58. data/spec/binding/tables_spec.rb +650 -0
  59. data/spec/fixtures/config.toml +38 -0
  60. data/spec/fixtures/config.yaml +41 -0
  61. data/spec/fixtures/invalid_config.toml +3 -0
  62. data/spec/serialization_spec.rb +134 -0
  63. data/spec/smoke/package_spec.rb +177 -0
  64. data/spec/spec_helper.rb +40 -0
  65. data/spec/unit/config/chunking_config_spec.rb +213 -0
  66. data/spec/unit/config/embedding_config_spec.rb +343 -0
  67. data/spec/unit/config/extraction_config_spec.rb +434 -0
  68. data/spec/unit/config/font_config_spec.rb +285 -0
  69. data/spec/unit/config/hierarchy_config_spec.rb +314 -0
  70. data/spec/unit/config/image_extraction_config_spec.rb +209 -0
  71. data/spec/unit/config/image_preprocessing_config_spec.rb +230 -0
  72. data/spec/unit/config/keyword_config_spec.rb +229 -0
  73. data/spec/unit/config/language_detection_config_spec.rb +258 -0
  74. data/spec/unit/config/ocr_config_spec.rb +171 -0
  75. data/spec/unit/config/output_format_spec.rb +380 -0
  76. data/spec/unit/config/page_config_spec.rb +221 -0
  77. data/spec/unit/config/pdf_config_spec.rb +267 -0
  78. data/spec/unit/config/postprocessor_config_spec.rb +290 -0
  79. data/spec/unit/config/tesseract_config_spec.rb +181 -0
  80. data/spec/unit/config/token_reduction_config_spec.rb +251 -0
  81. data/test/metadata_types_test.rb +959 -0
  82. metadata +292 -0
@@ -0,0 +1,550 @@
1
+ # frozen_string_literal: true
2
+
3
+ RSpec.describe 'Pages Extraction' do
4
describe 'Extract Pages' do
  # Sample contract PDF exercised by every example in this group.
  # NOTE(review): test_document_path is defined outside this file (presumably in
  # spec_helper) — confirm.
  let(:pdf_file) { test_document_path('pdf/sample_contract.pdf') }

  # Skip, rather than fail, when the fixture document is not checked out.
  before do
    skip "Test PDF not available at #{pdf_file}" unless File.exist?(pdf_file)
  end

  # Synchronously extracts the fixture PDF using the given page options.
  #
  # @param page_options [Hash] keyword options forwarded to
  #   Kreuzberg::Config::PageConfig.new
  # @return the value returned by Kreuzberg.extract_file_sync
  def extract_with(**page_options)
    config = Kreuzberg::Config::Extraction.new(
      pages: Kreuzberg::Config::PageConfig.new(**page_options)
    )
    Kreuzberg.extract_file_sync(path: pdf_file, config: config)
  end

  it 'returns pages array when extractPages is true' do
    result = extract_with(extract_pages: true)

    expect(result).not_to be_nil
    expect(result.pages).not_to be_nil
    expect(result.pages).to be_a(Array)
  end

  it 'returns page numbers for each page' do
    pages = extract_with(extract_pages: true).pages

    expect(pages).not_to be_nil
    pages.each { |page| expect(page.page_number).to be > 0 }
  end

  it 'returns page content for each page' do
    pages = extract_with(extract_pages: true).pages

    expect(pages).not_to be_nil
    pages.each { |page| expect(page.content).not_to be_nil }
  end

  it 'returns nil for pages when extractPages is false' do
    result = extract_with(extract_pages: false)

    expect(result).not_to be_nil
    expect(result.pages).to be_nil
  end

  it 'preserves page order' do
    pages = extract_with(extract_pages: true).pages

    # Assert presence first so the ordering check cannot pass vacuously when
    # no pages array is returned at all.
    expect(pages).not_to be_nil

    # Every page number must be strictly smaller than the one that follows it.
    # each_cons(2) yields nothing for zero or one page, so short documents pass.
    pages.map(&:page_number).each_cons(2) do |current, following|
      expect(current).to be < following
    end
  end
end
83
+
84
describe 'Insert Page Markers' do
  # Sample contract PDF exercised by every example in this group.
  # NOTE(review): test_document_path is defined outside this file (presumably in
  # spec_helper) — confirm.
  let(:pdf_file) { test_document_path('pdf/sample_contract.pdf') }

  # Skip, rather than fail, when the fixture document is not checked out.
  before do
    skip "Test PDF not available at #{pdf_file}" unless File.exist?(pdf_file)
  end

  # Synchronously extracts the fixture PDF with page markers switched on or off.
  #
  # @param enabled [Boolean] value passed as PageConfig's insert_page_markers
  # @return the value returned by Kreuzberg.extract_file_sync
  def extract_with_markers(enabled)
    config = Kreuzberg::Config::Extraction.new(
      pages: Kreuzberg::Config::PageConfig.new(insert_page_markers: enabled)
    )
    Kreuzberg.extract_file_sync(path: pdf_file, config: config)
  end

  it 'inserts page markers when insertPageMarkers is true' do
    result = extract_with_markers(true)

    expect(result).not_to be_nil
    expect(result.content).not_to be_nil
    expect(result.content).to include('<!-- PAGE')
  end

  it 'does not insert markers when insertPageMarkers is false' do
    result = extract_with_markers(false)

    expect(result).not_to be_nil
    # Default marker format should not appear when not enabled
    expect(result.content).not_to include('<!-- PAGE')
  end

  it 'contains page numbers in markers' do
    content = extract_with_markers(true).content

    expect(content).not_to be_nil
    # A bare include('1') is satisfied by any "1" in the document text, so
    # require the digit to be part of a marker instead.
    # NOTE(review): assumes the default format places the page number after
    # "<!-- PAGE" with no other digits in between — confirm against PageConfig.
    expect(content).to match(/<!-- PAGE\D*1\b/)
  end

  it 'inserts multiple markers for multi-page documents' do
    content = extract_with_markers(true).content

    expect(content).not_to be_nil
    # NOTE(review): despite the example name this only requires one marker;
    # tighten to "> 1" once the fixture is confirmed to be multi-page.
    marker_count = content.scan('<!-- PAGE').length
    expect(marker_count).to be > 0
  end
end
145
+
146
describe 'Custom Marker Format' do
  # Sample contract PDF exercised by every example in this group.
  # NOTE(review): test_document_path is defined outside this file (presumably in
  # spec_helper) — confirm.
  let(:pdf_file) { test_document_path('pdf/sample_contract.pdf') }

  # Skip, rather than fail, when the fixture document is not checked out.
  before do
    skip "Test PDF not available at #{pdf_file}" unless File.exist?(pdf_file)
  end

  # Synchronously extracts the fixture PDF with page markers enabled and the
  # supplied marker template.
  #
  # @param marker_format [String] template passed as PageConfig's marker_format
  # @return the value returned by Kreuzberg.extract_file_sync
  def extract_with_format(marker_format)
    config = Kreuzberg::Config::Extraction.new(
      pages: Kreuzberg::Config::PageConfig.new(
        insert_page_markers: true,
        marker_format: marker_format
      )
    )
    Kreuzberg.extract_file_sync(path: pdf_file, config: config)
  end

  it 'uses custom marker format when specified' do
    result = extract_with_format('=== PAGE {page_num} ===')

    expect(result).not_to be_nil
    expect(result.content).not_to be_nil
    expect(result.content).to include('=== PAGE')
  end

  it 'replaces page_num placeholder in custom format' do
    content = extract_with_format('[Page Number: {page_num}]').content

    expect(content).not_to be_nil
    expect(content).to include('[Page Number:')
    # The raw placeholder must have been substituted, not copied through.
    expect(content).not_to include('{page_num}')
  end

  it 'handles simple custom format' do
    content = extract_with_format('PAGE_{page_num}').content

    expect(content).not_to be_nil
    expect(content).to include('PAGE_')
  end

  it 'handles custom format with line separators' do
    content = extract_with_format("\n---PAGE {page_num}---\n").content

    expect(content).not_to be_nil
    expect(content).to include('---PAGE')
  end

  it 'overrides default marker format' do
    content = extract_with_format('CUSTOM_PAGE_{page_num}').content

    expect(content).not_to be_nil
    expect(content).to include('CUSTOM_PAGE_')
    # "Overrides" means the default marker is gone, not merely that the
    # custom one is present.
    expect(content).not_to include('<!-- PAGE')
  end
end
239
+
240
describe 'Multi-Page PDF' do
  # Sample contract PDF exercised by every example in this group.
  # NOTE(review): test_document_path is defined outside this file (presumably in
  # spec_helper) — confirm.
  let(:pdf_file) { test_document_path('pdf/sample_contract.pdf') }

  # Skip, rather than fail, when the fixture document is not checked out.
  before do
    skip "Test PDF not available at #{pdf_file}" unless File.exist?(pdf_file)
  end

  # Synchronously extracts the fixture PDF using the given page options.
  #
  # @param page_options [Hash] keyword options forwarded to
  #   Kreuzberg::Config::PageConfig.new
  # @return the value returned by Kreuzberg.extract_file_sync
  def extract_with(**page_options)
    extraction = Kreuzberg::Config::Extraction.new(
      pages: Kreuzberg::Config::PageConfig.new(**page_options)
    )
    Kreuzberg.extract_file_sync(path: pdf_file, config: extraction)
  end

  it 'produces multiple pages from multi-page PDF' do
    extracted_pages = extract_with(extract_pages: true).pages

    expect(extracted_pages).not_to be_nil
    expect(extracted_pages.length).to be > 0
  end

  it 'page numbers are sequential' do
    extracted_pages = extract_with(extract_pages: true).pages

    expect(extracted_pages).not_to be_nil
    # Page numbers are expected to be 1-based and gap-free.
    extracted_pages.each_with_index do |entry, position|
      expect(entry.page_number).to eq(position + 1)
    end
  end

  it 'each page has content' do
    extracted_pages = extract_with(extract_pages: true).pages

    expect(extracted_pages).not_to be_nil
    extracted_pages.each do |entry|
      expect(entry.content).not_to be_nil
      expect(entry.content.strip).not_to be_empty
    end
  end

  it 'with markers contains all pages' do
    text = extract_with(insert_page_markers: true).content

    expect(text).not_to be_nil
    occurrences = text.scan('<!-- PAGE').length
    expect(occurrences).to be >= 1
  end
end
303
+
304
describe 'Page Content Structure Validation' do
  # Sample contract PDF exercised by every example in this group.
  # NOTE(review): test_document_path is defined outside this file (presumably in
  # spec_helper) — confirm.
  let(:pdf_file) { test_document_path('pdf/sample_contract.pdf') }

  # Skip, rather than fail, when the fixture document is not checked out.
  before do
    skip "Test PDF not available at #{pdf_file}" unless File.exist?(pdf_file)
  end

  # Synchronously extracts the fixture PDF with per-page extraction enabled and
  # returns the resulting pages collection.
  def extracted_pages
    extraction = Kreuzberg::Config::Extraction.new(
      pages: Kreuzberg::Config::PageConfig.new(extract_pages: true)
    )
    Kreuzberg.extract_file_sync(path: pdf_file, config: extraction).pages
  end

  it 'validates page structure' do
    page_list = extracted_pages

    expect(page_list).not_to be_nil
    page_list.each do |entry|
      expect(entry.content).not_to be_nil
      expect(entry.page_number).to be > 0
    end
  end

  it 'page content has required fields' do
    page_list = extracted_pages

    expect(page_list).not_to be_nil
    page_list.each do |entry|
      expect(entry.page_number).to be > 0
      expect(entry.content).not_to be_nil

      # is_blank is only checked when the page object exposes it; it should be
      # nil or a boolean.
      next unless entry.respond_to?(:is_blank)

      expect(entry.is_blank).to be_nil.or(be(true).or(be(false)))
    end
  end

  it 'page content with tables preserves table data' do
    page_list = extracted_pages

    expect(page_list).not_to be_nil
    page_list.each do |entry|
      # Tables in page content are optional
      next unless entry.respond_to?(:tables) && entry.tables

      expect(entry.tables).to be_an(Array)
    end
  end

  it 'page content with images preserves image data' do
    page_list = extracted_pages

    expect(page_list).not_to be_nil
    page_list.each do |entry|
      # Images in page content are optional
      next unless entry.respond_to?(:images) && entry.images

      expect(entry.images).to be_an(Array)
    end
  end

  it 'page content is not empty' do
    page_list = extracted_pages

    expect(page_list).not_to be_nil
    # At least one page must carry non-whitespace text.
    any_text = page_list.any? { |entry| entry.content && !entry.content.strip.empty? }
    expect(any_text).to be true
  end
end
390
+
391
describe 'Combined Features' do
  # Sample contract PDF exercised by every example in this group.
  # NOTE(review): test_document_path is defined outside this file (presumably in
  # spec_helper) — confirm.
  let(:pdf_file) { test_document_path('pdf/sample_contract.pdf') }

  # Skip, rather than fail, when the fixture document is not checked out.
  before do
    skip "Test PDF not available at #{pdf_file}" unless File.exist?(pdf_file)
  end

  # Synchronously extracts the fixture PDF using the given page options.
  #
  # @param page_options [Hash] keyword options forwarded to
  #   Kreuzberg::Config::PageConfig.new
  # @return the value returned by Kreuzberg.extract_file_sync
  def extract_with(**page_options)
    extraction = Kreuzberg::Config::Extraction.new(
      pages: Kreuzberg::Config::PageConfig.new(**page_options)
    )
    Kreuzberg.extract_file_sync(path: pdf_file, config: extraction)
  end

  it 'extract pages and insert markers together' do
    outcome = extract_with(extract_pages: true, insert_page_markers: true)

    expect(outcome).not_to be_nil
    expect(outcome.pages).not_to be_nil
    expect(outcome.pages.length).to be > 0
    expect(outcome.content).to include('<!-- PAGE')
  end

  it 'extract pages with custom marker format' do
    outcome = extract_with(
      extract_pages: true,
      insert_page_markers: true,
      marker_format: '[PAGE {page_num}]'
    )

    expect(outcome.pages).not_to be_nil
    expect(outcome.pages.length).to be > 0
    expect(outcome.content).to include('[PAGE')
  end

  it 'page extraction consistency between array and markers' do
    outcome = extract_with(extract_pages: true, insert_page_markers: true)

    expect(outcome.pages).not_to be_nil
    expect(outcome.content).not_to be_nil

    # The number of default markers in the text must equal the number of
    # entries in the pages array.
    markers_found = outcome.content.scan('<!-- PAGE').length
    expect(outcome.pages.length).to eq(markers_found)
  end
end
452
+
453
describe 'PageConfig' do
  # Short alias for the class under test.
  let(:page_config_class) { Kreuzberg::Config::PageConfig }

  it 'creates with default values' do
    defaults = page_config_class.new

    expect(defaults.extract_pages).to be false
    expect(defaults.insert_page_markers).to be false
    expect(defaults.marker_format).to match(/<!-- PAGE/)
  end

  it 'creates with custom values' do
    customised = page_config_class.new(
      extract_pages: true,
      insert_page_markers: true,
      marker_format: 'CUSTOM_{page_num}'
    )

    expect(customised.extract_pages).to be true
    expect(customised.insert_page_markers).to be true
    expect(customised.marker_format).to eq('CUSTOM_{page_num}')
  end

  it 'converts to hash' do
    serialized = page_config_class.new(
      extract_pages: true,
      insert_page_markers: false,
      marker_format: 'TEST_{page_num}'
    ).to_h

    expect(serialized).to be_a(Hash)
    expect(serialized[:extract_pages]).to be true
    expect(serialized[:insert_page_markers]).to be false
    expect(serialized[:marker_format]).to eq('TEST_{page_num}')
  end

  it 'handles boolean conversion' do
    # NOTE(review): 0 is truthy in plain Ruby, so the `false` expectation below
    # relies on PageConfig applying its own integer-to-boolean coercion —
    # confirm against the implementation.
    coerced = page_config_class.new(extract_pages: 1, insert_page_markers: 0)

    expect(coerced.extract_pages).to be true
    expect(coerced.insert_page_markers).to be false
  end

  it 'preserves marker format default' do
    # Setting an unrelated option must leave the default marker format in place.
    partially_set = page_config_class.new(extract_pages: true)

    expect(partially_set.marker_format).not_to be_nil
    expect(partially_set.marker_format).to match(/<!-- PAGE/)
  end
end
506
+
507
describe 'Integration Tests' do
  it 'extraction config includes pages config' do
    nested = Kreuzberg::Config::Extraction.new(
      pages: Kreuzberg::Config::PageConfig.new(extract_pages: true)
    ).pages

    expect(nested).not_to be_nil
    expect(nested).to be_a(Kreuzberg::Config::PageConfig)
    expect(nested.extract_pages).to be true
  end

  it 'extraction config to_h includes pages' do
    page_settings = Kreuzberg::Config::PageConfig.new(
      extract_pages: true,
      insert_page_markers: true,
      marker_format: 'CUSTOM_{page_num}'
    )

    serialized = Kreuzberg::Config::Extraction.new(pages: page_settings).to_h

    # The nested page config must be serialized as a plain hash under :pages.
    expect(serialized).to include(:pages)
    pages_hash = serialized[:pages]
    expect(pages_hash).to be_a(Hash)
    expect(pages_hash[:extract_pages]).to be true
    expect(pages_hash[:insert_page_markers]).to be true
    expect(pages_hash[:marker_format]).to eq('CUSTOM_{page_num}')
  end

  it 'accepts pages config as hash in extraction config' do
    raw_settings = {
      extract_pages: true,
      insert_page_markers: true,
      marker_format: 'HASH_{page_num}'
    }

    # A plain hash passed as pages: is expected to come back as a PageConfig.
    nested = Kreuzberg::Config::Extraction.new(pages: raw_settings).pages

    expect(nested).to be_a(Kreuzberg::Config::PageConfig)
    expect(nested.extract_pages).to be true
    expect(nested.insert_page_markers).to be true
    expect(nested.marker_format).to eq('HASH_{page_num}')
  end
end
550
+ end