kreuzberg 4.3.5-aarch64-linux

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82)
  1. checksums.yaml +7 -0
  2. data/.gitignore +14 -0
  3. data/.rspec +3 -0
  4. data/.rubocop.yaml +1 -0
  5. data/.rubocop.yml +543 -0
  6. data/Gemfile +8 -0
  7. data/Gemfile.lock +260 -0
  8. data/README.md +399 -0
  9. data/Rakefile +34 -0
  10. data/Steepfile +51 -0
  11. data/examples/async_patterns.rb +283 -0
  12. data/extconf.rb +60 -0
  13. data/kreuzberg.gemspec +253 -0
  14. data/lib/kreuzberg/api_proxy.rb +125 -0
  15. data/lib/kreuzberg/cache_api.rb +67 -0
  16. data/lib/kreuzberg/cli.rb +57 -0
  17. data/lib/kreuzberg/cli_proxy.rb +118 -0
  18. data/lib/kreuzberg/config.rb +1241 -0
  19. data/lib/kreuzberg/djot_content.rb +225 -0
  20. data/lib/kreuzberg/document_structure.rb +204 -0
  21. data/lib/kreuzberg/error_context.rb +136 -0
  22. data/lib/kreuzberg/errors.rb +116 -0
  23. data/lib/kreuzberg/extraction_api.rb +329 -0
  24. data/lib/kreuzberg/mcp_proxy.rb +176 -0
  25. data/lib/kreuzberg/ocr_backend_protocol.rb +40 -0
  26. data/lib/kreuzberg/post_processor_protocol.rb +15 -0
  27. data/lib/kreuzberg/result.rb +712 -0
  28. data/lib/kreuzberg/setup_lib_path.rb +99 -0
  29. data/lib/kreuzberg/types.rb +414 -0
  30. data/lib/kreuzberg/validator_protocol.rb +16 -0
  31. data/lib/kreuzberg/version.rb +5 -0
  32. data/lib/kreuzberg.rb +102 -0
  33. data/lib/kreuzberg_rb.so +0 -0
  34. data/lib/libpdfium.so +0 -0
  35. data/sig/kreuzberg/internal.rbs +184 -0
  36. data/sig/kreuzberg.rbs +1337 -0
  37. data/spec/binding/async_operations_spec.rb +473 -0
  38. data/spec/binding/batch_operations_spec.rb +677 -0
  39. data/spec/binding/batch_spec.rb +360 -0
  40. data/spec/binding/cache_spec.rb +227 -0
  41. data/spec/binding/cli_proxy_spec.rb +85 -0
  42. data/spec/binding/cli_spec.rb +55 -0
  43. data/spec/binding/config_result_spec.rb +377 -0
  44. data/spec/binding/config_spec.rb +419 -0
  45. data/spec/binding/config_validation_spec.rb +377 -0
  46. data/spec/binding/embeddings_spec.rb +816 -0
  47. data/spec/binding/error_handling_spec.rb +399 -0
  48. data/spec/binding/error_recovery_spec.rb +488 -0
  49. data/spec/binding/errors_spec.rb +66 -0
  50. data/spec/binding/font_config_spec.rb +220 -0
  51. data/spec/binding/images_spec.rb +732 -0
  52. data/spec/binding/keywords_extraction_spec.rb +600 -0
  53. data/spec/binding/metadata_types_spec.rb +1253 -0
  54. data/spec/binding/pages_extraction_spec.rb +550 -0
  55. data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
  56. data/spec/binding/plugins/postprocessor_spec.rb +269 -0
  57. data/spec/binding/plugins/validator_spec.rb +273 -0
  58. data/spec/binding/tables_spec.rb +650 -0
  59. data/spec/fixtures/config.toml +38 -0
  60. data/spec/fixtures/config.yaml +41 -0
  61. data/spec/fixtures/invalid_config.toml +3 -0
  62. data/spec/serialization_spec.rb +134 -0
  63. data/spec/smoke/package_spec.rb +177 -0
  64. data/spec/spec_helper.rb +40 -0
  65. data/spec/unit/config/chunking_config_spec.rb +213 -0
  66. data/spec/unit/config/embedding_config_spec.rb +343 -0
  67. data/spec/unit/config/extraction_config_spec.rb +434 -0
  68. data/spec/unit/config/font_config_spec.rb +285 -0
  69. data/spec/unit/config/hierarchy_config_spec.rb +314 -0
  70. data/spec/unit/config/image_extraction_config_spec.rb +209 -0
  71. data/spec/unit/config/image_preprocessing_config_spec.rb +230 -0
  72. data/spec/unit/config/keyword_config_spec.rb +229 -0
  73. data/spec/unit/config/language_detection_config_spec.rb +258 -0
  74. data/spec/unit/config/ocr_config_spec.rb +171 -0
  75. data/spec/unit/config/output_format_spec.rb +380 -0
  76. data/spec/unit/config/page_config_spec.rb +221 -0
  77. data/spec/unit/config/pdf_config_spec.rb +267 -0
  78. data/spec/unit/config/postprocessor_config_spec.rb +290 -0
  79. data/spec/unit/config/tesseract_config_spec.rb +181 -0
  80. data/spec/unit/config/token_reduction_config_spec.rb +251 -0
  81. data/test/metadata_types_test.rb +959 -0
  82. metadata +292 -0
@@ -0,0 +1,650 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'spec_helper'
4
+ require 'tempfile'
5
+ require 'fileutils'
6
+
7
+ RSpec.describe 'Table Extraction Quality' do
8
+ describe 'table structure extraction' do
9
+ let(:pdf_path) { test_document_path('pdf/table_document.pdf') }
10
+
11
+ it 'extracts table rows, columns, and headers' do
12
+ config = Kreuzberg::Config::Extraction.new
13
+
14
+ begin
15
+ result = Kreuzberg.extract_file(path: pdf_path, config: config)
16
+ rescue Kreuzberg::Errors::ValidationError
17
+ skip 'Test PDF file not available'
18
+ end
19
+
20
+ expect(result).not_to be_nil
21
+ expect(result.tables).not_to be_nil
22
+ unless result.tables.empty?
23
+ table = result.tables.first
24
+ expect(table).to be_a(Kreuzberg::Result::Table)
25
+ expect(table.cells).not_to be_nil
26
+ expect(table.cells).to be_a(Array)
27
+ end
28
+ end
29
+
30
+ it 'returns cell arrays with consistent structure' do
31
+ config = Kreuzberg::Config::Extraction.new
32
+
33
+ begin
34
+ result = Kreuzberg.extract_file(path: pdf_path, config: config)
35
+ rescue Kreuzberg::Errors::ValidationError
36
+ skip 'Test PDF file not available'
37
+ end
38
+
39
+ if result.tables && !result.tables.empty?
40
+ expect(result.tables).to all(
41
+ be_a(Kreuzberg::Result::Table).and(
42
+ have_attributes(cells: be_a(Array))
43
+ )
44
+ )
45
+ end
46
+ end
47
+
48
+ it 'provides page number for each table' do
49
+ config = Kreuzberg::Config::Extraction.new
50
+
51
+ begin
52
+ result = Kreuzberg.extract_file(path: pdf_path, config: config)
53
+ rescue Kreuzberg::Errors::ValidationError
54
+ skip 'Test PDF file not available'
55
+ end
56
+
57
+ if result.tables && !result.tables.empty?
58
+ result.tables.each do |table|
59
+ expect(table.page_number).not_to be_nil
60
+ expect(table.page_number).to be_a(Integer)
61
+ expect(table.page_number).to be > 0
62
+ end
63
+ end
64
+ end
65
+
66
+ it 'detects proper row and column counts' do
67
+ config = Kreuzberg::Config::Extraction.new
68
+
69
+ begin
70
+ result = Kreuzberg.extract_file(path: pdf_path, config: config)
71
+ rescue Kreuzberg::Errors::ValidationError
72
+ skip 'Test PDF file not available'
73
+ end
74
+
75
+ if result.tables && !result.tables.empty?
76
+ table = result.tables.first
77
+ unless table.cells.empty?
78
+ first_row_cols = table.cells.first.length
79
+ expect(first_row_cols).to be > 0
80
+ expect(first_row_cols).to be_a(Integer)
81
+ end
82
+ end
83
+ end
84
+ end
85
+
86
+ describe 'table markdown conversion accuracy' do
87
+ let(:pdf_path) { test_document_path('pdf/table_document.pdf') }
88
+
89
+ it 'generates markdown representation for tables' do
90
+ config = Kreuzberg::Config::Extraction.new
91
+
92
+ begin
93
+ result = Kreuzberg.extract_file(path: pdf_path, config: config)
94
+ rescue Kreuzberg::Errors::ValidationError
95
+ skip 'Test PDF file not available'
96
+ end
97
+
98
+ if result.tables && !result.tables.empty?
99
+ result.tables.each do |table|
100
+ expect(table.markdown).not_to be_nil
101
+ expect(table.markdown).to be_a(String)
102
+ # If table has cells, markdown must not be empty
103
+ if table.cells && !table.cells.empty?
104
+ expect(table.markdown).not_to be_empty, 'Markdown must not be empty when table has cells'
105
+ end
106
+ end
107
+ end
108
+ end
109
+
110
+ it 'markdown contains pipe delimiters for table structure' do
111
+ config = Kreuzberg::Config::Extraction.new
112
+
113
+ begin
114
+ result = Kreuzberg.extract_file(path: pdf_path, config: config)
115
+ rescue Kreuzberg::Errors::ValidationError
116
+ skip 'Test PDF file not available'
117
+ end
118
+
119
+ if result.tables && !result.tables.empty?
120
+ result.tables.each do |table|
121
+ # If table has cells and markdown, it must contain pipes
122
+ if table.cells && !table.cells.empty? && !table.markdown.empty?
123
+ expect(table.markdown).to include('|'), 'Markdown table must include pipe separators for cells'
124
+ end
125
+ end
126
+ end
127
+ end
128
+
129
+ it 'markdown format is consistent with cell data' do
130
+ config = Kreuzberg::Config::Extraction.new
131
+
132
+ begin
133
+ result = Kreuzberg.extract_file(path: pdf_path, config: config)
134
+ rescue Kreuzberg::Errors::ValidationError
135
+ skip 'Test PDF file not available'
136
+ end
137
+
138
+ if result.tables && !result.tables.empty?
139
+ table = result.tables.first
140
+ unless table.cells.empty?
141
+ row_count = table.cells.length
142
+ expect(row_count).to be > 0
143
+ expect(row_count).to be_a(Integer)
144
+ end
145
+ end
146
+ end
147
+ end
148
+
149
+ describe 'cell content preservation' do
150
+ let(:pdf_path) { test_document_path('pdf/table_document.pdf') }
151
+
152
+ it 'preserves text content in cells accurately' do
153
+ config = Kreuzberg::Config::Extraction.new
154
+
155
+ begin
156
+ result = Kreuzberg.extract_file(path: pdf_path, config: config)
157
+ rescue Kreuzberg::Errors::ValidationError
158
+ skip 'Test PDF file not available'
159
+ end
160
+
161
+ if result.tables && !result.tables.empty?
162
+ result.tables.each do |table|
163
+ table.cells.each do |row|
164
+ row.each do |cell|
165
+ expect(cell).to be_a(String)
166
+ expect(cell).not_to be_nil
167
+ end
168
+ end
169
+ end
170
+ end
171
+ end
172
+
173
+ it 'handles cells with numeric content' do
174
+ config = Kreuzberg::Config::Extraction.new
175
+
176
+ begin
177
+ result = Kreuzberg.extract_file(path: pdf_path, config: config)
178
+ rescue Kreuzberg::Errors::ValidationError
179
+ skip 'Test PDF file not available'
180
+ end
181
+
182
+ if result.tables && !result.tables.empty?
183
+ result.tables.each do |table|
184
+ table.cells.each do |row|
185
+ row.each do |cell|
186
+ expect(cell).not_to be_nil
187
+ end
188
+ end
189
+ end
190
+ end
191
+ end
192
+
193
+ it 'preserves whitespace and formatting in cells' do
194
+ config = Kreuzberg::Config::Extraction.new
195
+
196
+ begin
197
+ result = Kreuzberg.extract_file(path: pdf_path, config: config)
198
+ rescue Kreuzberg::Errors::ValidationError
199
+ skip 'Test PDF file not available'
200
+ end
201
+
202
+ if result.tables && !result.tables.empty?
203
+ result.tables.each do |table|
204
+ expect(table.cells).not_to be_empty
205
+ expect(table.cells).to all(all(be_a(String)))
206
+ end
207
+ end
208
+ end
209
+
210
+ it 'handles empty cells correctly' do
211
+ config = Kreuzberg::Config::Extraction.new
212
+
213
+ begin
214
+ result = Kreuzberg.extract_file(path: pdf_path, config: config)
215
+ rescue Kreuzberg::Errors::ValidationError
216
+ skip 'Test PDF file not available'
217
+ end
218
+
219
+ if result.tables && !result.tables.empty?
220
+ result.tables.each do |table|
221
+ expect(table.cells).to be_a(Array)
222
+ expect(table.cells).to all(all(be_a(String)))
223
+ end
224
+ end
225
+ end
226
+ end
227
+
228
+ describe 'format-specific table handling' do
229
+ let(:pdf_path) { test_document_path('pdf/table_document.pdf') }
230
+
231
+ it 'extracts tables from PDF documents' do
232
+ config = Kreuzberg::Config::Extraction.new
233
+
234
+ begin
235
+ result = Kreuzberg.extract_file(path: pdf_path, config: config)
236
+ rescue Kreuzberg::Errors::ValidationError
237
+ skip 'Test PDF file not available'
238
+ end
239
+
240
+ expect(result).not_to be_nil
241
+ expect(result.tables).not_to be_nil
242
+ expect(result.tables).to be_a(Array)
243
+ end
244
+
245
+ it 'extracts tables from Office formats' do
246
+ config = Kreuzberg::Config::Extraction.new
247
+
248
+ begin
249
+ result = Kreuzberg.extract_file(path: test_document_path('docx/extraction_test.docx'), config: config)
250
+ expect(result).not_to be_nil
251
+ rescue Kreuzberg::Errors::ValidationError
252
+ skip 'DOCX test file not available'
253
+ end
254
+ end
255
+
256
+ it 'handles PDF tables with different layouts' do
257
+ config = Kreuzberg::Config::Extraction.new
258
+
259
+ begin
260
+ result = Kreuzberg.extract_file(path: pdf_path, config: config)
261
+ rescue Kreuzberg::Errors::ValidationError
262
+ skip 'Test PDF file not available'
263
+ end
264
+
265
+ if result.tables && !result.tables.empty?
266
+ result.tables.each do |table|
267
+ expect(table.cells).not_to be_nil
268
+ expect(table.markdown).not_to be_nil
269
+ end
270
+ end
271
+ end
272
+
273
+ it 'respects extraction configuration for tables' do
274
+ config = Kreuzberg::Config::Extraction.new
275
+
276
+ begin
277
+ result = Kreuzberg.extract_file(path: pdf_path, config: config)
278
+ rescue Kreuzberg::Errors::ValidationError
279
+ skip 'Test PDF file not available'
280
+ end
281
+
282
+ expect(result).not_to be_nil
283
+ expect(result.tables).not_to be_nil
284
+ end
285
+ end
286
+
287
+ describe 'table boundary detection' do
288
+ let(:pdf_path) { test_document_path('pdf/table_document.pdf') }
289
+
290
+ it 'correctly identifies table boundaries' do
291
+ config = Kreuzberg::Config::Extraction.new
292
+
293
+ begin
294
+ result = Kreuzberg.extract_file(path: pdf_path, config: config)
295
+ rescue Kreuzberg::Errors::ValidationError
296
+ skip 'Test PDF file not available'
297
+ end
298
+
299
+ if result.tables && !result.tables.empty?
300
+ result.tables.each do |table|
301
+ expect(table.cells.length).to be > 0
302
+ table.cells.each do |row|
303
+ expect(row.length).to be > 0
304
+ end
305
+ end
306
+ end
307
+ end
308
+
309
+ it 'separates adjacent tables correctly' do
310
+ config = Kreuzberg::Config::Extraction.new
311
+
312
+ begin
313
+ result = Kreuzberg.extract_file(path: pdf_path, config: config)
314
+ rescue Kreuzberg::Errors::ValidationError
315
+ skip 'Test PDF file not available'
316
+ end
317
+
318
+ if result.tables && result.tables.length > 1
319
+ table_count = result.tables.length
320
+ expect(table_count).to be > 1
321
+ result.tables.each do |table|
322
+ expect(table.cells).not_to be_nil
323
+ expect(table.cells.length).to be > 0
324
+ end
325
+ end
326
+ end
327
+
328
+ it 'maintains consistent column alignment across rows' do
329
+ config = Kreuzberg::Config::Extraction.new
330
+
331
+ begin
332
+ result = Kreuzberg.extract_file(path: pdf_path, config: config)
333
+ rescue Kreuzberg::Errors::ValidationError
334
+ skip 'Test PDF file not available'
335
+ end
336
+
337
+ if result.tables && !result.tables.empty?
338
+ table = result.tables.first
339
+ if table.cells.length > 1
340
+ first_row_cols = table.cells.first.length
341
+ table.cells.each do |row|
342
+ expect(row.length).to eq(first_row_cols)
343
+ end
344
+ end
345
+ end
346
+ end
347
+ end
348
+
349
+ describe 'performance with large tables' do
350
+ let(:pdf_path) { test_document_path('pdf/table_document.pdf') }
351
+
352
+ it 'extracts large tables with 100+ rows efficiently' do
353
+ config = Kreuzberg::Config::Extraction.new
354
+
355
+ begin
356
+ result = Kreuzberg.extract_file(path: pdf_path, config: config)
357
+ rescue Kreuzberg::Errors::ValidationError
358
+ skip 'Test PDF file not available'
359
+ end
360
+
361
+ expect(result).not_to be_nil
362
+ expect(result.tables).to be_a(Array)
363
+ end
364
+
365
+ it 'maintains data integrity for large tables' do
366
+ config = Kreuzberg::Config::Extraction.new
367
+
368
+ begin
369
+ result = Kreuzberg.extract_file(path: pdf_path, config: config)
370
+ rescue Kreuzberg::Errors::ValidationError
371
+ skip 'Test PDF file not available'
372
+ end
373
+
374
+ if result.tables && !result.tables.empty?
375
+ result.tables.each do |table|
376
+ expect(table.cells).not_to be_nil
377
+ expect(table.cells).to all(all(be_a(String)))
378
+ end
379
+ end
380
+ end
381
+
382
+ it 'handles tables with varying column counts' do
383
+ config = Kreuzberg::Config::Extraction.new
384
+
385
+ begin
386
+ result = Kreuzberg.extract_file(path: pdf_path, config: config)
387
+ rescue Kreuzberg::Errors::ValidationError
388
+ skip 'Test PDF file not available'
389
+ end
390
+
391
+ if result.tables && !result.tables.empty?
392
+ result.tables.each do |table|
393
+ expect(table.cells.length).to be >= 0
394
+ end
395
+ end
396
+ end
397
+ end
398
+
399
+ describe 'table serialization and conversion' do
400
+ let(:pdf_path) { test_document_path('pdf/table_document.pdf') }
401
+
402
+ it 'serializes table to hash correctly' do
403
+ config = Kreuzberg::Config::Extraction.new
404
+
405
+ begin
406
+ result = Kreuzberg.extract_file(path: pdf_path, config: config)
407
+ rescue Kreuzberg::Errors::ValidationError
408
+ skip 'Test PDF file not available'
409
+ end
410
+
411
+ if result.tables && !result.tables.empty?
412
+ table = result.tables.first
413
+ table_hash = table.to_h
414
+
415
+ expect(table_hash).to be_a(Hash)
416
+ expect(table_hash).to have_key(:cells)
417
+ expect(table_hash).to have_key(:markdown)
418
+ expect(table_hash).to have_key(:page_number)
419
+ end
420
+ end
421
+
422
+ it 'preserves table data through serialization' do
423
+ config = Kreuzberg::Config::Extraction.new
424
+
425
+ begin
426
+ result = Kreuzberg.extract_file(path: pdf_path, config: config)
427
+ rescue Kreuzberg::Errors::ValidationError
428
+ skip 'Test PDF file not available'
429
+ end
430
+
431
+ if result.tables && !result.tables.empty?
432
+ table = result.tables.first
433
+ table_hash = table.to_h
434
+
435
+ expect(table_hash[:cells]).to eq(table.cells)
436
+ expect(table_hash[:markdown]).to eq(table.markdown)
437
+ expect(table_hash[:page_number]).to eq(table.page_number)
438
+ end
439
+ end
440
+
441
+ it 'converts result with tables to JSON' do
442
+ config = Kreuzberg::Config::Extraction.new
443
+
444
+ begin
445
+ result = Kreuzberg.extract_file(path: pdf_path, config: config)
446
+ rescue Kreuzberg::Errors::ValidationError
447
+ skip 'Test PDF file not available'
448
+ end
449
+
450
+ expect(result).not_to be_nil
451
+ json_str = result.to_json
452
+ expect(json_str).to be_a(String)
453
+ expect(json_str.length).to be > 0
454
+ end
455
+ end
456
+
457
+ describe 'table extraction with page context' do
458
+ let(:pdf_path) { test_document_path('pdf/table_document.pdf') }
459
+
460
+ it 'associates tables with correct page numbers' do
461
+ config = Kreuzberg::Config::Extraction.new(
462
+ pages: Kreuzberg::Config::PageConfig.new(extract_pages: true)
463
+ )
464
+
465
+ begin
466
+ result = Kreuzberg.extract_file(path: pdf_path, config: config)
467
+ rescue Kreuzberg::Errors::ValidationError
468
+ skip 'Test PDF file not available'
469
+ end
470
+
471
+ if result.tables && !result.tables.empty?
472
+ result.tables.each do |table|
473
+ expect(table.page_number).to be > 0
474
+ expect(table.page_number).to be <= result.page_count
475
+ end
476
+ end
477
+ end
478
+
479
+ it 'extracts tables from specific pages when available' do
480
+ config = Kreuzberg::Config::Extraction.new(
481
+ pages: Kreuzberg::Config::PageConfig.new(extract_pages: true)
482
+ )
483
+
484
+ begin
485
+ result = Kreuzberg.extract_file(path: pdf_path, config: config)
486
+ rescue Kreuzberg::Errors::ValidationError
487
+ skip 'Test PDF file not available'
488
+ end
489
+
490
+ if result.pages && !result.pages.empty?
491
+ result.pages.each do |page|
492
+ expect(page.page_number).not_to be_nil
493
+ next unless page.tables
494
+
495
+ page.tables.each do |table|
496
+ expect(table.page_number).to eq(page.page_number)
497
+ end
498
+ end
499
+ end
500
+ end
501
+
502
+ it 'maintains table consistency across page and global results' do
503
+ config = Kreuzberg::Config::Extraction.new(
504
+ pages: Kreuzberg::Config::PageConfig.new(extract_pages: true)
505
+ )
506
+
507
+ begin
508
+ result = Kreuzberg.extract_file(path: pdf_path, config: config)
509
+ rescue Kreuzberg::Errors::ValidationError
510
+ skip 'Test PDF file not available'
511
+ end
512
+
513
+ if result.tables && !result.tables.empty? && result.pages && !result.pages.empty?
514
+ global_table_count = result.tables.length
515
+ page_table_count = result.pages.sum { |page| page.tables&.length || 0 }
516
+
517
+ expect(page_table_count).to eq(global_table_count)
518
+ end
519
+ end
520
+ end
521
+
522
+ describe 'table handling edge cases' do
523
+ let(:pdf_path) { test_document_path('pdf/table_document.pdf') }
524
+
525
+ it 'handles documents with no tables gracefully' do
526
+ config = Kreuzberg::Config::Extraction.new
527
+
528
+ # Create a temporary text file for this test
529
+ file = Tempfile.new(['no_tables_test', '.txt'])
530
+ file.write('This is a text document without any tables.')
531
+ file.close
532
+
533
+ begin
534
+ result = Kreuzberg.extract_file(path: file.path, config: config)
535
+ expect(result).not_to be_nil
536
+ expect(result.tables).to be_a(Array) if result.tables
537
+ rescue Kreuzberg::Errors::IOError
538
+ skip 'Text file not available for testing'
539
+ ensure
540
+ FileUtils.rm_f(file.path)
541
+ end
542
+ end
543
+
544
+ it 'handles single-cell tables' do
545
+ config = Kreuzberg::Config::Extraction.new
546
+
547
+ begin
548
+ result = Kreuzberg.extract_file(path: pdf_path, config: config)
549
+ rescue Kreuzberg::Errors::ValidationError
550
+ skip 'Test PDF file not available'
551
+ end
552
+
553
+ if result.tables && !result.tables.empty?
554
+ result.tables.each do |table|
555
+ expect(table.cells).to be_a(Array)
556
+ end
557
+ end
558
+ end
559
+
560
+ it 'handles tables with long cell content' do
561
+ config = Kreuzberg::Config::Extraction.new
562
+
563
+ begin
564
+ result = Kreuzberg.extract_file(path: pdf_path, config: config)
565
+ rescue Kreuzberg::Errors::ValidationError
566
+ skip 'Test PDF file not available'
567
+ end
568
+
569
+ if result.tables && !result.tables.empty?
570
+ result.tables.each do |table|
571
+ table.cells.each do |row|
572
+ row.each do |cell|
573
+ expect(cell).to be_a(String)
574
+ expect(cell.length).to be >= 0
575
+ end
576
+ end
577
+ end
578
+ end
579
+ end
580
+
581
+ it 'handles tables with special characters' do
582
+ config = Kreuzberg::Config::Extraction.new
583
+
584
+ begin
585
+ result = Kreuzberg.extract_file(path: pdf_path, config: config)
586
+ rescue Kreuzberg::Errors::ValidationError
587
+ skip 'Test PDF file not available'
588
+ end
589
+
590
+ if result.tables && !result.tables.empty?
591
+ result.tables.each do |table|
592
+ expect(table.cells).to all(all(be_a(String)))
593
+ end
594
+ end
595
+ end
596
+ end
597
+
598
+ describe 'Table Struct validation' do
599
+ it 'creates Table struct with all fields' do
600
+ table = Kreuzberg::Result::Table.new(
601
+ cells: [%w[Header1 Header2], %w[Value1 Value2]],
602
+ markdown: '| Header1 | Header2 |\n|---------|--------|\n| Value1 | Value2 |',
603
+ page_number: 1
604
+ )
605
+
606
+ expect(table.cells).to eq([%w[Header1 Header2], %w[Value1 Value2]])
607
+ expect(table.markdown).to include('Header1')
608
+ expect(table.page_number).to eq(1)
609
+ end
610
+
611
+ it 'converts Table struct to hash' do
612
+ table = Kreuzberg::Result::Table.new(
613
+ cells: [%w[A B], %w[C D]],
614
+ markdown: '| A | B |\n|---|---|\n| C | D |',
615
+ page_number: 2
616
+ )
617
+
618
+ table_hash = table.to_h
619
+
620
+ expect(table_hash).to be_a(Hash)
621
+ expect(table_hash[:cells]).to eq([%w[A B], %w[C D]])
622
+ expect(table_hash[:markdown]).to include('A')
623
+ expect(table_hash[:page_number]).to eq(2)
624
+ end
625
+
626
+ it 'handles Table struct with empty cells' do
627
+ table = Kreuzberg::Result::Table.new(
628
+ cells: [],
629
+ markdown: '',
630
+ page_number: 1
631
+ )
632
+
633
+ expect(table.cells).to eq([])
634
+ expect(table.markdown).to eq('')
635
+ expect(table.page_number).to eq(1)
636
+ end
637
+
638
+ it 'handles Table struct with nil values' do
639
+ table = Kreuzberg::Result::Table.new(
640
+ cells: nil,
641
+ markdown: nil,
642
+ page_number: 0
643
+ )
644
+
645
+ expect(table.cells).to be_nil
646
+ expect(table.markdown).to be_nil
647
+ expect(table.page_number).to eq(0)
648
+ end
649
+ end
650
+ end
@@ -0,0 +1,38 @@
1
+
2
+ use_cache = false
3
+ enable_quality_processing = true
4
+ force_ocr = true
5
+
6
+ [ocr]
7
+ backend = "tesseract"
8
+ language = "deu"
9
+
10
+ [chunking]
11
+ max_chars = 500
12
+ max_overlap = 100
13
+ preset = "fast"
14
+
15
+ [language_detection]
16
+ enabled = true
17
+ min_confidence = 0.9
18
+
19
+ [pdf_options]
20
+ extract_images = true
21
+ passwords = ["secret", "backup"]
22
+ extract_metadata = true
23
+
24
+ [image_extraction]
25
+ extract_images = true
26
+ target_dpi = 600
27
+ max_image_dimension = 2000
28
+ auto_adjust_dpi = false
29
+ min_dpi = 150
30
+ max_dpi = 600
31
+
32
+ [postprocessor]
33
+ enabled = true
34
+ enabled_processors = ["quality", "formatting"]
35
+
36
+ [token_reduction]
37
+ mode = "moderate"
38
+ preserve_important_words = true