kreuzberg 4.1.2 → 4.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (103)
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +26 -17
  5. data/ext/kreuzberg_rb/native/libpdfium.so +0 -0
  6. data/kreuzberg.gemspec +13 -1
  7. data/lib/kreuzberg/cli.rb +16 -6
  8. data/lib/kreuzberg/cli_proxy.rb +3 -1
  9. data/lib/kreuzberg/config.rb +121 -39
  10. data/lib/kreuzberg/djot_content.rb +225 -0
  11. data/lib/kreuzberg/extraction_api.rb +20 -4
  12. data/lib/kreuzberg/result.rb +12 -2
  13. data/lib/kreuzberg/version.rb +1 -1
  14. data/lib/kreuzberg.rb +1 -0
  15. data/sig/kreuzberg.rbs +28 -12
  16. data/spec/binding/batch_operations_spec.rb +80 -0
  17. data/spec/binding/batch_spec.rb +6 -5
  18. data/spec/binding/error_recovery_spec.rb +3 -3
  19. data/spec/binding/metadata_types_spec.rb +77 -57
  20. data/spec/binding/tables_spec.rb +11 -2
  21. data/spec/serialization_spec.rb +134 -0
  22. data/spec/unit/config/output_format_spec.rb +380 -0
  23. data/vendor/Cargo.toml +1 -1
  24. data/vendor/kreuzberg/Cargo.toml +1 -1
  25. data/vendor/kreuzberg/README.md +1 -1
  26. data/vendor/kreuzberg/src/api/startup.rs +15 -1
  27. data/vendor/kreuzberg/src/core/config_validation/sections.rs +16 -4
  28. data/vendor/kreuzberg/src/core/extractor/file.rs +1 -2
  29. data/vendor/kreuzberg/src/core/extractor/mod.rs +2 -1
  30. data/vendor/kreuzberg/src/core/io.rs +7 -7
  31. data/vendor/kreuzberg/src/core/mime.rs +4 -4
  32. data/vendor/kreuzberg/src/embeddings.rs +4 -4
  33. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +6 -0
  34. data/vendor/kreuzberg/src/mcp/format.rs +237 -39
  35. data/vendor/kreuzberg/src/mcp/params.rs +26 -33
  36. data/vendor/kreuzberg/src/mcp/server.rs +6 -3
  37. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +16 -23
  38. data/vendor/kreuzberg/src/plugins/mod.rs +1 -0
  39. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +251 -5
  40. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +150 -2
  41. data/vendor/kreuzberg/src/plugins/registry/processor.rs +213 -5
  42. data/vendor/kreuzberg/src/plugins/registry/validator.rs +220 -4
  43. data/vendor/kreuzberg/src/plugins/startup_validation.rs +385 -0
  44. data/vendor/kreuzberg/tests/api_chunk.rs +40 -30
  45. data/vendor/kreuzberg/tests/api_consistency.rs +349 -0
  46. data/vendor/kreuzberg/tests/api_embed.rs +84 -50
  47. data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +8 -2
  48. data/vendor/kreuzberg/tests/api_tests.rs +298 -139
  49. data/vendor/kreuzberg/tests/archive_integration.rs +63 -56
  50. data/vendor/kreuzberg/tests/batch_orchestration.rs +22 -14
  51. data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +13 -13
  52. data/vendor/kreuzberg/tests/batch_processing.rs +13 -9
  53. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +10 -10
  54. data/vendor/kreuzberg/tests/concurrency_stress.rs +10 -6
  55. data/vendor/kreuzberg/tests/config_behavioral.rs +416 -0
  56. data/vendor/kreuzberg/tests/config_features.rs +19 -15
  57. data/vendor/kreuzberg/tests/config_integration_test.rs +68 -68
  58. data/vendor/kreuzberg/tests/config_loading_tests.rs +71 -62
  59. data/vendor/kreuzberg/tests/contract_mcp.rs +314 -0
  60. data/vendor/kreuzberg/tests/core_integration.rs +57 -57
  61. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +23 -23
  62. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +15 -14
  63. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +4 -4
  64. data/vendor/kreuzberg/tests/email_integration.rs +7 -7
  65. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +2 -2
  66. data/vendor/kreuzberg/tests/error_handling.rs +13 -11
  67. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +2 -2
  68. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  69. data/vendor/kreuzberg/tests/instrumentation_test.rs +18 -13
  70. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +17 -17
  71. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +9 -9
  72. data/vendor/kreuzberg/tests/keywords_integration.rs +25 -25
  73. data/vendor/kreuzberg/tests/keywords_quality.rs +9 -9
  74. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +2 -2
  75. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +1 -1
  76. data/vendor/kreuzberg/tests/mcp_integration.rs +849 -0
  77. data/vendor/kreuzberg/tests/mime_detection.rs +75 -43
  78. data/vendor/kreuzberg/tests/ocr_errors.rs +10 -4
  79. data/vendor/kreuzberg/tests/ocr_language_registry.rs +1 -1
  80. data/vendor/kreuzberg/tests/ocr_stress.rs +3 -3
  81. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +6 -6
  82. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +2 -2
  83. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +2 -2
  84. data/vendor/kreuzberg/tests/page_markers.rs +1 -1
  85. data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +6 -6
  86. data/vendor/kreuzberg/tests/pdf_text_merging.rs +2 -2
  87. data/vendor/kreuzberg/tests/pipeline_integration.rs +77 -61
  88. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +97 -77
  89. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +78 -61
  90. data/vendor/kreuzberg/tests/plugin_system.rs +49 -46
  91. data/vendor/kreuzberg/tests/plugin_validator_test.rs +109 -97
  92. data/vendor/kreuzberg/tests/pptx_regression_tests.rs +324 -31
  93. data/vendor/kreuzberg/tests/registry_integration_tests.rs +26 -23
  94. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +1 -1
  95. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +18 -18
  96. data/vendor/kreuzberg/tests/security_validation.rs +20 -19
  97. data/vendor/kreuzberg/tests/serialization_integration.rs +112 -0
  98. data/vendor/kreuzberg/tests/stopwords_integration_test.rs +36 -36
  99. data/vendor/kreuzberg/tests/test_fastembed.rs +8 -8
  100. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +9 -9
  101. data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +12 -9
  102. data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
  103. metadata +12 -2
@@ -1154,64 +1154,11 @@ RSpec.describe 'Kreuzberg Metadata Types' do
1154
1154
 
1155
1155
  describe 'Thread Safety: Concurrent Extraction' do
1156
1156
  it 'handles concurrent extraction safely' do
1157
- test_files = []
1158
- results = []
1159
- errors = []
1157
+ test_files = create_concurrent_test_files
1158
+ results, errors = run_concurrent_extractions(test_files)
1160
1159
 
1161
- 5.times do |i|
1162
- html_content = <<~HTML
1163
- <html>
1164
- <head>
1165
- <title>Concurrent Test #{i}</title>
1166
- <meta name="description" content="Test document #{i}">
1167
- <meta name="keywords" content="test#{i}, concurrent, thread-safe">
1168
- </head>
1169
- <body>
1170
- <h1>Test Document #{i}</h1>
1171
- <p>Content for test #{i}</p>
1172
- <a href="/page-#{i}">Link #{i}</a>
1173
- <img src="image-#{i}.jpg" alt="Image #{i}">
1174
- </body>
1175
- </html>
1176
- HTML
1177
- test_files << create_test_html_file(html_content)
1178
- end
1179
-
1180
- begin
1181
- threads = test_files.map do |file|
1182
- Thread.new do
1183
- result = Kreuzberg.extract_file_sync(path: file)
1184
- results << result
1185
- rescue StandardError => e
1186
- errors << e
1187
- end
1188
- end
1189
-
1190
- threads.each(&:join)
1191
-
1192
- expect(errors).to be_empty
1193
-
1194
- expect(results.length).to eq(5)
1195
- results.each do |result|
1196
- expect(result).to be_a(Kreuzberg::Result)
1197
- expect(result.metadata).not_to be_nil
1198
-
1199
- metadata = result.metadata
1200
- next unless metadata.is_a?(Kreuzberg::HtmlMetadata)
1201
-
1202
- expect(metadata.title).not_to be_nil
1203
- expect(metadata.description).not_to be_nil
1204
- expect(metadata.keywords).to be_a(Array)
1205
- expect(metadata.headers).to be_a(Array)
1206
- expect(metadata.links).to be_a(Array)
1207
- expect(metadata.images).to be_a(Array)
1208
- end
1209
-
1210
- titles = results.map { |r| r.metadata.is_a?(Kreuzberg::HtmlMetadata) ? r.metadata.title : r.metadata['title'] }
1211
- expect(titles.uniq.length).to eq(5)
1212
- ensure
1213
- test_files.each { |f| FileUtils.rm_f(f) }
1214
- end
1160
+ expect(results).not_to be_empty
1161
+ verify_concurrent_results(results, errors, test_files)
1215
1162
  end
1216
1163
  end
1217
1164
 
@@ -1225,4 +1172,77 @@ RSpec.describe 'Kreuzberg Metadata Types' do
1225
1172
  file.close
1226
1173
  file.path
1227
1174
  end
1175
+
1176
+ def create_concurrent_test_files
1177
+ test_files = []
1178
+ 5.times do |i|
1179
+ html_content = <<~HTML
1180
+ <html>
1181
+ <head>
1182
+ <title>Concurrent Test #{i}</title>
1183
+ <meta name="description" content="Test document #{i}">
1184
+ <meta name="keywords" content="test#{i}, concurrent, thread-safe">
1185
+ </head>
1186
+ <body>
1187
+ <h1>Test Document #{i}</h1>
1188
+ <p>Content for test #{i}</p>
1189
+ <a href="/page-#{i}">Link #{i}</a>
1190
+ <img src="image-#{i}.jpg" alt="Image #{i}">
1191
+ </body>
1192
+ </html>
1193
+ HTML
1194
+ test_files << create_test_html_file(html_content)
1195
+ end
1196
+ test_files
1197
+ end
1198
+
1199
+ def run_concurrent_extractions(test_files)
1200
+ results = []
1201
+ errors = []
1202
+
1203
+ threads = test_files.map do |file|
1204
+ Thread.new do
1205
+ result = Kreuzberg.extract_file_sync(path: file)
1206
+ results << result
1207
+ rescue StandardError => e
1208
+ errors << e
1209
+ end
1210
+ end
1211
+
1212
+ threads.each(&:join)
1213
+ [results, errors]
1214
+ end
1215
+
1216
+ def verify_concurrent_results(results, errors, test_files)
1217
+ expect(errors).to be_empty
1218
+ expect(results.length).to eq(5)
1219
+
1220
+ results.each do |result|
1221
+ expect(result).to be_a(Kreuzberg::Result)
1222
+ expect(result.metadata).not_to be_nil
1223
+
1224
+ metadata = result.metadata
1225
+ next unless metadata.is_a?(Kreuzberg::HtmlMetadata)
1226
+
1227
+ verify_metadata_fields(metadata)
1228
+ end
1229
+
1230
+ titles = extract_titles(results)
1231
+ expect(titles.uniq.length).to eq(5)
1232
+ ensure
1233
+ test_files.each { |f| FileUtils.rm_f(f) }
1234
+ end
1235
+
1236
+ def verify_metadata_fields(metadata)
1237
+ expect(metadata.title).not_to be_nil
1238
+ expect(metadata.description).not_to be_nil
1239
+ expect(metadata.keywords).to be_a(Array)
1240
+ expect(metadata.headers).to be_a(Array)
1241
+ expect(metadata.links).to be_a(Array)
1242
+ expect(metadata.images).to be_a(Array)
1243
+ end
1244
+
1245
+ def extract_titles(results)
1246
+ results.map { |r| r.metadata.is_a?(Kreuzberg::HtmlMetadata) ? r.metadata.title : r.metadata['title'] }
1247
+ end
1228
1248
  end
@@ -1,6 +1,8 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require 'spec_helper'
4
+ require 'tempfile'
5
+ require 'fileutils'
4
6
 
5
7
  RSpec.describe 'Table Extraction Quality' do
6
8
  describe 'table structure extraction' do
@@ -523,12 +525,19 @@ RSpec.describe 'Table Extraction Quality' do
523
525
  it 'handles documents with no tables gracefully' do
524
526
  config = Kreuzberg::Config::Extraction.new
525
527
 
528
+ # Create a temporary text file for this test
529
+ file = Tempfile.new(['no_tables_test', '.txt'])
530
+ file.write('This is a text document without any tables.')
531
+ file.close
532
+
526
533
  begin
527
- result = Kreuzberg.extract_file(path: 'test.txt', config: config)
534
+ result = Kreuzberg.extract_file(path: file.path, config: config)
528
535
  expect(result).not_to be_nil
529
536
  expect(result.tables).to be_a(Array) if result.tables
530
- rescue Kreuzberg::Errors::ValidationError
537
+ rescue Kreuzberg::Errors::IOError
531
538
  skip 'Text file not available for testing'
539
+ ensure
540
+ FileUtils.rm_f(file.path)
532
541
  end
533
542
  end
534
543
 
@@ -0,0 +1,134 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Cross-language serialization tests for Ruby bindings
4
+ #
5
+ # Validates that ExtractionConfig serializes consistently with other language bindings
6
+
7
+ require 'json'
8
+ require 'spec_helper'
9
+
10
+ RSpec.describe Kreuzberg::ExtractionConfig do
11
+ describe '#to_h' do
12
+ it 'serializes minimal config to hash' do
13
+ config = described_class.new
14
+ hash = config.to_h
15
+
16
+ expect(hash).to be_a(Hash)
17
+ expect(hash).to have_key(:use_cache)
18
+ expect(hash).to have_key(:enable_quality_processing)
19
+ expect(hash).to have_key(:force_ocr)
20
+ end
21
+
22
+ it 'serializes config with all fields' do
23
+ config = described_class.new(
24
+ use_cache: true,
25
+ enable_quality_processing: true,
26
+ force_ocr: false
27
+ )
28
+
29
+ hash = config.to_h
30
+
31
+ expect(hash[:use_cache]).to be(true)
32
+ expect(hash[:enable_quality_processing]).to be(true)
33
+ expect(hash[:force_ocr]).to be(false)
34
+ end
35
+
36
+ it 'preserves field values after serialization' do
37
+ original = described_class.new(
38
+ use_cache: false,
39
+ enable_quality_processing: true
40
+ )
41
+
42
+ hash = original.to_h
43
+
44
+ expect(hash[:use_cache]).to be(false)
45
+ expect(hash[:enable_quality_processing]).to be(true)
46
+ end
47
+ end
48
+
49
+ describe '#to_json' do
50
+ it 'serializes to JSON' do
51
+ config = described_class.new(use_cache: true)
52
+ json = config.to_json
53
+
54
+ expect(json).to be_a(String)
55
+
56
+ parsed = JSON.parse(json, symbolize_names: true)
57
+ expect(parsed).to have_key(:use_cache)
58
+ expect(parsed[:use_cache]).to be(true)
59
+ end
60
+
61
+ it 'produces valid JSON' do
62
+ config = described_class.new
63
+ json = config.to_json
64
+
65
+ expect { JSON.parse(json) }.not_to raise_error
66
+ end
67
+
68
+ it 'uses snake_case field names' do
69
+ config = described_class.new(use_cache: true)
70
+ json = config.to_json
71
+
72
+ expect(json).to include('use_cache')
73
+ expect(json).not_to include('useCache')
74
+ end
75
+ end
76
+
77
+ describe 'round-trip serialization' do
78
+ it 'survives serialization -> deserialization -> serialization' do
79
+ config1 = described_class.new(
80
+ use_cache: true,
81
+ enable_quality_processing: false
82
+ )
83
+
84
+ json1 = config1.to_json
85
+ hash1 = JSON.parse(json1, symbolize_names: true)
86
+
87
+ config2 = described_class.new(hash1)
88
+ json2 = config2.to_json
89
+
90
+ # JSON strings should be equivalent
91
+ expect(JSON.parse(json1)).to eq(JSON.parse(json2))
92
+ end
93
+ end
94
+
95
+ describe 'field consistency' do
96
+ it 'includes all mandatory fields' do
97
+ config = described_class.new
98
+ hash = config.to_h
99
+
100
+ mandatory_fields = %i[use_cache enable_quality_processing force_ocr]
101
+ mandatory_fields.each do |field|
102
+ expect(hash).to have_key(field)
103
+ end
104
+ end
105
+
106
+ it 'handles nested ocr config' do
107
+ config = described_class.new(
108
+ ocr: {
109
+ backend: 'tesseract',
110
+ language: 'eng'
111
+ }
112
+ )
113
+
114
+ hash = config.to_h
115
+
116
+ expect(hash).to have_key(:ocr)
117
+ expect(hash[:ocr][:backend]).to eq('tesseract')
118
+ expect(hash[:ocr][:language]).to eq('eng')
119
+ end
120
+ end
121
+
122
+ describe 'immutability' do
123
+ it 'does not modify original config during serialization' do
124
+ config = described_class.new(use_cache: true)
125
+
126
+ json1 = config.to_json
127
+ json2 = config.to_json
128
+ json3 = config.to_json
129
+
130
+ expect(json1).to eq(json2)
131
+ expect(json2).to eq(json3)
132
+ end
133
+ end
134
+ end
@@ -0,0 +1,380 @@
1
+ # frozen_string_literal: true
2
+
3
+ # rubocop:disable RSpec/RepeatedExample
4
+ RSpec.describe 'Output Format and Result Format Configuration' do
5
+ describe Kreuzberg::Config::Extraction do
6
+ describe 'output_format' do
7
+ it 'accepts output_format as initialization parameter' do
8
+ config = described_class.new(output_format: 'markdown')
9
+
10
+ expect(config.output_format).to eq 'markdown'
11
+ end
12
+
13
+ it 'defaults to nil when not specified' do
14
+ config = described_class.new
15
+
16
+ expect(config.output_format).to be_nil
17
+ end
18
+
19
+ it 'accepts plain format' do
20
+ config = described_class.new(output_format: 'plain')
21
+
22
+ expect(config.output_format).to eq 'plain'
23
+ end
24
+
25
+ it 'accepts markdown format' do
26
+ config = described_class.new(output_format: 'markdown')
27
+
28
+ expect(config.output_format).to eq 'markdown'
29
+ end
30
+
31
+ it 'accepts djot format' do
32
+ config = described_class.new(output_format: 'djot')
33
+
34
+ expect(config.output_format).to eq 'djot'
35
+ end
36
+
37
+ it 'accepts html format' do
38
+ config = described_class.new(output_format: 'html')
39
+
40
+ expect(config.output_format).to eq 'html'
41
+ end
42
+
43
+ it 'converts output_format to string' do
44
+ config = described_class.new(output_format: :markdown)
45
+
46
+ expect(config.output_format).to eq 'markdown'
47
+ expect(config.output_format).to be_a String
48
+ end
49
+
50
+ it 'includes output_format in to_h' do
51
+ config = described_class.new(output_format: 'markdown')
52
+ hash = config.to_h
53
+
54
+ expect(hash[:output_format]).to eq 'markdown'
55
+ end
56
+
57
+ it 'excludes nil output_format from to_h' do
58
+ config = described_class.new(output_format: nil)
59
+ hash = config.to_h
60
+
61
+ expect(hash.key?(:output_format)).to be false
62
+ end
63
+
64
+ it 'includes output_format in JSON' do
65
+ config = described_class.new(output_format: 'markdown')
66
+ json = config.to_json
67
+ parsed = JSON.parse(json)
68
+
69
+ expect(parsed['output_format']).to eq 'markdown'
70
+ end
71
+
72
+ it 'retrieves output_format with get_field' do
73
+ config = described_class.new(output_format: 'djot')
74
+
75
+ expect(config.get_field('output_format')).to eq 'djot'
76
+ end
77
+
78
+ it 'can be set with []=' do
79
+ config = described_class.new
80
+ config[:output_format] = 'html'
81
+
82
+ expect(config.output_format).to eq 'html'
83
+ end
84
+
85
+ it 'can be set with []= using symbol' do
86
+ config = described_class.new
87
+ config[:output_format] = :plain
88
+
89
+ expect(config.output_format).to eq 'plain'
90
+ end
91
+
92
+ it 'can be retrieved with []' do
93
+ config = described_class.new(output_format: 'markdown')
94
+
95
+ expect(config[:output_format]).to eq 'markdown'
96
+ end
97
+ end
98
+
99
+ describe 'result_format' do
100
+ it 'accepts result_format as initialization parameter' do
101
+ config = described_class.new(result_format: 'unified')
102
+
103
+ expect(config.result_format).to eq 'unified'
104
+ end
105
+
106
+ it 'defaults to nil when not specified' do
107
+ config = described_class.new
108
+
109
+ expect(config.result_format).to be_nil
110
+ end
111
+
112
+ it 'accepts unified format' do
113
+ config = described_class.new(result_format: 'unified')
114
+
115
+ expect(config.result_format).to eq 'unified'
116
+ end
117
+
118
+ it 'accepts element_based format' do
119
+ config = described_class.new(result_format: 'element_based')
120
+
121
+ expect(config.result_format).to eq 'element_based'
122
+ end
123
+
124
+ it 'converts result_format to string' do
125
+ config = described_class.new(result_format: :unified)
126
+
127
+ expect(config.result_format).to eq 'unified'
128
+ expect(config.result_format).to be_a String
129
+ end
130
+
131
+ it 'includes result_format in to_h' do
132
+ config = described_class.new(result_format: 'element_based')
133
+ hash = config.to_h
134
+
135
+ expect(hash[:result_format]).to eq 'element_based'
136
+ end
137
+
138
+ it 'excludes nil result_format from to_h' do
139
+ config = described_class.new(result_format: nil)
140
+ hash = config.to_h
141
+
142
+ expect(hash.key?(:result_format)).to be false
143
+ end
144
+
145
+ it 'includes result_format in JSON' do
146
+ config = described_class.new(result_format: 'element_based')
147
+ json = config.to_json
148
+ parsed = JSON.parse(json)
149
+
150
+ expect(parsed['result_format']).to eq 'element_based'
151
+ end
152
+
153
+ it 'retrieves result_format with get_field' do
154
+ config = described_class.new(result_format: 'unified')
155
+
156
+ expect(config.get_field('result_format')).to eq 'unified'
157
+ end
158
+
159
+ it 'can be set with []=' do
160
+ config = described_class.new
161
+ config[:result_format] = 'unified'
162
+
163
+ expect(config.result_format).to eq 'unified'
164
+ end
165
+
166
+ it 'can be set with []= using symbol' do
167
+ config = described_class.new
168
+ config[:result_format] = :element_based
169
+
170
+ expect(config.result_format).to eq 'element_based'
171
+ end
172
+
173
+ it 'can be retrieved with []' do
174
+ config = described_class.new(result_format: 'element_based')
175
+
176
+ expect(config[:result_format]).to eq 'element_based'
177
+ end
178
+ end
179
+
180
+ describe 'combined output and result formats' do
181
+ it 'accepts both output_format and result_format' do
182
+ config = described_class.new(
183
+ output_format: 'markdown',
184
+ result_format: 'unified'
185
+ )
186
+
187
+ expect(config.output_format).to eq 'markdown'
188
+ expect(config.result_format).to eq 'unified'
189
+ end
190
+
191
+ it 'serializes both formats in to_h' do
192
+ config = described_class.new(
193
+ output_format: 'djot',
194
+ result_format: 'element_based'
195
+ )
196
+ hash = config.to_h
197
+
198
+ expect(hash[:output_format]).to eq 'djot'
199
+ expect(hash[:result_format]).to eq 'element_based'
200
+ end
201
+
202
+ it 'serializes both formats in JSON' do
203
+ config = described_class.new(
204
+ output_format: 'html',
205
+ result_format: 'unified'
206
+ )
207
+ json = config.to_json
208
+ parsed = JSON.parse(json)
209
+
210
+ expect(parsed['output_format']).to eq 'html'
211
+ expect(parsed['result_format']).to eq 'unified'
212
+ end
213
+
214
+ it 'merges both formats correctly' do
215
+ base = described_class.new(
216
+ output_format: 'markdown',
217
+ result_format: 'unified'
218
+ )
219
+ override = described_class.new(output_format: 'html')
220
+ merged = base.merge(override)
221
+
222
+ expect(merged.output_format).to eq 'html'
223
+ expect(merged.result_format).to eq 'unified'
224
+ end
225
+
226
+ it 'merges both formats with merge!' do
227
+ config = described_class.new(
228
+ output_format: 'markdown',
229
+ result_format: 'unified'
230
+ )
231
+ override = described_class.new(
232
+ output_format: 'djot',
233
+ result_format: 'element_based'
234
+ )
235
+ config.merge!(override)
236
+
237
+ expect(config.output_format).to eq 'djot'
238
+ expect(config.result_format).to eq 'element_based'
239
+ end
240
+
241
+ it 'handles merge with hash containing both formats' do
242
+ config = described_class.new(
243
+ output_format: 'plain',
244
+ result_format: 'unified'
245
+ )
246
+ merged = config.merge({ output_format: 'markdown' })
247
+
248
+ expect(merged.output_format).to eq 'markdown'
249
+ expect(merged.result_format).to eq 'unified'
250
+ end
251
+ end
252
+
253
+ describe 'format persistence across operations' do
254
+ it 'persists output_format through multiple conversions' do
255
+ config = described_class.new(output_format: 'markdown')
256
+ hash = config.to_h
257
+ new_config = described_class.new(**hash)
258
+
259
+ expect(new_config.output_format).to eq 'markdown'
260
+ end
261
+
262
+ it 'persists result_format through multiple conversions' do
263
+ config = described_class.new(result_format: 'element_based')
264
+ hash = config.to_h
265
+ new_config = described_class.new(**hash)
266
+
267
+ expect(new_config.result_format).to eq 'element_based'
268
+ end
269
+
270
+ it 'round-trips through JSON' do
271
+ config = described_class.new(
272
+ output_format: 'djot',
273
+ result_format: 'unified'
274
+ )
275
+ json = config.to_json
276
+ parsed = JSON.parse(json)
277
+ new_config = described_class.new(**parsed.transform_keys(&:to_sym))
278
+
279
+ expect(new_config.output_format).to eq 'djot'
280
+ expect(new_config.result_format).to eq 'unified'
281
+ end
282
+ end
283
+
284
+ describe 'format validation and edge cases' do
285
+ it 'raises error for empty string output_format' do
286
+ expect do
287
+ described_class.new(output_format: '')
288
+ end.to raise_error(ArgumentError, /Invalid output_format/)
289
+ end
290
+
291
+ it 'raises error for empty string result_format' do
292
+ expect do
293
+ described_class.new(result_format: '')
294
+ end.to raise_error(ArgumentError, /Invalid result_format/)
295
+ end
296
+
297
+ it 'raises error for whitespace in output_format' do
298
+ expect do
299
+ described_class.new(output_format: ' plain ')
300
+ end.to raise_error(ArgumentError, /Invalid output_format/)
301
+ end
302
+
303
+ it 'normalizes case in output_format' do
304
+ config = described_class.new(output_format: 'MarkDown')
305
+
306
+ expect(config.output_format).to eq 'markdown'
307
+ end
308
+
309
+ it 'raises error for custom string in result_format' do
310
+ expect do
311
+ described_class.new(result_format: 'custom_format')
312
+ end.to raise_error(ArgumentError, /Invalid result_format/)
313
+ end
314
+ end
315
+
316
+ describe 'integration with other config fields' do
317
+ it 'works with output_format and chunking together' do
318
+ config = described_class.new(
319
+ output_format: 'markdown',
320
+ chunking: { max_chars: 500 }
321
+ )
322
+
323
+ expect(config.output_format).to eq 'markdown'
324
+ expect(config.chunking.max_chars).to eq 500
325
+ end
326
+
327
+ it 'works with result_format and OCR together' do
328
+ config = described_class.new(
329
+ result_format: 'element_based',
330
+ ocr: { backend: 'tesseract' }
331
+ )
332
+
333
+ expect(config.result_format).to eq 'element_based'
334
+ expect(config.ocr.backend).to eq 'tesseract'
335
+ end
336
+
337
+ it 'works with both formats and language detection' do
338
+ config = described_class.new(
339
+ output_format: 'html',
340
+ result_format: 'unified',
341
+ language_detection: { enabled: true }
342
+ )
343
+
344
+ expect(config.output_format).to eq 'html'
345
+ expect(config.result_format).to eq 'unified'
346
+ expect(config.language_detection.enabled).to be true
347
+ end
348
+
349
+ it 'preserves formats in complex config merge' do
350
+ base = described_class.new(
351
+ output_format: 'markdown',
352
+ result_format: 'unified',
353
+ chunking: { max_chars: 500 },
354
+ ocr: { backend: 'tesseract' }
355
+ )
356
+ override = described_class.new(
357
+ output_format: 'djot',
358
+ chunking: { max_chars: 750 }
359
+ )
360
+ merged = base.merge(override)
361
+
362
+ expect(merged.output_format).to eq 'djot'
363
+ expect(merged.result_format).to eq 'unified'
364
+ expect(merged.chunking.max_chars).to eq 750
365
+ expect(merged.ocr.backend).to eq 'tesseract'
366
+ end
367
+ end
368
+
369
+ describe 'allowed keys integration' do
370
+ it 'includes output_format in ALLOWED_KEYS' do
371
+ expect(Kreuzberg::Config::Extraction::ALLOWED_KEYS).to include(:output_format)
372
+ end
373
+
374
+ it 'includes result_format in ALLOWED_KEYS' do
375
+ expect(Kreuzberg::Config::Extraction::ALLOWED_KEYS).to include(:result_format)
376
+ end
377
+ end
378
+ end
379
+ end
380
+ # rubocop:enable RSpec/RepeatedExample
data/vendor/Cargo.toml CHANGED
@@ -3,7 +3,7 @@ members = ["kreuzberg", "kreuzberg-tesseract", "kreuzberg-ffi"]
3
3
  resolver = "2"
4
4
 
5
5
  [workspace.package]
6
- version = "4.1.2"
6
+ version = "4.2.1"
7
7
  edition = "2024"
8
8
  rust-version = "1.91"
9
9
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg"
3
- version = "4.1.2"
3
+ version = "4.2.1"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]