kreuzberg 4.1.2 → 4.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.lock +26 -17
- data/ext/kreuzberg_rb/native/libpdfium.so +0 -0
- data/kreuzberg.gemspec +13 -1
- data/lib/kreuzberg/cli.rb +16 -6
- data/lib/kreuzberg/cli_proxy.rb +3 -1
- data/lib/kreuzberg/config.rb +121 -39
- data/lib/kreuzberg/djot_content.rb +225 -0
- data/lib/kreuzberg/extraction_api.rb +20 -4
- data/lib/kreuzberg/result.rb +12 -2
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +1 -0
- data/sig/kreuzberg.rbs +28 -12
- data/spec/binding/batch_operations_spec.rb +80 -0
- data/spec/binding/batch_spec.rb +6 -5
- data/spec/binding/error_recovery_spec.rb +3 -3
- data/spec/binding/metadata_types_spec.rb +77 -57
- data/spec/binding/tables_spec.rb +11 -2
- data/spec/serialization_spec.rb +134 -0
- data/spec/unit/config/output_format_spec.rb +380 -0
- data/vendor/Cargo.toml +1 -1
- data/vendor/kreuzberg/Cargo.toml +1 -1
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/api/startup.rs +15 -1
- data/vendor/kreuzberg/src/core/config_validation/sections.rs +16 -4
- data/vendor/kreuzberg/src/core/extractor/file.rs +1 -2
- data/vendor/kreuzberg/src/core/extractor/mod.rs +2 -1
- data/vendor/kreuzberg/src/core/io.rs +7 -7
- data/vendor/kreuzberg/src/core/mime.rs +4 -4
- data/vendor/kreuzberg/src/embeddings.rs +4 -4
- data/vendor/kreuzberg/src/extraction/pptx/parser.rs +6 -0
- data/vendor/kreuzberg/src/mcp/format.rs +237 -39
- data/vendor/kreuzberg/src/mcp/params.rs +26 -33
- data/vendor/kreuzberg/src/mcp/server.rs +6 -3
- data/vendor/kreuzberg/src/mcp/tools/extraction.rs +16 -23
- data/vendor/kreuzberg/src/plugins/mod.rs +1 -0
- data/vendor/kreuzberg/src/plugins/registry/extractor.rs +251 -5
- data/vendor/kreuzberg/src/plugins/registry/ocr.rs +150 -2
- data/vendor/kreuzberg/src/plugins/registry/processor.rs +213 -5
- data/vendor/kreuzberg/src/plugins/registry/validator.rs +220 -4
- data/vendor/kreuzberg/src/plugins/startup_validation.rs +385 -0
- data/vendor/kreuzberg/tests/api_chunk.rs +40 -30
- data/vendor/kreuzberg/tests/api_consistency.rs +349 -0
- data/vendor/kreuzberg/tests/api_embed.rs +84 -50
- data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +8 -2
- data/vendor/kreuzberg/tests/api_tests.rs +298 -139
- data/vendor/kreuzberg/tests/archive_integration.rs +63 -56
- data/vendor/kreuzberg/tests/batch_orchestration.rs +22 -14
- data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +13 -13
- data/vendor/kreuzberg/tests/batch_processing.rs +13 -9
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +10 -10
- data/vendor/kreuzberg/tests/concurrency_stress.rs +10 -6
- data/vendor/kreuzberg/tests/config_behavioral.rs +416 -0
- data/vendor/kreuzberg/tests/config_features.rs +19 -15
- data/vendor/kreuzberg/tests/config_integration_test.rs +68 -68
- data/vendor/kreuzberg/tests/config_loading_tests.rs +71 -62
- data/vendor/kreuzberg/tests/contract_mcp.rs +314 -0
- data/vendor/kreuzberg/tests/core_integration.rs +57 -57
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +23 -23
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +15 -14
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +4 -4
- data/vendor/kreuzberg/tests/email_integration.rs +7 -7
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/error_handling.rs +13 -11
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
- data/vendor/kreuzberg/tests/instrumentation_test.rs +18 -13
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +17 -17
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +9 -9
- data/vendor/kreuzberg/tests/keywords_integration.rs +25 -25
- data/vendor/kreuzberg/tests/keywords_quality.rs +9 -9
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +1 -1
- data/vendor/kreuzberg/tests/mcp_integration.rs +849 -0
- data/vendor/kreuzberg/tests/mime_detection.rs +75 -43
- data/vendor/kreuzberg/tests/ocr_errors.rs +10 -4
- data/vendor/kreuzberg/tests/ocr_language_registry.rs +1 -1
- data/vendor/kreuzberg/tests/ocr_stress.rs +3 -3
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +6 -6
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/page_markers.rs +1 -1
- data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +6 -6
- data/vendor/kreuzberg/tests/pdf_text_merging.rs +2 -2
- data/vendor/kreuzberg/tests/pipeline_integration.rs +77 -61
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +97 -77
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +78 -61
- data/vendor/kreuzberg/tests/plugin_system.rs +49 -46
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +109 -97
- data/vendor/kreuzberg/tests/pptx_regression_tests.rs +324 -31
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +26 -23
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +1 -1
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +18 -18
- data/vendor/kreuzberg/tests/security_validation.rs +20 -19
- data/vendor/kreuzberg/tests/serialization_integration.rs +112 -0
- data/vendor/kreuzberg/tests/stopwords_integration_test.rs +36 -36
- data/vendor/kreuzberg/tests/test_fastembed.rs +8 -8
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +9 -9
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +12 -9
- data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
- metadata +12 -2
|
@@ -1154,64 +1154,11 @@ RSpec.describe 'Kreuzberg Metadata Types' do
|
|
|
1154
1154
|
|
|
1155
1155
|
describe 'Thread Safety: Concurrent Extraction' do
|
|
1156
1156
|
it 'handles concurrent extraction safely' do
|
|
1157
|
-
test_files =
|
|
1158
|
-
results =
|
|
1159
|
-
errors = []
|
|
1157
|
+
test_files = create_concurrent_test_files
|
|
1158
|
+
results, errors = run_concurrent_extractions(test_files)
|
|
1160
1159
|
|
|
1161
|
-
|
|
1162
|
-
|
|
1163
|
-
<html>
|
|
1164
|
-
<head>
|
|
1165
|
-
<title>Concurrent Test #{i}</title>
|
|
1166
|
-
<meta name="description" content="Test document #{i}">
|
|
1167
|
-
<meta name="keywords" content="test#{i}, concurrent, thread-safe">
|
|
1168
|
-
</head>
|
|
1169
|
-
<body>
|
|
1170
|
-
<h1>Test Document #{i}</h1>
|
|
1171
|
-
<p>Content for test #{i}</p>
|
|
1172
|
-
<a href="/page-#{i}">Link #{i}</a>
|
|
1173
|
-
<img src="image-#{i}.jpg" alt="Image #{i}">
|
|
1174
|
-
</body>
|
|
1175
|
-
</html>
|
|
1176
|
-
HTML
|
|
1177
|
-
test_files << create_test_html_file(html_content)
|
|
1178
|
-
end
|
|
1179
|
-
|
|
1180
|
-
begin
|
|
1181
|
-
threads = test_files.map do |file|
|
|
1182
|
-
Thread.new do
|
|
1183
|
-
result = Kreuzberg.extract_file_sync(path: file)
|
|
1184
|
-
results << result
|
|
1185
|
-
rescue StandardError => e
|
|
1186
|
-
errors << e
|
|
1187
|
-
end
|
|
1188
|
-
end
|
|
1189
|
-
|
|
1190
|
-
threads.each(&:join)
|
|
1191
|
-
|
|
1192
|
-
expect(errors).to be_empty
|
|
1193
|
-
|
|
1194
|
-
expect(results.length).to eq(5)
|
|
1195
|
-
results.each do |result|
|
|
1196
|
-
expect(result).to be_a(Kreuzberg::Result)
|
|
1197
|
-
expect(result.metadata).not_to be_nil
|
|
1198
|
-
|
|
1199
|
-
metadata = result.metadata
|
|
1200
|
-
next unless metadata.is_a?(Kreuzberg::HtmlMetadata)
|
|
1201
|
-
|
|
1202
|
-
expect(metadata.title).not_to be_nil
|
|
1203
|
-
expect(metadata.description).not_to be_nil
|
|
1204
|
-
expect(metadata.keywords).to be_a(Array)
|
|
1205
|
-
expect(metadata.headers).to be_a(Array)
|
|
1206
|
-
expect(metadata.links).to be_a(Array)
|
|
1207
|
-
expect(metadata.images).to be_a(Array)
|
|
1208
|
-
end
|
|
1209
|
-
|
|
1210
|
-
titles = results.map { |r| r.metadata.is_a?(Kreuzberg::HtmlMetadata) ? r.metadata.title : r.metadata['title'] }
|
|
1211
|
-
expect(titles.uniq.length).to eq(5)
|
|
1212
|
-
ensure
|
|
1213
|
-
test_files.each { |f| FileUtils.rm_f(f) }
|
|
1214
|
-
end
|
|
1160
|
+
expect(results).not_to be_empty
|
|
1161
|
+
verify_concurrent_results(results, errors, test_files)
|
|
1215
1162
|
end
|
|
1216
1163
|
end
|
|
1217
1164
|
|
|
@@ -1225,4 +1172,77 @@ RSpec.describe 'Kreuzberg Metadata Types' do
|
|
|
1225
1172
|
file.close
|
|
1226
1173
|
file.path
|
|
1227
1174
|
end
|
|
1175
|
+
|
|
1176
|
+
def create_concurrent_test_files
|
|
1177
|
+
test_files = []
|
|
1178
|
+
5.times do |i|
|
|
1179
|
+
html_content = <<~HTML
|
|
1180
|
+
<html>
|
|
1181
|
+
<head>
|
|
1182
|
+
<title>Concurrent Test #{i}</title>
|
|
1183
|
+
<meta name="description" content="Test document #{i}">
|
|
1184
|
+
<meta name="keywords" content="test#{i}, concurrent, thread-safe">
|
|
1185
|
+
</head>
|
|
1186
|
+
<body>
|
|
1187
|
+
<h1>Test Document #{i}</h1>
|
|
1188
|
+
<p>Content for test #{i}</p>
|
|
1189
|
+
<a href="/page-#{i}">Link #{i}</a>
|
|
1190
|
+
<img src="image-#{i}.jpg" alt="Image #{i}">
|
|
1191
|
+
</body>
|
|
1192
|
+
</html>
|
|
1193
|
+
HTML
|
|
1194
|
+
test_files << create_test_html_file(html_content)
|
|
1195
|
+
end
|
|
1196
|
+
test_files
|
|
1197
|
+
end
|
|
1198
|
+
|
|
1199
|
+
def run_concurrent_extractions(test_files)
|
|
1200
|
+
results = []
|
|
1201
|
+
errors = []
|
|
1202
|
+
|
|
1203
|
+
threads = test_files.map do |file|
|
|
1204
|
+
Thread.new do
|
|
1205
|
+
result = Kreuzberg.extract_file_sync(path: file)
|
|
1206
|
+
results << result
|
|
1207
|
+
rescue StandardError => e
|
|
1208
|
+
errors << e
|
|
1209
|
+
end
|
|
1210
|
+
end
|
|
1211
|
+
|
|
1212
|
+
threads.each(&:join)
|
|
1213
|
+
[results, errors]
|
|
1214
|
+
end
|
|
1215
|
+
|
|
1216
|
+
def verify_concurrent_results(results, errors, test_files)
|
|
1217
|
+
expect(errors).to be_empty
|
|
1218
|
+
expect(results.length).to eq(5)
|
|
1219
|
+
|
|
1220
|
+
results.each do |result|
|
|
1221
|
+
expect(result).to be_a(Kreuzberg::Result)
|
|
1222
|
+
expect(result.metadata).not_to be_nil
|
|
1223
|
+
|
|
1224
|
+
metadata = result.metadata
|
|
1225
|
+
next unless metadata.is_a?(Kreuzberg::HtmlMetadata)
|
|
1226
|
+
|
|
1227
|
+
verify_metadata_fields(metadata)
|
|
1228
|
+
end
|
|
1229
|
+
|
|
1230
|
+
titles = extract_titles(results)
|
|
1231
|
+
expect(titles.uniq.length).to eq(5)
|
|
1232
|
+
ensure
|
|
1233
|
+
test_files.each { |f| FileUtils.rm_f(f) }
|
|
1234
|
+
end
|
|
1235
|
+
|
|
1236
|
+
def verify_metadata_fields(metadata)
|
|
1237
|
+
expect(metadata.title).not_to be_nil
|
|
1238
|
+
expect(metadata.description).not_to be_nil
|
|
1239
|
+
expect(metadata.keywords).to be_a(Array)
|
|
1240
|
+
expect(metadata.headers).to be_a(Array)
|
|
1241
|
+
expect(metadata.links).to be_a(Array)
|
|
1242
|
+
expect(metadata.images).to be_a(Array)
|
|
1243
|
+
end
|
|
1244
|
+
|
|
1245
|
+
def extract_titles(results)
|
|
1246
|
+
results.map { |r| r.metadata.is_a?(Kreuzberg::HtmlMetadata) ? r.metadata.title : r.metadata['title'] }
|
|
1247
|
+
end
|
|
1228
1248
|
end
|
data/spec/binding/tables_spec.rb
CHANGED
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
require 'spec_helper'
|
|
4
|
+
require 'tempfile'
|
|
5
|
+
require 'fileutils'
|
|
4
6
|
|
|
5
7
|
RSpec.describe 'Table Extraction Quality' do
|
|
6
8
|
describe 'table structure extraction' do
|
|
@@ -523,12 +525,19 @@ RSpec.describe 'Table Extraction Quality' do
|
|
|
523
525
|
it 'handles documents with no tables gracefully' do
|
|
524
526
|
config = Kreuzberg::Config::Extraction.new
|
|
525
527
|
|
|
528
|
+
# Create a temporary text file for this test
|
|
529
|
+
file = Tempfile.new(['no_tables_test', '.txt'])
|
|
530
|
+
file.write('This is a text document without any tables.')
|
|
531
|
+
file.close
|
|
532
|
+
|
|
526
533
|
begin
|
|
527
|
-
result = Kreuzberg.extract_file(path:
|
|
534
|
+
result = Kreuzberg.extract_file(path: file.path, config: config)
|
|
528
535
|
expect(result).not_to be_nil
|
|
529
536
|
expect(result.tables).to be_a(Array) if result.tables
|
|
530
|
-
rescue Kreuzberg::Errors::
|
|
537
|
+
rescue Kreuzberg::Errors::IOError
|
|
531
538
|
skip 'Text file not available for testing'
|
|
539
|
+
ensure
|
|
540
|
+
FileUtils.rm_f(file.path)
|
|
532
541
|
end
|
|
533
542
|
end
|
|
534
543
|
|
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
# Cross-language serialization tests for Ruby bindings
|
|
4
|
+
#
|
|
5
|
+
# Validates that ExtractionConfig serializes consistently with other language bindings
|
|
6
|
+
|
|
7
|
+
require 'json'
|
|
8
|
+
require 'spec_helper'
|
|
9
|
+
|
|
10
|
+
RSpec.describe Kreuzberg::ExtractionConfig do
|
|
11
|
+
describe '#to_h' do
|
|
12
|
+
it 'serializes minimal config to hash' do
|
|
13
|
+
config = described_class.new
|
|
14
|
+
hash = config.to_h
|
|
15
|
+
|
|
16
|
+
expect(hash).to be_a(Hash)
|
|
17
|
+
expect(hash).to have_key(:use_cache)
|
|
18
|
+
expect(hash).to have_key(:enable_quality_processing)
|
|
19
|
+
expect(hash).to have_key(:force_ocr)
|
|
20
|
+
end
|
|
21
|
+
|
|
22
|
+
it 'serializes config with all fields' do
|
|
23
|
+
config = described_class.new(
|
|
24
|
+
use_cache: true,
|
|
25
|
+
enable_quality_processing: true,
|
|
26
|
+
force_ocr: false
|
|
27
|
+
)
|
|
28
|
+
|
|
29
|
+
hash = config.to_h
|
|
30
|
+
|
|
31
|
+
expect(hash[:use_cache]).to be(true)
|
|
32
|
+
expect(hash[:enable_quality_processing]).to be(true)
|
|
33
|
+
expect(hash[:force_ocr]).to be(false)
|
|
34
|
+
end
|
|
35
|
+
|
|
36
|
+
it 'preserves field values after serialization' do
|
|
37
|
+
original = described_class.new(
|
|
38
|
+
use_cache: false,
|
|
39
|
+
enable_quality_processing: true
|
|
40
|
+
)
|
|
41
|
+
|
|
42
|
+
hash = original.to_h
|
|
43
|
+
|
|
44
|
+
expect(hash[:use_cache]).to be(false)
|
|
45
|
+
expect(hash[:enable_quality_processing]).to be(true)
|
|
46
|
+
end
|
|
47
|
+
end
|
|
48
|
+
|
|
49
|
+
describe '#to_json' do
|
|
50
|
+
it 'serializes to JSON' do
|
|
51
|
+
config = described_class.new(use_cache: true)
|
|
52
|
+
json = config.to_json
|
|
53
|
+
|
|
54
|
+
expect(json).to be_a(String)
|
|
55
|
+
|
|
56
|
+
parsed = JSON.parse(json, symbolize_names: true)
|
|
57
|
+
expect(parsed).to have_key(:use_cache)
|
|
58
|
+
expect(parsed[:use_cache]).to be(true)
|
|
59
|
+
end
|
|
60
|
+
|
|
61
|
+
it 'produces valid JSON' do
|
|
62
|
+
config = described_class.new
|
|
63
|
+
json = config.to_json
|
|
64
|
+
|
|
65
|
+
expect { JSON.parse(json) }.not_to raise_error
|
|
66
|
+
end
|
|
67
|
+
|
|
68
|
+
it 'uses snake_case field names' do
|
|
69
|
+
config = described_class.new(use_cache: true)
|
|
70
|
+
json = config.to_json
|
|
71
|
+
|
|
72
|
+
expect(json).to include('use_cache')
|
|
73
|
+
expect(json).not_to include('useCache')
|
|
74
|
+
end
|
|
75
|
+
end
|
|
76
|
+
|
|
77
|
+
describe 'round-trip serialization' do
|
|
78
|
+
it 'survives serialization -> deserialization -> serialization' do
|
|
79
|
+
config1 = described_class.new(
|
|
80
|
+
use_cache: true,
|
|
81
|
+
enable_quality_processing: false
|
|
82
|
+
)
|
|
83
|
+
|
|
84
|
+
json1 = config1.to_json
|
|
85
|
+
hash1 = JSON.parse(json1, symbolize_names: true)
|
|
86
|
+
|
|
87
|
+
config2 = described_class.new(hash1)
|
|
88
|
+
json2 = config2.to_json
|
|
89
|
+
|
|
90
|
+
# JSON strings should be equivalent
|
|
91
|
+
expect(JSON.parse(json1)).to eq(JSON.parse(json2))
|
|
92
|
+
end
|
|
93
|
+
end
|
|
94
|
+
|
|
95
|
+
describe 'field consistency' do
|
|
96
|
+
it 'includes all mandatory fields' do
|
|
97
|
+
config = described_class.new
|
|
98
|
+
hash = config.to_h
|
|
99
|
+
|
|
100
|
+
mandatory_fields = %i[use_cache enable_quality_processing force_ocr]
|
|
101
|
+
mandatory_fields.each do |field|
|
|
102
|
+
expect(hash).to have_key(field)
|
|
103
|
+
end
|
|
104
|
+
end
|
|
105
|
+
|
|
106
|
+
it 'handles nested ocr config' do
|
|
107
|
+
config = described_class.new(
|
|
108
|
+
ocr: {
|
|
109
|
+
backend: 'tesseract',
|
|
110
|
+
language: 'eng'
|
|
111
|
+
}
|
|
112
|
+
)
|
|
113
|
+
|
|
114
|
+
hash = config.to_h
|
|
115
|
+
|
|
116
|
+
expect(hash).to have_key(:ocr)
|
|
117
|
+
expect(hash[:ocr][:backend]).to eq('tesseract')
|
|
118
|
+
expect(hash[:ocr][:language]).to eq('eng')
|
|
119
|
+
end
|
|
120
|
+
end
|
|
121
|
+
|
|
122
|
+
describe 'immutability' do
|
|
123
|
+
it 'does not modify original config during serialization' do
|
|
124
|
+
config = described_class.new(use_cache: true)
|
|
125
|
+
|
|
126
|
+
json1 = config.to_json
|
|
127
|
+
json2 = config.to_json
|
|
128
|
+
json3 = config.to_json
|
|
129
|
+
|
|
130
|
+
expect(json1).to eq(json2)
|
|
131
|
+
expect(json2).to eq(json3)
|
|
132
|
+
end
|
|
133
|
+
end
|
|
134
|
+
end
|
|
@@ -0,0 +1,380 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
# rubocop:disable RSpec/RepeatedExample
|
|
4
|
+
RSpec.describe 'Output Format and Result Format Configuration' do
|
|
5
|
+
describe Kreuzberg::Config::Extraction do
|
|
6
|
+
describe 'output_format' do
|
|
7
|
+
it 'accepts output_format as initialization parameter' do
|
|
8
|
+
config = described_class.new(output_format: 'markdown')
|
|
9
|
+
|
|
10
|
+
expect(config.output_format).to eq 'markdown'
|
|
11
|
+
end
|
|
12
|
+
|
|
13
|
+
it 'defaults to nil when not specified' do
|
|
14
|
+
config = described_class.new
|
|
15
|
+
|
|
16
|
+
expect(config.output_format).to be_nil
|
|
17
|
+
end
|
|
18
|
+
|
|
19
|
+
it 'accepts plain format' do
|
|
20
|
+
config = described_class.new(output_format: 'plain')
|
|
21
|
+
|
|
22
|
+
expect(config.output_format).to eq 'plain'
|
|
23
|
+
end
|
|
24
|
+
|
|
25
|
+
it 'accepts markdown format' do
|
|
26
|
+
config = described_class.new(output_format: 'markdown')
|
|
27
|
+
|
|
28
|
+
expect(config.output_format).to eq 'markdown'
|
|
29
|
+
end
|
|
30
|
+
|
|
31
|
+
it 'accepts djot format' do
|
|
32
|
+
config = described_class.new(output_format: 'djot')
|
|
33
|
+
|
|
34
|
+
expect(config.output_format).to eq 'djot'
|
|
35
|
+
end
|
|
36
|
+
|
|
37
|
+
it 'accepts html format' do
|
|
38
|
+
config = described_class.new(output_format: 'html')
|
|
39
|
+
|
|
40
|
+
expect(config.output_format).to eq 'html'
|
|
41
|
+
end
|
|
42
|
+
|
|
43
|
+
it 'converts output_format to string' do
|
|
44
|
+
config = described_class.new(output_format: :markdown)
|
|
45
|
+
|
|
46
|
+
expect(config.output_format).to eq 'markdown'
|
|
47
|
+
expect(config.output_format).to be_a String
|
|
48
|
+
end
|
|
49
|
+
|
|
50
|
+
it 'includes output_format in to_h' do
|
|
51
|
+
config = described_class.new(output_format: 'markdown')
|
|
52
|
+
hash = config.to_h
|
|
53
|
+
|
|
54
|
+
expect(hash[:output_format]).to eq 'markdown'
|
|
55
|
+
end
|
|
56
|
+
|
|
57
|
+
it 'excludes nil output_format from to_h' do
|
|
58
|
+
config = described_class.new(output_format: nil)
|
|
59
|
+
hash = config.to_h
|
|
60
|
+
|
|
61
|
+
expect(hash.key?(:output_format)).to be false
|
|
62
|
+
end
|
|
63
|
+
|
|
64
|
+
it 'includes output_format in JSON' do
|
|
65
|
+
config = described_class.new(output_format: 'markdown')
|
|
66
|
+
json = config.to_json
|
|
67
|
+
parsed = JSON.parse(json)
|
|
68
|
+
|
|
69
|
+
expect(parsed['output_format']).to eq 'markdown'
|
|
70
|
+
end
|
|
71
|
+
|
|
72
|
+
it 'retrieves output_format with get_field' do
|
|
73
|
+
config = described_class.new(output_format: 'djot')
|
|
74
|
+
|
|
75
|
+
expect(config.get_field('output_format')).to eq 'djot'
|
|
76
|
+
end
|
|
77
|
+
|
|
78
|
+
it 'can be set with []=' do
|
|
79
|
+
config = described_class.new
|
|
80
|
+
config[:output_format] = 'html'
|
|
81
|
+
|
|
82
|
+
expect(config.output_format).to eq 'html'
|
|
83
|
+
end
|
|
84
|
+
|
|
85
|
+
it 'can be set with []= using symbol' do
|
|
86
|
+
config = described_class.new
|
|
87
|
+
config[:output_format] = :plain
|
|
88
|
+
|
|
89
|
+
expect(config.output_format).to eq 'plain'
|
|
90
|
+
end
|
|
91
|
+
|
|
92
|
+
it 'can be retrieved with []' do
|
|
93
|
+
config = described_class.new(output_format: 'markdown')
|
|
94
|
+
|
|
95
|
+
expect(config[:output_format]).to eq 'markdown'
|
|
96
|
+
end
|
|
97
|
+
end
|
|
98
|
+
|
|
99
|
+
describe 'result_format' do
|
|
100
|
+
it 'accepts result_format as initialization parameter' do
|
|
101
|
+
config = described_class.new(result_format: 'unified')
|
|
102
|
+
|
|
103
|
+
expect(config.result_format).to eq 'unified'
|
|
104
|
+
end
|
|
105
|
+
|
|
106
|
+
it 'defaults to nil when not specified' do
|
|
107
|
+
config = described_class.new
|
|
108
|
+
|
|
109
|
+
expect(config.result_format).to be_nil
|
|
110
|
+
end
|
|
111
|
+
|
|
112
|
+
it 'accepts unified format' do
|
|
113
|
+
config = described_class.new(result_format: 'unified')
|
|
114
|
+
|
|
115
|
+
expect(config.result_format).to eq 'unified'
|
|
116
|
+
end
|
|
117
|
+
|
|
118
|
+
it 'accepts element_based format' do
|
|
119
|
+
config = described_class.new(result_format: 'element_based')
|
|
120
|
+
|
|
121
|
+
expect(config.result_format).to eq 'element_based'
|
|
122
|
+
end
|
|
123
|
+
|
|
124
|
+
it 'converts result_format to string' do
|
|
125
|
+
config = described_class.new(result_format: :unified)
|
|
126
|
+
|
|
127
|
+
expect(config.result_format).to eq 'unified'
|
|
128
|
+
expect(config.result_format).to be_a String
|
|
129
|
+
end
|
|
130
|
+
|
|
131
|
+
it 'includes result_format in to_h' do
|
|
132
|
+
config = described_class.new(result_format: 'element_based')
|
|
133
|
+
hash = config.to_h
|
|
134
|
+
|
|
135
|
+
expect(hash[:result_format]).to eq 'element_based'
|
|
136
|
+
end
|
|
137
|
+
|
|
138
|
+
it 'excludes nil result_format from to_h' do
|
|
139
|
+
config = described_class.new(result_format: nil)
|
|
140
|
+
hash = config.to_h
|
|
141
|
+
|
|
142
|
+
expect(hash.key?(:result_format)).to be false
|
|
143
|
+
end
|
|
144
|
+
|
|
145
|
+
it 'includes result_format in JSON' do
|
|
146
|
+
config = described_class.new(result_format: 'element_based')
|
|
147
|
+
json = config.to_json
|
|
148
|
+
parsed = JSON.parse(json)
|
|
149
|
+
|
|
150
|
+
expect(parsed['result_format']).to eq 'element_based'
|
|
151
|
+
end
|
|
152
|
+
|
|
153
|
+
it 'retrieves result_format with get_field' do
|
|
154
|
+
config = described_class.new(result_format: 'unified')
|
|
155
|
+
|
|
156
|
+
expect(config.get_field('result_format')).to eq 'unified'
|
|
157
|
+
end
|
|
158
|
+
|
|
159
|
+
it 'can be set with []=' do
|
|
160
|
+
config = described_class.new
|
|
161
|
+
config[:result_format] = 'unified'
|
|
162
|
+
|
|
163
|
+
expect(config.result_format).to eq 'unified'
|
|
164
|
+
end
|
|
165
|
+
|
|
166
|
+
it 'can be set with []= using symbol' do
|
|
167
|
+
config = described_class.new
|
|
168
|
+
config[:result_format] = :element_based
|
|
169
|
+
|
|
170
|
+
expect(config.result_format).to eq 'element_based'
|
|
171
|
+
end
|
|
172
|
+
|
|
173
|
+
it 'can be retrieved with []' do
|
|
174
|
+
config = described_class.new(result_format: 'element_based')
|
|
175
|
+
|
|
176
|
+
expect(config[:result_format]).to eq 'element_based'
|
|
177
|
+
end
|
|
178
|
+
end
|
|
179
|
+
|
|
180
|
+
describe 'combined output and result formats' do
|
|
181
|
+
it 'accepts both output_format and result_format' do
|
|
182
|
+
config = described_class.new(
|
|
183
|
+
output_format: 'markdown',
|
|
184
|
+
result_format: 'unified'
|
|
185
|
+
)
|
|
186
|
+
|
|
187
|
+
expect(config.output_format).to eq 'markdown'
|
|
188
|
+
expect(config.result_format).to eq 'unified'
|
|
189
|
+
end
|
|
190
|
+
|
|
191
|
+
it 'serializes both formats in to_h' do
|
|
192
|
+
config = described_class.new(
|
|
193
|
+
output_format: 'djot',
|
|
194
|
+
result_format: 'element_based'
|
|
195
|
+
)
|
|
196
|
+
hash = config.to_h
|
|
197
|
+
|
|
198
|
+
expect(hash[:output_format]).to eq 'djot'
|
|
199
|
+
expect(hash[:result_format]).to eq 'element_based'
|
|
200
|
+
end
|
|
201
|
+
|
|
202
|
+
it 'serializes both formats in JSON' do
|
|
203
|
+
config = described_class.new(
|
|
204
|
+
output_format: 'html',
|
|
205
|
+
result_format: 'unified'
|
|
206
|
+
)
|
|
207
|
+
json = config.to_json
|
|
208
|
+
parsed = JSON.parse(json)
|
|
209
|
+
|
|
210
|
+
expect(parsed['output_format']).to eq 'html'
|
|
211
|
+
expect(parsed['result_format']).to eq 'unified'
|
|
212
|
+
end
|
|
213
|
+
|
|
214
|
+
it 'merges both formats correctly' do
|
|
215
|
+
base = described_class.new(
|
|
216
|
+
output_format: 'markdown',
|
|
217
|
+
result_format: 'unified'
|
|
218
|
+
)
|
|
219
|
+
override = described_class.new(output_format: 'html')
|
|
220
|
+
merged = base.merge(override)
|
|
221
|
+
|
|
222
|
+
expect(merged.output_format).to eq 'html'
|
|
223
|
+
expect(merged.result_format).to eq 'unified'
|
|
224
|
+
end
|
|
225
|
+
|
|
226
|
+
it 'merges both formats with merge!' do
|
|
227
|
+
config = described_class.new(
|
|
228
|
+
output_format: 'markdown',
|
|
229
|
+
result_format: 'unified'
|
|
230
|
+
)
|
|
231
|
+
override = described_class.new(
|
|
232
|
+
output_format: 'djot',
|
|
233
|
+
result_format: 'element_based'
|
|
234
|
+
)
|
|
235
|
+
config.merge!(override)
|
|
236
|
+
|
|
237
|
+
expect(config.output_format).to eq 'djot'
|
|
238
|
+
expect(config.result_format).to eq 'element_based'
|
|
239
|
+
end
|
|
240
|
+
|
|
241
|
+
it 'handles merge with hash containing both formats' do
|
|
242
|
+
config = described_class.new(
|
|
243
|
+
output_format: 'plain',
|
|
244
|
+
result_format: 'unified'
|
|
245
|
+
)
|
|
246
|
+
merged = config.merge({ output_format: 'markdown' })
|
|
247
|
+
|
|
248
|
+
expect(merged.output_format).to eq 'markdown'
|
|
249
|
+
expect(merged.result_format).to eq 'unified'
|
|
250
|
+
end
|
|
251
|
+
end
|
|
252
|
+
|
|
253
|
+
describe 'format persistence across operations' do
|
|
254
|
+
it 'persists output_format through multiple conversions' do
|
|
255
|
+
config = described_class.new(output_format: 'markdown')
|
|
256
|
+
hash = config.to_h
|
|
257
|
+
new_config = described_class.new(**hash)
|
|
258
|
+
|
|
259
|
+
expect(new_config.output_format).to eq 'markdown'
|
|
260
|
+
end
|
|
261
|
+
|
|
262
|
+
it 'persists result_format through multiple conversions' do
|
|
263
|
+
config = described_class.new(result_format: 'element_based')
|
|
264
|
+
hash = config.to_h
|
|
265
|
+
new_config = described_class.new(**hash)
|
|
266
|
+
|
|
267
|
+
expect(new_config.result_format).to eq 'element_based'
|
|
268
|
+
end
|
|
269
|
+
|
|
270
|
+
it 'round-trips through JSON' do
|
|
271
|
+
config = described_class.new(
|
|
272
|
+
output_format: 'djot',
|
|
273
|
+
result_format: 'unified'
|
|
274
|
+
)
|
|
275
|
+
json = config.to_json
|
|
276
|
+
parsed = JSON.parse(json)
|
|
277
|
+
new_config = described_class.new(**parsed.transform_keys(&:to_sym))
|
|
278
|
+
|
|
279
|
+
expect(new_config.output_format).to eq 'djot'
|
|
280
|
+
expect(new_config.result_format).to eq 'unified'
|
|
281
|
+
end
|
|
282
|
+
end
|
|
283
|
+
|
|
284
|
+
describe 'format validation and edge cases' do
|
|
285
|
+
it 'raises error for empty string output_format' do
|
|
286
|
+
expect do
|
|
287
|
+
described_class.new(output_format: '')
|
|
288
|
+
end.to raise_error(ArgumentError, /Invalid output_format/)
|
|
289
|
+
end
|
|
290
|
+
|
|
291
|
+
it 'raises error for empty string result_format' do
|
|
292
|
+
expect do
|
|
293
|
+
described_class.new(result_format: '')
|
|
294
|
+
end.to raise_error(ArgumentError, /Invalid result_format/)
|
|
295
|
+
end
|
|
296
|
+
|
|
297
|
+
it 'raises error for whitespace in output_format' do
|
|
298
|
+
expect do
|
|
299
|
+
described_class.new(output_format: ' plain ')
|
|
300
|
+
end.to raise_error(ArgumentError, /Invalid output_format/)
|
|
301
|
+
end
|
|
302
|
+
|
|
303
|
+
it 'normalizes case in output_format' do
|
|
304
|
+
config = described_class.new(output_format: 'MarkDown')
|
|
305
|
+
|
|
306
|
+
expect(config.output_format).to eq 'markdown'
|
|
307
|
+
end
|
|
308
|
+
|
|
309
|
+
it 'raises error for custom string in result_format' do
|
|
310
|
+
expect do
|
|
311
|
+
described_class.new(result_format: 'custom_format')
|
|
312
|
+
end.to raise_error(ArgumentError, /Invalid result_format/)
|
|
313
|
+
end
|
|
314
|
+
end
|
|
315
|
+
|
|
316
|
+
describe 'integration with other config fields' do
|
|
317
|
+
it 'works with output_format and chunking together' do
|
|
318
|
+
config = described_class.new(
|
|
319
|
+
output_format: 'markdown',
|
|
320
|
+
chunking: { max_chars: 500 }
|
|
321
|
+
)
|
|
322
|
+
|
|
323
|
+
expect(config.output_format).to eq 'markdown'
|
|
324
|
+
expect(config.chunking.max_chars).to eq 500
|
|
325
|
+
end
|
|
326
|
+
|
|
327
|
+
it 'works with result_format and OCR together' do
|
|
328
|
+
config = described_class.new(
|
|
329
|
+
result_format: 'element_based',
|
|
330
|
+
ocr: { backend: 'tesseract' }
|
|
331
|
+
)
|
|
332
|
+
|
|
333
|
+
expect(config.result_format).to eq 'element_based'
|
|
334
|
+
expect(config.ocr.backend).to eq 'tesseract'
|
|
335
|
+
end
|
|
336
|
+
|
|
337
|
+
it 'works with both formats and language detection' do
|
|
338
|
+
config = described_class.new(
|
|
339
|
+
output_format: 'html',
|
|
340
|
+
result_format: 'unified',
|
|
341
|
+
language_detection: { enabled: true }
|
|
342
|
+
)
|
|
343
|
+
|
|
344
|
+
expect(config.output_format).to eq 'html'
|
|
345
|
+
expect(config.result_format).to eq 'unified'
|
|
346
|
+
expect(config.language_detection.enabled).to be true
|
|
347
|
+
end
|
|
348
|
+
|
|
349
|
+
it 'preserves formats in complex config merge' do
|
|
350
|
+
base = described_class.new(
|
|
351
|
+
output_format: 'markdown',
|
|
352
|
+
result_format: 'unified',
|
|
353
|
+
chunking: { max_chars: 500 },
|
|
354
|
+
ocr: { backend: 'tesseract' }
|
|
355
|
+
)
|
|
356
|
+
override = described_class.new(
|
|
357
|
+
output_format: 'djot',
|
|
358
|
+
chunking: { max_chars: 750 }
|
|
359
|
+
)
|
|
360
|
+
merged = base.merge(override)
|
|
361
|
+
|
|
362
|
+
expect(merged.output_format).to eq 'djot'
|
|
363
|
+
expect(merged.result_format).to eq 'unified'
|
|
364
|
+
expect(merged.chunking.max_chars).to eq 750
|
|
365
|
+
expect(merged.ocr.backend).to eq 'tesseract'
|
|
366
|
+
end
|
|
367
|
+
end
|
|
368
|
+
|
|
369
|
+
describe 'allowed keys integration' do
|
|
370
|
+
it 'includes output_format in ALLOWED_KEYS' do
|
|
371
|
+
expect(Kreuzberg::Config::Extraction::ALLOWED_KEYS).to include(:output_format)
|
|
372
|
+
end
|
|
373
|
+
|
|
374
|
+
it 'includes result_format in ALLOWED_KEYS' do
|
|
375
|
+
expect(Kreuzberg::Config::Extraction::ALLOWED_KEYS).to include(:result_format)
|
|
376
|
+
end
|
|
377
|
+
end
|
|
378
|
+
end
|
|
379
|
+
end
|
|
380
|
+
# rubocop:enable RSpec/RepeatedExample
|
data/vendor/Cargo.toml
CHANGED